云计算大数据技术之带了代理的
程序员文章站
2022-08-21 16:32:10
# -*- coding: utf-8 -*-
import requests
import re
import csv
import time
import glob
im...
# -*- coding: utf-8 -*-
import requests
import re
import csv
import time
import glob
import xlrd
import pymysql
from random import choice, sample

# Reads contact rows from an xlsx workbook, looks each phone number up on
# reg007.com through an HTTP proxy taken from dxip.txt, derives consumer
# categories from keywords found in the returned page, and prints one summary
# line per contact.

# The connection is opened at import time, as in the original script.  The only
# code that used it (an INSERT into self_person_info) was commented out there,
# so nothing below reads `db`.
# NOTE(review): host is empty and the credentials are hard-coded -- confirm
# before running this anywhere that matters.  If the INSERT is ever restored,
# pass the values as query parameters instead of %-formatting them into the SQL.
db = pymysql.connect(host='', port=3306, user='root', passwd='123456', db='联系人消费分类', charset='utf8')
i_num = 0

SOURCE_GLOB = '0322all_简历联系人.xlsx'
PROXY_FILE = 'dxip.txt'
SITE_URL = 'https://www.reg007.com'
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')

BEGIN_NUM = 2            # first matched-row counter value that is processed
END_NUM = 15             # processing of a sheet stops once the counter exceeds this
PROBE_TIMEOUT = 3        # seconds allowed for the proxy probe request
SEARCH_TIMEOUT = 10      # seconds allowed for the search request
PROBE_MIN_LENGTH = 14900  # a probe response longer than this counts as a working proxy
FETCH_ATTEMPTS = 2       # the original made one request plus one retry
REQUEST_PAUSE_SECONDS = 10

# NOTE(review): the original replaced every parsed number with this value right
# before the lookup, which looks like a debugging leftover.  Set it to None to
# query each row's own number -- confirm the intent first.
PHONE_OVERRIDE = '18377336165'

_PHONE_RE = re.compile(r'1\d{10}')
_PROXY_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,8}')
_SLASH_RUN_RE = re.compile(r'/+')

# (label, keywords) pairs in the order the labels appear in the printed
# category string.  A label applies when any of its keywords occurs in the page.
_CATEGORY_RULES = (
    ('医疗用品', ('医疗用品', '医疗药品', '药店', '医院', '诊所')),
    ('潮流品牌消费', ('潮流品牌',)),
    ('投资理财', ('理财产品', '基金产品', '股票产品', '期货')),
    ('母婴用品消费', ('母婴用品', '婴儿', '妇幼')),
    ('旅游消费', ('旅行社', '旅游网')),
    ('汽车用品消费', ('汽车报价', '汽车用品', '二手车')),
    ('皮肤病患者', ('皮肤病', '皮肤科')),
    ('化妆品消费', ('美肤', '妆品', '护肤', '美妆')),
    ('酒水消费', ('酒水', '红酒')),
    ('保险消费', ('人寿', '保险')),
    ('保健品消费', ('健康', '保健', '保养', '养生')),
    ('建材消费', ('建材', '五金', '家饰')),
    ('生活品消费', ('生鲜', '蔬菜', '海鲜')),
    ('日用品消费', ('百货', '五金', '零食', '家饰', '家具')),
    ('电脑配件消费', ('电竞', '网游', '电设')),
    ('教育消费', ('英语培训', '英语学习', '教育学习')),
    ('运动品消费', ('运动用品', '户外用品', '篮球爱好', '足球爱好')),
    ('奢侈品消费', ('钻石',)),
    ('轻奢侈品消费', ('珠宝', '名贵手表')),
    ('贷款消费', ('贷', '借呗')),
)


def _extract_phone(cell):
    """Return the first 11-digit number starting with 1 found in *cell*, or None."""
    # Numeric cells arrive as floats; dropping '.0' turns 138...0.0 back into digits.
    text = str(cell).replace('.0', '')
    found = _PHONE_RE.findall(text)
    return found[0] if found else None


def _load_proxy_candidates():
    """Return every ip:port string found in the proxy list file."""
    # Re-read on every call, as the original did; the context manager closes
    # the handle, which the original never did.
    with open(PROXY_FILE, 'r') as handle:
        return _PROXY_RE.findall(handle.read())


def _proxy_map(address):
    """Build the requests ``proxies`` mapping that routes http and https via *address*."""
    endpoint = 'http://' + str(address)
    return {'http': endpoint, 'https': endpoint}


def _pick_working_proxy(candidates):
    """Probe *candidates* in random order and return the first that works, or None."""
    # sample() visits each candidate exactly once; the original drew with
    # replacement, so it could retest a dead proxy and never reach a live one.
    for address in sample(candidates, len(candidates)):
        print('待确认ip:' + str(address))
        try:
            probe = requests.get(SITE_URL, proxies=_proxy_map(address), timeout=PROBE_TIMEOUT)
        except (requests.RequestException, ValueError):
            # ValueError covers URL parse failures for malformed ip:port strings.
            print('代理不可用:' + str(address))
            continue
        if len(probe.text) > PROBE_MIN_LENGTH:
            return address
    return None


def _fetch_search_page(phone, last_proxy=None):
    """Request the search page for *phone* through a proxy.

    Returns ``(url, html, proxy)``.  ``html`` is None when every attempt
    failed.  ``proxy`` is the most recent proxy that passed the probe, or
    *last_proxy* if none did, so the caller can offer it as a fallback next time.
    """
    url = SITE_URL + '/search?q=' + str(phone)
    headers = {
        'Host': 'www.reg007.com',
        # NOTE(review): the number is appended straight after the host with no
        # separator; kept exactly as the original sent it.
        'Referer': SITE_URL + str(phone),
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': USER_AGENT,
        'Cookie': '',
    }
    for _ in range(FETCH_ATTEMPTS):
        proxy = _pick_working_proxy(_load_proxy_candidates()) or last_proxy
        if proxy is None:
            # The original raised NameError here (unbound `thisip`).
            print('未找到可用代理ip')
            continue
        last_proxy = proxy
        print('有效ip:' + str(proxy))
        try:
            response = requests.post(url=url, proxies=_proxy_map(proxy),
                                     headers=headers, timeout=SEARCH_TIMEOUT)
        except (requests.RequestException, ValueError) as err:
            print(err)
            continue
        return url, response.text, last_proxy
    return url, None, last_proxy


def _classify(html):
    """Return the '/'-separated category labels whose keywords occur in *html*."""
    compact = html.replace('\n', '').replace(' ', '')
    fields = [
        label if any(keyword in compact for keyword in keywords) else ''
        for label, keywords in _CATEGORY_RULES
    ]
    # Empty fields leave runs of '/'; collapse each run to one.  A leading or
    # trailing '/' can remain, exactly as in the original output.
    return _SLASH_RUN_RE.sub('/', '/'.join(fields))


def _describe_marital_status(value):
    """Return '保密' when *value* mentions '未婚' or 'null', else its text."""
    status = str(value)  # str() so a numeric cell cannot break the `in` tests
    if '未婚' in status or 'null' in status:
        return '保密'
    return status


def run_1(i_num):
    """Look up every contact in the matching workbooks and print a summary line.

    Args:
        i_num: starting value of the matched-row counter.  Rows whose third
            column holds no phone number are not counted.  Counter values
            below BEGIN_NUM are skipped, and a sheet is abandoned once the
            counter exceeds END_NUM.

    Results are only printed; the CSV and MySQL writers of the original
    script were commented out and are not reproduced here.
    """
    workbooks = glob.glob(SOURCE_GLOB)
    print('\n总共发现%s个信息数据xlsx文件' % len(workbooks))
    last_proxy = None
    for path in workbooks:
        # NOTE(review): xlrd 2.0+ no longer reads .xlsx files; this call needs
        # an xlrd 1.x installation -- confirm the environment.
        table = xlrd.open_workbook(path).sheets()[0]  # only the first sheet is used
        for row_index in range(table.nrows):
            rdata = table.row_values(row_index)
            phone = _extract_phone(rdata[2])
            if phone is None:
                continue
            print(phone)

            i_num += 1
            if i_num < BEGIN_NUM:
                continue
            print('当前匹配位置:第%s行' % i_num)
            if i_num > END_NUM:
                print('停止匹配位置:第%s行' % END_NUM)
                break

            if PHONE_OVERRIDE:
                phone = PHONE_OVERRIDE

            url, html, last_proxy = _fetch_search_page(phone, last_proxy)
            time.sleep(REQUEST_PAUSE_SECONDS)  # pause between lookups
            if html is None:
                # The original crashed here on `data.text`; skip the row instead.
                print('号码%s抓取失败,已跳过' % phone)
                continue
            print(len(html))

            type_all = _classify(html)
            name = rdata[0]
            # NOTE(review): the e-mail is read from the same column as the
            # phone number -- verify the column index against the workbook.
            email = rdata[2]
            marital_status = _describe_marital_status(rdata[4])
            site = rdata[5]
            main_job = rdata[3]
            print('链接url:' + str(url) + ' 编号:' + str(i_num) + ' 号码:' + str(phone)
                  + ' 类别:' + str(type_all) + ' 姓名:' + str(name) + ' 邮箱:' + str(email)
                  + ' 婚否:' + str(marital_status) + ' 所属地区:' + str(site)
                  + ' 从事行业:' + str(main_job))
    print('\n数据条信息解析完成:')


if __name__ == '__main__':
    run_1(i_num)