欢迎您访问程序员文章站！本站旨在为大家提供、分享程序员计算机编程知识！
您现在的位置是: 首页  >  科技

云计算大数据技术之带了代理的

程序员文章站 2022-08-21 16:32:10
# -*- coding: utf-8 -*- import requests import re import csv import time import glob im...
# -*- coding: utf-8 -*-
import requests
import re
import csv
import time
import glob
import xlrd
import pymysql
from random import choice
# Module-level state shared with run_1().
#
# `db` is a MySQL connection opened as a side effect of importing/running
# this module; no active code visible in this file uses it.
# NOTE(review): the host is an empty string and the credentials are
# hard-coded in source -- confirm the intended host and move the secrets
# out of the file.
db = pymysql.connect(
    host='',
    port=3306,
    user='root',
    passwd='123456',
    db='联系人消费分类',
    charset='utf8',
)

# Initial value of the processed-row counter passed to run_1().
i_num = 0
# An 11-digit number starting with 1, used to pull a phone number out of a cell.
_PHONE_RE = re.compile(r'1\d{10}')

# "a.b.c.d:port" proxy entries.  The dots are escaped so that only literal
# dots separate the octets (an unescaped "." matches any character).
_PROXY_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,8}')

# A proxy is accepted only if the site's home page fetched through it is
# longer than this many characters.
# NOTE(review): presumably shorter bodies are block/error pages -- confirm.
_MIN_VALID_PAGE_LEN = 14900

# (label, keywords) pairs.  A label is emitted when any of its keywords
# occurs in the page text.  The order is the order in which labels appear in
# the combined category string and must not be changed.
_CATEGORY_RULES = (
    ('医疗用品', ('医疗用品', '医疗药品', '药店', '医院', '诊所')),
    ('潮流品牌消费', ('潮流品牌',)),
    ('投资理财', ('理财产品', '基金产品', '股票产品', '期货')),
    ('母婴用品消费', ('母婴用品', '婴儿', '妇幼')),
    ('旅游消费', ('旅行社', '旅游网')),
    ('汽车用品消费', ('汽车报价', '汽车用品', '二手车')),
    ('皮肤病患者', ('皮肤病', '皮肤科')),
    ('化妆品消费', ('美肤', '妆品', '护肤', '美妆')),
    ('酒水消费', ('酒水', '红酒')),
    ('保险消费', ('人寿', '保险')),
    ('保健品消费', ('健康', '保健', '保养', '养生')),
    ('建材消费', ('建材', '五金', '家饰')),
    ('生活品消费', ('生鲜', '蔬菜', '海鲜')),
    ('日用品消费', ('百货', '五金', '零食', '家饰', '家具')),
    ('电脑配件消费', ('电竞', '网游', '电设')),
    ('教育消费', ('英语培训', '英语学习', '教育学习')),
    ('运动品消费', ('运动用品', '户外用品', '篮球爱好', '足球爱好')),
    ('奢侈品消费', ('钻石',)),
    ('轻奢侈品消费', ('珠宝', '名贵手表')),
    ('贷款消费', ('贷', '借呗')),
)


def _proxies_for(ip):
    """Build the ``proxies`` mapping for requests from an ``ip:port`` string."""
    return {"http": "http://" + str(ip), "https": "http://" + str(ip), }


def _pick_working_proxy(proxy_file='dxip.txt'):
    """Return a proxy from *proxy_file* that passes a probe request, or None.

    The file is re-read on every call.  One randomly chosen candidate is
    probed per entry listed in the file, so the same candidate may be probed
    more than once and some may never be probed.
    """
    with open(proxy_file, 'r') as handle:
        candidates = _PROXY_RE.findall(handle.read())
    for listed in candidates:
        print(listed)
        candidate = choice(candidates)
        print('待确认ip:' + str(candidate))
        try:
            probe = requests.get('https://www.reg007.com',
                                 proxies=_proxies_for(candidate), timeout=3)
        except Exception:
            # Best-effort probing: a dead or malformed proxy is expected here,
            # so just move on to the next attempt.
            continue
        if len(probe.text) > _MIN_VALID_PAGE_LEN:
            return candidate
    return None


def _fetch_search_page(url, phone):
    """POST the search request through a freshly validated proxy.

    Returns the response object, or None when no proxy could be validated or
    the request raised.  Callers must compare against None explicitly: a
    requests Response is falsy for 4xx/5xx status codes.
    """
    proxy = _pick_working_proxy()
    if proxy is None:
        print('未找到可用代理ip')
        return None
    print('有效ip:' + str(proxy))
    headers = {
        'Host': 'www.reg007.com',
        # NOTE(review): the phone is appended directly to the host, which
        # does not form a valid URL; kept byte-for-byte -- confirm intent.
        'Referer': 'https://www.reg007.com' + str(phone),
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Cookie': '',
    }
    try:
        return requests.post(url=url, proxies=_proxies_for(proxy),
                             headers=headers, timeout=10)
    except Exception as err:
        print(err)
        return None


def _classify_page(page_text):
    """Return the '/'-separated category labels whose keywords occur in the page.

    Newlines and spaces are removed before matching.  Labels that do not
    match contribute an empty field, and every run of '/' is collapsed to a
    single '/', so the result can start and/or end with '/'.
    """
    compact = page_text.replace('\n', '').replace(' ', '')
    labels = [
        label if any(word in compact for word in keywords) else ''
        for label, keywords in _CATEGORY_RULES
    ]
    return re.sub(r'/+', '/', '/'.join(labels))


def run_1(i_num, *, begin_num=2, end_num=15):
    """Look up phone numbers from the contact workbook and print their categories.

    For every workbook matching '0322all_简历联系人.xlsx' in the current
    directory, rows of the first sheet are scanned; rows whose third column
    holds no 11-digit number starting with 1 are ignored.  Each remaining row
    increments the counter, and rows whose counter value lies in
    [begin_num, end_num] are looked up on www.reg007.com through a proxy
    taken from 'dxip.txt'.  The result is only printed; nothing is written to
    CSV or to the database.

    Args:
        i_num: starting value of the row counter.
        begin_num: first counter value (inclusive) that is looked up.
        end_num: last counter value (inclusive) that is looked up; scanning
            of the current sheet stops once the counter exceeds it.

    Returns:
        None.

    Raises:
        FileNotFoundError: if 'dxip.txt' is missing when a lookup is attempted.

    NOTE(review): xlrd 2.0 and later only read .xls files, so opening the
    .xlsx workbook requires xlrd < 2.0 -- confirm the pinned version.
    """
    allxls = glob.glob('0322all_简历联系人.xlsx')
    print('\n总共发现%s个信息数据xlsx文件' % len(allxls))
    for fl in allxls:
        workbook = xlrd.open_workbook(fl)
        # NOTE(review): this loops once per sheet but always reads sheet 0,
        # so a multi-sheet workbook has its first sheet scanned repeatedly.
        # Kept as in the original -- confirm whether each sheet was intended.
        for _ in workbook.sheets():
            table = workbook.sheets()[0]
            for row in range(table.nrows):
                rdata = table.row_values(row)
                # Numeric cells come back as floats ("138...0.0"), hence the
                # '.0' removal before matching.
                found = _PHONE_RE.findall(str(rdata[2]).replace('.0', ''))
                if not found:
                    continue
                phone = found[0]
                print(phone)

                # Only rows with a usable phone number advance the counter.
                i_num = i_num + 1
                if i_num < begin_num:
                    continue
                print('当前匹配位置:第%s行' % i_num)
                if i_num > end_num:
                    print('停止匹配位置:第%s行' % end_num)
                    break

                # NOTE(review): the number parsed from the row is replaced by
                # this fixed value, so every lookup queries the same number.
                # It looks like a debugging leftover; behaviour is kept
                # unchanged -- confirm before removing.
                phone = '18377336165'
                url = "https://www.reg007.com/search?q=" + str(phone)

                # One retry with a newly selected proxy, as in the original.
                response = _fetch_search_page(url, phone)
                if response is None:
                    response = _fetch_search_page(url, phone)
                if response is None:
                    # Previously this path hit an unbound (or stale) response
                    # variable; skip the row explicitly instead.
                    print('两次抓取均失败, 跳过号码:%s' % phone)
                    continue

                time.sleep(10)
                page = response.text
                print(len(page))
                type_all = _classify_page(page)

                name = rdata[0]
                # NOTE(review): this is the same column the phone number is
                # read from -- confirm which column holds the e-mail address.
                email = rdata[2]
                marital_status = rdata[4]
                # Values containing '未婚' or 'null' are reported as '保密'.
                # str() keeps numeric cells from raising TypeError on `in`.
                status_text = str(marital_status)
                if '未婚' in status_text or 'null' in status_text:
                    marital_status = '保密'
                site = rdata[5]
                main_job = rdata[3]
                print('链接url:%s  编号:%s  号码:%s  类别:%s  姓名:%s  邮箱:%s  婚否:%s  所属地区:%s  从事行业:%s' % (
                    url, i_num, phone, type_all, name, email,
                    marital_status, site, main_job))
        print('\n数据条信息解析完成:')

# Script entry point: start the scan with the module-level counter value.
if __name__ == '__main__':
    run_1(i_num)


# import requests
# import re
# import csv
# import time
# phones = ['15296003823','18377336165']
# for i in phones:
#     url = "https://www.reg007.com/search?q="+str(i)
#
#     cookie = 'reg007_c_s_t=1; reg007_account=1148728004%40qq.com; reg007_password=4ddf00b56f92a6581b66f44c0eebdb6f; _ga=GA1.2.934306575.1520587872; _gid=GA1.2.238444148.1520932474; reg007_q='+str(i)+'; reg007_think_language=zh-CN; PHPSESSID=qtn27q10pr41685ujh0646v164; reg007_f_c_j=9522c373d41d008ecb30986bec4469d9'
#     headers = {
#                 'Host': 'www.reg007.com',
#                 'Referer': 'https://www.reg007.com'+str(i),
#                 'Upgrade-Insecure-Requests': '1',
#                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
#                 'Cookie': cookie
#             }
#     data = requests.post(url=url,headers=headers,timeout=10)
#     time.sleep(3)
#     data= data.text
#     print(data)