云计算大数据技术之带了代理的

程序员文章站 2022-08-21 16:32:10
# -*- conding: utf-8 -*- import requests import re import csv import time import glob im...
# -*- coding: utf-8 -*-
import requests
import re
import csv
import time
import glob
import xlrd
import pymysql
from random import choice
# for i in phones:
# Module-level MySQL connection, opened as soon as the module is imported.
# NOTE(review): the credentials are hard-coded and `host` is an empty string;
# no live code in this file reads `db` (the only statements that use it are
# commented out), so confirm whether the connection is still needed.
db = pymysql.connect(
    host='',
    port=3306,
    user='root',
    passwd='123456',
    db='联系人消费分类',
    charset='utf8',
)

# Starting value of the row counter that the __main__ guard passes to run_1().
i_num = 0
# An 11-digit number starting with 1 (presumably a mainland-China mobile number).
_PHONE_RE = re.compile(r'1\d{10}')
# ``ip:port`` entries in the proxy list. The dots are escaped so that only
# real dotted quads match (the original pattern accepted any separator).
_PROXY_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,8}')
_PROXY_FILE = 'dxip.txt'
_SITE_URL = 'https://www.reg007.com'
# A probe response whose body is not longer than this many characters is
# rejected (presumably a block/error page is shorter -- TODO confirm).
_MIN_HOMEPAGE_LENGTH = 14900
# Pause after every lookup, in seconds.
_REQUEST_DELAY_SECONDS = 10
_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')

# (label, keywords) pairs. A label is emitted when any of its keywords occurs
# in the result page. The order is the order in which labels are joined into
# the category string and must stay stable for already-exported data.
_CATEGORY_RULES = (
    ('医疗用品', ('医疗用品', '医疗药品', '药店', '医院', '诊所')),
    ('潮流品牌消费', ('潮流品牌',)),
    ('投资理财', ('理财产品', '基金产品', '股票产品', '期货')),
    ('母婴用品消费', ('母婴用品', '婴儿', '妇幼')),
    ('旅游消费', ('旅行社', '旅游网')),
    ('汽车用品消费', ('汽车报价', '汽车用品', '二手车')),
    ('皮肤病患者', ('皮肤病', '皮肤科')),
    ('化妆品消费', ('美肤', '妆品', '护肤', '美妆')),
    ('酒水消费', ('酒水', '红酒')),
    ('保险消费', ('人寿', '保险')),
    ('保健品消费', ('健康', '保健', '保养', '养生')),
    ('建材消费', ('建材', '五金', '家饰')),
    ('生活品消费', ('生鲜', '蔬菜', '海鲜')),
    ('日用品消费', ('百货', '五金', '零食', '家饰', '家具')),
    ('电脑配件消费', ('电竞', '网游', '电设')),
    ('教育消费', ('英语培训', '英语学习', '教育学习')),
    ('运动品消费', ('运动用品', '户外用品', '篮球爱好', '足球爱好')),
    ('奢侈品消费', ('钻石',)),
    ('轻奢侈品消费', ('珠宝', '名贵手表')),
    ('贷款消费', ('贷', '借呗')),
)


def _load_proxies():
    """Return every ``ip:port`` entry found in the proxy list file."""
    # Re-read on every call, as the original did, so edits to the file made
    # while the script is running are picked up.
    with open(_PROXY_FILE, 'r') as handle:
        return _PROXY_RE.findall(handle.read())


def _proxy_config(address):
    """Build the ``proxies`` mapping that routes http and https via *address*."""
    proxy_url = 'http://' + str(address)
    return {'http': proxy_url, 'https': proxy_url}


def _pick_working_proxy():
    """Return the first randomly drawn proxy that passes the probe, else None."""
    candidates = _load_proxies()
    # Draw without replacement so every proxy is probed at most once.
    while candidates:
        address = choice(candidates)
        candidates.remove(address)
        print('待确认ip:' + str(address))
        try:
            probe = requests.get(_SITE_URL, proxies=_proxy_config(address),
                                 timeout=3)
            if len(probe.text) > _MIN_HOMEPAGE_LENGTH:
                return address
        except Exception:
            # Dead or slow proxies are the expected case here; deliberately
            # best-effort, just move on to the next candidate.
            continue
    return None


def _fetch_search_page(url, phone):
    """POST the search request through a validated proxy, retrying once.

    Returns the response object, or None when no proxy passed the probe or
    both attempts raised.
    """
    headers = {
        'Host': 'www.reg007.com',
        # NOTE(review): there is no separator between host and number here;
        # kept byte-for-byte from the original -- confirm the intended Referer.
        'Referer': _SITE_URL + str(phone),
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': _USER_AGENT,
        'Cookie': '',
    }
    for _attempt in range(2):
        address = _pick_working_proxy()
        if address is None:
            # Every listed proxy was just probed; an immediate second pass
            # would only repeat the same timeouts.
            print('未找到有效ip')
            return None
        print('有效ip:' + str(address))
        try:
            return requests.post(url=url, proxies=_proxy_config(address),
                                 headers=headers, timeout=10)
        except Exception as err:
            print(err)
    return None


def _classify(page_text):
    """Return the slash-separated category string for a result page."""
    compact = page_text.replace('\n', '').replace(' ', '')
    labels = [
        label if any(keyword in compact for keyword in keywords) else ''
        for label, keywords in _CATEGORY_RULES
    ]
    # Unmatched categories leave empty slots; collapse each run of slashes to
    # one. A leading or trailing '/' can remain, exactly as in the original
    # output format.
    return re.sub(r'/+', '/', '/'.join(labels))


def _normalise_marital_status(value):
    """Map '未婚' and 'null' values to '保密'; return anything else as text."""
    # str() guards against non-string cell values, on which ``in`` would
    # raise TypeError.
    text = str(value)
    if '未婚' in text or 'null' in text:
        return '保密'
    return text


def run_1(i_num, *, begin_num=2, end_num=15, test_phone='18377336165'):
    """Look up phone numbers from the contact workbook and print their categories.

    For every row of the first sheet of each matching workbook, the phone
    number is extracted from the third column, a search is sent to reg007.com
    through a proxy taken from ``dxip.txt``, and the result page is scanned
    for keywords to build a slash-separated category string. One summary
    line per looked-up row is printed; nothing is persisted (the CSV and
    MySQL writes of earlier revisions were disabled).

    Args:
        i_num: Starting value of the counter of rows that contain a phone
            number.
        begin_num: Rows whose counter is below this value are skipped.
        end_num: Processing of a workbook stops once the counter exceeds
            this value.
        test_phone: Number queried instead of the one read from the row.
            The default keeps the original hard-coded override; pass None
            to query each row's own number.

    Returns:
        None.

    Raises:
        OSError: if ``dxip.txt`` cannot be opened.
    """
    allxls = glob.glob('0322all_简历联系人.xlsx')
    print('\n总共发现%s个信息数据xlsx文件' % len(allxls))
    for path in allxls:
        # NOTE(review): xlrd 2.x no longer reads .xlsx files -- confirm the
        # installed version.
        # Only the first sheet is read, once; the original re-opened the
        # workbook and re-processed that sheet once per sheet in the file.
        table = xlrd.open_workbook(path).sheets()[0]
        for row in range(table.nrows):
            rdata = table.row_values(row)
            # Numeric cells come back as floats, hence the '.0' stripping.
            found = _PHONE_RE.findall(str(rdata[2]).replace('.0', ''))
            if not found:
                continue
            phone = found[0]
            print(phone)

            # Window of rows to process, counted over rows with a number.
            i_num += 1
            if i_num < begin_num:
                continue
            print('当前匹配位置：第%s行' % i_num)
            if i_num > end_num:
                print('停止匹配位置：第%s行' % end_num)
                break

            if test_phone is not None:
                phone = test_phone

            url = _SITE_URL + '/search?q=' + str(phone)
            response = _fetch_search_page(url, phone)
            time.sleep(_REQUEST_DELAY_SECONDS)
            if response is None:
                # Nothing to classify; the original raised NameError here or
                # silently reused the previous row's page.
                continue

            page = response.text
            print(len(page))
            type_all = _classify(page)

            name = rdata[0]
            # NOTE(review): the e-mail is read from the same column as the
            # phone number -- confirm the intended column.
            email = rdata[2]
            marital_status = _normalise_marital_status(rdata[4])
            site = rdata[5]
            main_job = rdata[3]
            print('链接url：%s  编号：%s  号码：%s  类别：%s  姓名：%s  邮箱：%s'
                  '  婚否：%s  所属地区：%s  从事行业：%s' % (
                      url, i_num, phone, type_all, name, email,
                      marital_status, site, main_job))
            # TODO(review): if persistence is re-enabled, use a parameterized
            # query (cursor.execute(sql, params)) rather than %-formatted SQL.
        print('\n数据条信息解析完成：')

# Script entry point: start the matching run from the module-level counter.
if __name__ == "__main__":
    run_1(i_num)


# import requests
# import re
# import csv
# import time
# phones = ['15296003823','18377336165']
# for i in phones:
#     url = "https://www.reg007.com/search?q="+str(i)
#
#     cookie = 'reg007_c_s_t=1; reg007_account=1148728004%40qq.com; reg007_password=4ddf00b56f92a6581b66f44c0eebdb6f; _ga=GA1.2.934306575.1520587872; _gid=GA1.2.238444148.1520932474; reg007_q='+str(i)+'; reg007_think_language=zh-CN; PHPSESSID=qtn27q10pr41685ujh0646v164; reg007_f_c_j=9522c373d41d008ecb30986bec4469d9'
#     headers = {
#                 'Host': 'www.reg007.com',
#                 'Referer': 'https://www.reg007.com'+str(i),
#                 'Upgrade-Insecure-Requests': '1',
#                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
#                 'Cookie': cookie
#             }
#     data = requests.post(url=url,headers=headers,timeout=10)
#     time.sleep(3)
#     data= data.text
#     print(data)
上一篇： MySQL 笔记整理（1） --基础架构，一条SQL查询语句如何执行
下一篇：孙膑被师兄断了双脚，后来他是怎么复仇的？
云计算大数据技术之带了代理的

赤峰市雪亮工程：基于开放的云计算大数据技术，让视频更智能

2012大数据：云计算之后的又一次泡沫？

2013年关于云计算的七大未解之迷

IBM新兴技术大学成立：六大学院聚焦大数据、移动、云计算等热门

2013年关于云计算的七大未解之迷

大企业用云计算与虚拟化技术的价值

云计算本身是数据中心用分布式技术做的

企业从云计算吸取关于大数据的三大教训

云计算技术之数据可视化教程

云计算大数据技术之带了代理的

云计算大数据技术之带了代理的

赤峰市雪亮工程：基于开放的云计算大数据技术，让视频更智能

2012大数据：云计算之后的又一次泡沫？

2013年关于云计算的七大未解之迷

IBM新兴技术大学成立：六大学院聚焦大数据、移动、云计算等热门

2013年关于云计算的七大未解之迷

大企业用云计算与虚拟化技术的价值

云计算本身是数据中心用分布式技术做的

企业从云计算吸取关于大数据的三大教训

云计算技术之数据可视化教程

云计算大数据技术之带了代理的

2013年关于云计算的七大未解之迷