开线程爬取黑猫里的阿里投诉信息
程序员文章站
2022-03-10 11:52:43
`仅供学习,请适度开线程` 一.代码 ......
仅供学习,请适度开线程
一.代码
"""Scrape Alibaba complaint records from Sina's Heimao (Black Cat) complaint
platform, fetching list pages and per-complaint detail pages with a thread pool.

NOTE(review): for learning purposes only — keep the thread count modest.
"""
import requests
from requests_html import HTMLSession
import time
from concurrent.futures import ThreadPoolExecutor
import json

# 30 worker threads; each worker handles one list page end-to-end.
pool = ThreadPoolExecutor(30)
big_list = []        # accumulated complaint records, dumped to JSON at the end
pool_name_list = []  # futures returned by pool.submit, awaited in run()
session = HTMLSession()


def dewu_company(x):
    """Fetch list page *x* (0-based) of received complaints and enrich each
    complaint with fields scraped from its detail page.

    Results are appended to the module-level ``big_list``.  Any exception
    (HTTP failure, missing XPath node, malformed JSON) skips the entire
    page — deliberate best-effort scraping.
    """
    try:
        print(f'第{x+1}页')
        params = {
            'couid': '1878960481',
            'type': '1',
            'page_size': f'{(x + 1) * 10}',
            'page': f'{x + 1}',
            # 'callback': 'jquery11',
        }
        url = 'https://tousu.sina.com.cn/api/company/received_complaints'
        # verify=False: site certificate issues at time of writing — TODO confirm still needed.
        res = requests.get(url, params=params, verify=False)
        info_list = res.json()['result']['data']['complaints']
        for dict_info in info_list:
            # The API returns protocol-relative URLs; make them absolute.
            dict_info['main']['url'] = 'https:' + dict_info['main']['url']
            dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
            info_url = dict_info['main']['url']
            print(info_url)
            res = session.get(info_url, verify=False)
            new_dict = dict()
            # Summary fields scraped from the detail page by absolute XPath;
            # an IndexError here bubbles to the outer handler and skips the page.
            new_dict['投诉编号'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]
            new_dict['投诉对象'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]
            new_dict['投诉问题'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]
            new_dict['投诉要求'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]
            new_dict['涉诉金额'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]
            new_dict['投诉进度'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]
            # new_dict['a'] = res_dome.xpath('//*[@class="u-name"]/text()')
            # new_dict['b'] = res_dome.xpath('//*[@class="u-status"]/text()')
            new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
            # Image links are protocol-relative too; prefix each with https:.
            not_have_http_img_list = res.html.xpath('//*[@class="example-image-link"]/@href')
            have_http_img_list = []
            for a in not_have_http_img_list:
                have_http_img_list.append('https:' + a)
            new_dict['投诉图片'] = have_http_img_list
            # Attached videos: resolve each data-id through Sina's video API.
            vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
            print(vide_id_list)
            new_vide_list = []
            if vide_id_list:
                for vide_id in vide_id_list:
                    t = int(time.time())
                    vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=v11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3a%2f%2ftousu.sina.com.cn%2fcomplaint%2fview%2f17349160365%2f&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isauto=1'
                    res = session.get(vide_info_url, verify=False)
                    try:
                        new_vide_list.append(res.json())
                    except Exception:
                        # Non-JSON response for this video: skip it, keep the rest.
                        pass
            new_dict['投诉视频详情'] = new_vide_list
            dict_info['投诉详情'] = new_dict
            big_list.append(dict_info)
    except Exception:
        # Best-effort: any failure drops this one page, the rest continue.
        print('错误跳过这一页')


def run(page):
    """Scrape *page* list pages concurrently, then save results to JSON.

    Submits one task per page to the thread pool, waits for all of them,
    and dumps the accumulated ``big_list`` to 阿里投诉信息.json.
    """
    for x in range(page):
        name = pool.submit(dewu_company, x)
        pool_name_list.append(name)
    # Block until every submitted page has finished (or raised).
    for name_1 in pool_name_list:
        name_1.result()
    print('全部结束开始保存本地')
    with open(f'阿里投诉信息.json', "w", encoding='utf8') as fw:
        json.dump(big_list, fw)
    print('保存完毕')


if __name__ == '__main__':
    run(1)