Python 爬虫 多进程清洗代理
程序员文章站
2022-04-09 18:15:59
利用多进程检测代理网站提供的免费代理是否可用
import requests
from lxml import etree
import time
import multiprocessing
import queue


def get_all_proxy(q):
    """Scrape the xicidaili free-proxy listing and enqueue each proxy.

    Puts strings of the form 'http://ip:port' onto *q* so that a
    consumer process can validate them concurrently.
    """
    url = 'http://www.xicidaili.com/nn/1'
    headers = {
        'user-agent': 'mozilla/5.0 (windows nt 6.1; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/68.0.3440.106 safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # etree.HTML (upper case) is the lxml HTML-parser entry point.
    html_ele = etree.HTML(response.text)

    ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
    port_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
    # Pair IPs with ports positionally; zip() stops at the shorter list,
    # which also guards against a ragged scrape result.
    for ip, port in zip(ip_eles, port_ele):
        q.put('http://' + ip + ':' + port)


def check_one_proxy(proxy):
    """Return *proxy* if it can fetch a Baidu search page through it, else None."""
    url = 'http://www.baidu.com/s?wd=ip'
    proxy_dict = {
        'http': proxy
    }
    try:
        response = requests.get(url, proxies=proxy_dict, timeout=5)
    except requests.RequestException:
        # Connection refused / timeout / DNS failure: proxy is unusable.
        return None
    if response.status_code == 200:
        print(proxy)
        return proxy
    print('bad ' + proxy)
    # BUG FIX: the original also returned the proxy on a non-200 status,
    # so dead proxies ended up in the "valid" list.
    return None


if __name__ == '__main__':
    start_time = time.time()
    # A plain multiprocessing.Queue is fine for the producer Process.
    # NOTE: it must not be passed into Pool.apply_async (not picklable
    # for pool workers), so only plain proxy strings are submitted below.
    q = multiprocessing.Queue()
    # Producer: scrape the proxy list in a separate process.
    producer = multiprocessing.Process(target=get_all_proxy, args=(q,))
    producer.start()

    # Consumer side: a pool of 30 workers checks proxies concurrently.
    pool = multiprocessing.Pool(30)
    result_list = []
    while True:
        try:
            proxy_str = q.get(timeout=5)
        except queue.Empty:
            # No item for 5 s: the producer has finished (or stalled).
            break
        result_list.append(pool.apply_async(check_one_proxy, (proxy_str,)))

    # Collect only the proxies that actually worked.
    valid_proxy_list = []
    for res in result_list:
        result = res.get()
        if result is not None:
            valid_proxy_list.append(result)
    print('all proxy we can get:')
    print(valid_proxy_list)
    pool.close()
    pool.join()
    producer.join()

    end_time = time.time()
    print('--' * 30)
    print('耗时:' + str(end_time - start_time))