Python笔记-多线程爬虫实例
程序员文章站
2023-12-27 08:24:21
...
如下,进程池两个进程(代码使用 multiprocessing.Pool(2),实际为多进程):
进程池关键代码:
源码如下:
import re, multiprocessing
import requests, time
class HandleLaGou(object):
    """Crawl Python job listings from lagou.com through an HTTP proxy.

    All requests issued by an instance go through one ``requests`` session,
    so they share a single cookie jar.
    """

    # NOTE(review): proxy credentials are hard-coded in source; they should be
    # loaded from configuration or the environment instead of being committed.
    PROXY_HOST = "http-dyn.abuyun.com"
    PROXY_PORT = 9020
    PROXY_USER = "V21C9SWA4CQ3FSHD"
    PROXY_PASS = "1DF3191F6103Q34"

    # Per-request timeout and the pause before a failed request is retried,
    # both in seconds.
    REQUEST_TIMEOUT = 6
    RETRY_DELAY = 10

    # Listing page for one city; also used as the Referer of the paged POSTs
    # and re-visited to obtain fresh cookies.
    LIST_URL = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="

    def __init__(self):
        self.laGou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        # Filled in by handle_city().
        self.city_list = []

    def handle_city(self):
        """Fetch the nationwide city list and store it in ``self.city_list``."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)
        # Discard the cookies picked up by the city-list request.
        self.laGou_session.cookies.clear()

    def handle_city_job(self, city):
        """Page through the Python job listings of *city* and print each page.

        The listing page is fetched first to read the total page count; when
        no page count is found in it, nothing else is requested.
        """
        list_url = self.LIST_URL % city
        first_response = self.handle_request(method="GET", url=list_url)
        total_page_match = re.search(r'class="span\stotalNum">(\d+)</span>', first_response)
        if total_page_match is None:
            return
        total_page = int(total_page_match.group(1))

        page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
        # The city name may be non-ASCII, so the header value is sent as bytes.
        self.header['Referer'] = list_url.encode()
        for page in range(1, total_page + 1):
            data = {
                "pn": page,
                "kd": "python",
            }
            response = self.handle_request(method="POST", url=page_url, data=data, info=city)
            print(response)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the proxy, retrying until it succeeds.

        Args:
            method: ``"GET"`` or ``"POST"``.
            url: Target URL.
            data: Form data for a POST request.
            info: City name used to re-visit the listing page for fresh
                cookies before a retry; when ``None`` the cookies are only
                cleared.

        Returns:
            The response body as text. GET bodies are returned as received;
            POST bodies are decoded as UTF-8 and checked for the '频繁'
            marker first.

        Raises:
            ValueError: If *method* is neither ``"GET"`` nor ``"POST"``.
        """
        if method not in ("GET", "POST"):
            raise ValueError("unsupported HTTP method: %r" % (method,))

        proxies = self._build_proxies()
        while True:
            try:
                if method == "GET":
                    response = self.laGou_session.get(
                        url=url, headers=self.header, proxies=proxies,
                        timeout=self.REQUEST_TIMEOUT)
                    return response.text
                response = self.laGou_session.post(
                    url=url, headers=self.header, data=data, proxies=proxies,
                    timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                # Network/proxy failure: reset the cookies, wait, try again.
                self._refresh_cookies(info, proxies)
                time.sleep(self.RETRY_DELAY)
                continue

            response.encoding = 'utf-8'
            if '频繁' in response.text:
                # Presumably the site's "too frequent" notice: clear the
                # cookies, obtain new ones, then retry after a pause.
                self._refresh_cookies(info, proxies)
                time.sleep(self.RETRY_DELAY)
                continue
            return response.text

    def _build_proxies(self):
        """Return the ``proxies`` mapping for the authenticated HTTP proxy."""
        proxy_url = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": self.PROXY_HOST,
            "port": self.PROXY_PORT,
            "user": self.PROXY_USER,
            "pass": self.PROXY_PASS,
        }
        return {
            "http": proxy_url,
            "https": proxy_url,
        }

    def _refresh_cookies(self, info, proxies):
        """Clear the session cookies and re-visit the listing page of *info*."""
        self.laGou_session.cookies.clear()
        if info is None:
            # No city is known for this request, so there is no listing page
            # to re-visit.
            return
        try:
            self.laGou_session.get(
                url=self.LIST_URL % info, headers=self.header, proxies=proxies,
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Best effort only: the caller sleeps and retries the real
            # request, which triggers another refresh if it fails again.
            pass
if __name__ == '__main__':
    # Fetch the city list once, then crawl the cities with a pool of two
    # worker processes.
    laGou = HandleLaGou()
    laGou.handle_city()

    pool = multiprocessing.Pool(2)
    pending = [
        (city, pool.apply_async(laGou.handle_city_job, args=(city,)))
        for city in laGou.city_list
    ]
    pool.close()

    # apply_async() keeps a worker's exception inside its AsyncResult; call
    # get() on each one so a failed city is reported instead of vanishing.
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("crawl failed for %s: %r" % (city, exc))
    pool.join()