Scraping Lagou.com job listings with Scrapy (the anti-crawling is fierce; the code below may already be outdated!) > Updated 2019/4/2
Lagou's anti-crawl defenses, and how we deal with each:
- headers
- cookies
- an IP proxy pool
The scraped data ends up in a .csv file.
Imports: import scrapy, requests, time, json, random, pandas as pd, codecs
URL used to obtain cookies/headers: url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="
URL that returns the job JSON: json_url = "https://www.lagou.com/jobs/positionAjax.json"
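Before wiring any of this into Scrapy, the whole cookie dance can be sanity-checked with plain requests. A minimal sketch (the form fields mirror the spider's data further down; given how often Lagou changes its defenses, no guarantee it still works):

    import requests

    list_url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="
    json_url = "https://www.lagou.com/jobs/positionAjax.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)",
        "Referer": list_url,
    }

    with requests.Session() as s:
        s.get(list_url, headers=headers)  # hit the list page first so the session holds fresh cookies
        resp = s.post(json_url, headers=headers,
                      params={"px": "new", "needAddtionalResult": "false"},
                      data={"pn": "1", "kd": "Python"})
        print(resp.status_code, resp.text[:200])  # job JSON if the cookies were accepted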
Everything below goes into the spider's code.

Faking cookies

The idea: get_cookie() first fetches the list page to obtain fresh cookies, then merges those values over a recorded cookie template. Core code:
def get_cookie(self):
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    response = requests.get(self.url, headers=headers)  # request the original list page
    r = requests.utils.dict_from_cookiejar(response.cookies)  # extract its fresh cookies as a dict
    r["user_trace_token"] = r["LGRID"]
    r["LGSID"] = r["LGRID"]
    r["LGUID"] = r["LGRID"]  # derive the related cookie fields from LGRID
    cookies = {  # recorded cookie template; stale values get overwritten below
        'X_MIDDLE_TOKEN': '797bc148d133274a162ba797a6875817',
        'JSESSIONID': 'ABAAABAAAIAACBI03F33A375F98E05C5108D4D742A34114',
        '_ga': 'GA1.2.1912257997.1548059451',
        '_gat': '1',
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059451',
        'user_trace_token': '20190121163050-dbd72da2-1d56-11e9-8927-525400f775ce',
        'LGSID': '20190121163050-dbd72f67-1d56-11e9-8927-525400f775ce',
        'PRE_UTM': '',
        'PRE_HOST': '',
        'PRE_SITE': '',
        'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F%3F_from_mid%3D1',
        'LGUID': '20190121163050-dbd73128-1d56-11e9-8927-525400f775ce',
        '_gid': 'GA1.2.1194828713.1548059451',
        'index_location_city': '%E5%85%A8%E5%9B%BD',
        'TG-TRACK-CODE': 'index_hotjob',
        'LGRID': '20190121163142-fb0cc9c0-1d56-11e9-8928-525400f775ce',
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': str(time.time()),
        'SEARCH_ID': '86ed37f5d8da417dafb53aa25cd6fbc0',
    }
    cookies.update(r)  # merge the fresh values over the template
    return cookies
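As written, both start_requests() and parse() (shown below) call get_cookie() before every single Ajax request, which means an extra hit on the list page each time. If that gets you blocked faster, a small hypothetical helper can reuse the cookies for a short while (a sketch; the name and the 60-second window are my own):

    def get_cookie_cached(self, max_age=60):
        # hypothetical wrapper: reuse forged cookies for up to max_age seconds
        now = time.time()
        if not hasattr(self, "_cookie_cache") or now - self._cookie_stamp > max_age:
            self._cookie_cache = self.get_cookie()
            self._cookie_stamp = now
        return self._cookie_cache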
Disguise as a browser: just put this in settings.py
# Crawl responsibly by identifying yourself (and your website) on the user-agent
from fake_useragent import UserAgent
USER_AGENT = UserAgent().random  # note: picked once when settings load; for per-request rotation see the middleware sketch below
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
IP proxy pool: add a new class in middlewares.py as follows
import random

class ProxyMiddleware(object):
    proxyList = [
        '121.193.143.249:80', '112.126.65.193:80', '122.96.59.104:82',
        '115.29.98.139:9999', '117.131.216.214:80', '116.226.243.166:8118',
        '101.81.22.21:8118', '122.96.59.107:843',
    ]

    def process_request(self, request, spider):
        p = random.choice(self.proxyList)
        request.meta['proxy'] = 'http://' + p  # route this request through a random proxy
# then register the middlewares in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapyTest.middlewares.ScrapytestDownloaderMiddleware': None,  # disable the project's default stub
    'scrapyTest.middlewares.RandomUserAgentMiddleware': 543,  # sketched below
    'scrapyTest.middlewares.ProxyMiddleware': 100,  # this one is the important part!
}
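The settings above reference a RandomUserAgentMiddleware that the post never actually shows. A minimal sketch of what it presumably looks like, assuming fake_useragent:

    # middlewares.py -- presumed implementation, not shown in the original post
    from fake_useragent import UserAgent

    class RandomUserAgentMiddleware(object):
        def __init__(self):
            self.ua = UserAgent()

        def process_request(self, request, spider):
            # pick a fresh User-Agent for every outgoing request
            request.headers['User-Agent'] = self.ua.random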
The spider
# parameters
allowed_domains = ['lagou.com']
page = 1
sum_pages = 20
url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="  # list page: source of cookies and the Referer
json_url = "https://www.lagou.com/jobs/positionAjax.json"
start_urls = [json_url]
header = {
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',  # header name spelled exactly as Lagou expects it
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': url,
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'X-Anit-Forge-Token': 'None',
}
# free proxies (note: proxy_ip is picked once and never used below; actual proxying is done by ProxyMiddleware)
proxy_list = [
    'http://140.224.76.21:808',
    'http://60.178.14.90:8081',
    'http://121.232.146.13:9000',
]
proxy_ip = random.choice(proxy_list)
data = {
    "pn": "1",       # page number
    "kd": "Python",  # search keyword
}
params = (  # query-string parameters; see the note after the spider code below
    ('px', 'new'),
    ('needAddtionalResult', 'false'),
)
def start_requests(self):
    for url in self.start_urls:
        # POST the search form with freshly forged cookies
        yield scrapy.FormRequest(url, formdata=self.data, headers=self.header,
                                 cookies=self.get_cookie(), callback=self.parse)

def parse(self, response):
    info = json.loads(response.text)
    jobs = info["content"]["positionResult"]["result"]
    self.parse_details(jobs)  # pull the useful fields out of each job dict
    time.sleep(random.randint(1, 3))  # throttle a little between pages
    try:
        if self.page < self.sum_pages:
            self.page += 1
            self.data["pn"] = str(self.page)
            # POST for the next page, again with fresh cookies
            yield scrapy.FormRequest(self.json_url, formdata=self.data, headers=self.header,
                                     cookies=self.get_cookie(), callback=self.parse)
    except Exception as e:
        print(e)
    finally:
        # runs after every page; to_csv overwrites, so the last write holds everything
        self.save_jobsList()
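One loose end: the params tuple above is defined but never attached to any request, so px=new and needAddtionalResult=false never reach the server. If the endpoint needs them (an assumption on my part), one way is to bake them into the URL inside start_requests:

    from urllib.parse import urlencode

    # hypothetical fix: put the query-string parameters on the URL
    url_with_params = self.json_url + "?" + urlencode(self.params)
    yield scrapy.FormRequest(url_with_params, formdata=self.data, headers=self.header,
                             cookies=self.get_cookie(), callback=self.parse)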
Parsing the data and saving it as CSV
# parameters
jobs_list = []  # collects every row; save_jobsList() writes the lot at the end
csv_path = "D:\\lagou_jobs.csv"
f = codecs.open("D:\\test.json", "a", "utf-8")  # raw dump, handy for debugging

def parse_details(self, jobs):
    for job in jobs:
        row = [job["companyFullName"],
               job["financeStage"], job["positionName"],
               job["positionLables"], job["salary"],
               job["city"], job["education"], job["workYear"],
               job["jobNature"], job["createTime"]]
        print("*****", row)
        self.f.write(str(row))
        self.jobs_list.append(row)

def save_jobsList(self):
    # column names stay in Chinese: word_cloud.py below selects the "职位名" column
    p = pd.DataFrame(data=self.jobs_list,
                     columns=["公司全名", "融资情况", "职位名", "标签", "薪水",
                              "城市", "学历要求", "经验要求", "工作类型", "发布时间"])
    p.to_csv(self.csv_path, index=True)
    print("Saved!")
Last bit of lagou.py: no need for the command line, just run lagou.py directly
if __name__ == "__main__":
    from scrapy import cmdline
    cmdline.execute("scrapy crawl lagou".split())
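If you would rather not go through cmdline, the same thing can be done in-process with Scrapy's CrawlerProcess:

    # alternative: run the spider without the scrapy CLI
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == "__main__":
        process = CrawlerProcess(get_project_settings())
        process.crawl("lagou")  # the spider's name attribute
        process.start()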
Word cloud of job titles

Only 73 records made it through before the crawl got blocked; with that little data the word cloud looks pretty thin~~~
Mask image: cloud.png, my little sun~~~
Where does the ttf come from? Copy simhei.ttf out of your system fonts (C:\Windows\Fonts on Windows) and drop it next to the script; the code loads it by bare file name.
# word_cloud.py
import re

import pandas as pd
import jieba
from imageio import imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt

f = pd.read_csv("D:\\lagou_jobs.csv", encoding="utf-8")["职位名"]
print(f.describe())

def get_word(f):
    # the regex would pull quoted items out of list-like columns (e.g. 标签);
    # for the plain 职位名 strings the line itself is already the text, so return it as-is
    text = ""
    p = re.findall("'(.*?)'", f)
    for i in p:
        text += i
    return f

def paint_wordcloud():
    label_str = ""
    for line in f:
        label_str += get_word(line)
    print(label_str)
    cut_text = " ".join(jieba.cut(label_str))  # join with spaces so WordCloud can split the tokens
    word_cloud = WordCloud(
        font_path='simhei.ttf',  # without a CJK font, Chinese renders as empty boxes
        background_color='white',
        mask=imread("D:\\Documents\\Pictures\\cloud.png"),
        max_words=1000,
        max_font_size=100,
    ).generate(cut_text)
    word_cloud.to_file('D:\\Documents\\Pictures\\word_cloud.jpg')
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

if __name__ == "__main__":
    paint_wordcloud()
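With only 73 rows, it can also help to hand WordCloud explicit word frequencies instead of a joined string, so single-character noise can be filtered out first. A sketch reusing label_str from paint_wordcloud():

    from collections import Counter

    def cloud_from_frequencies(label_str):
        # count each segmented word, dropping single-character tokens
        freqs = Counter(w for w in jieba.cut(label_str) if len(w) > 1)
        return WordCloud(font_path='simhei.ttf',
                         background_color='white').generate_from_frequencies(freqs)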