
Scraping Lagou Job Listings with Scrapy (the anti-crawling is strong, so the code below may already be outdated!) > updated 2019/4/2


Lagou's anti-crawling measures (each handled below):

  1. headers
  2. cookies
  3. IP proxy pool

Sample of the scraped data (.csv)


Imports: import scrapy, requests, time, json, random, pandas as pd, codecs

URL used to grab the cookies and headers: url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="

URL that serves the job JSON data: json_url = "https://www.lagou.com/jobs/positionAjax.json"
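
Before wiring these into Scrapy, it can help to sanity-check the two endpoints by hand. Here is a minimal sketch with plain requests; it is my own illustration, not part of the original spider, and it assumes the same pn/kd form fields and px/needAddtionalResult query parameters that the spider below uses:

# check_lagou.py - standalone sanity check, not part of the Scrapy project
import requests

LIST_URL = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="
JSON_URL = "https://www.lagou.com/jobs/positionAjax.json"

session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"
session.get(LIST_URL)  # visit the list page first so the session picks up Lagou's cookies

resp = session.post(
    JSON_URL,
    params={"px": "new", "needAddtionalResult": "false"},
    data={"pn": "1", "kd": "Python"},
    headers={"Referer": LIST_URL, "X-Requested-With": "XMLHttpRequest"},
)
jobs = resp.json().get("content", {}).get("positionResult", {}).get("result", [])
print(jobs[:3])  # first few jobs if the request was not blocked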

Everything below goes into the Scrapy project code.

Faking the cookies

Core code

def get_cookie(self):
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    response = requests.get(self.url, headers=headers)  # request the original list page
    r = requests.utils.dict_from_cookiejar(response.cookies)  # extract its cookies as a dict
    r["user_trace_token"] = r["LGRID"]
    r["LGSID"] = r["LGRID"]
    r["LGUID"] = r["LGRID"]  # build the related cookie parameters from LGRID
    cookies = {
        'X_MIDDLE_TOKEN': '797bc148d133274a162ba797a6875817',
        'JSESSIONID': 'ABAAABAAAIAACBI03F33A375F98E05C5108D4D742A34114',
        '_ga': 'GA1.2.1912257997.1548059451',
        '_gat': '1',
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059451',
        'user_trace_token': '20190121163050-dbd72da2-1d56-11e9-8927-525400f775ce',
        'LGSID': '20190121163050-dbd72f67-1d56-11e9-8927-525400f775ce',
        'PRE_UTM': '',
        'PRE_HOST': '',
        'PRE_SITE': '',
        'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F%3F_from_mid%3D1',
        'LGUID': '20190121163050-dbd73128-1d56-11e9-8927-525400f775ce',
        '_gid': 'GA1.2.1194828713.1548059451',
        'index_location_city': '%E5%85%A8%E5%9B%BD',
        'TG-TRACK-CODE': 'index_hotjob',
        'LGRID': '20190121163142-fb0cc9c0-1d56-11e9-8928-525400f775ce',
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': str(time.time()),
        'SEARCH_ID': '86ed37f5d8da417dafb53aa25cd6fbc0',
    }
    cookies.update(r)  # overwrite the template with the cookies returned by the list page
    return cookies
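
A note on the design (my reading of the snippet, not stated in the original): the long hard-coded cookies dict is just a template captured from one browser session. cookies.update(r) then overwrites the template with whatever cookies the list page actually returned, plus the user_trace_token/LGSID/LGUID values the code copies from LGRID, so every call to get_cookie() produces a cookie set tied to a fresh server-side session.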

To pose as a browser, just put this in settings.py:

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from fake_useragent import UserAgent
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

IP proxy pool: add a new class to middlewares.py as follows:

import random  # needed at the top of middlewares.py

class ProxyMiddleware(object):
    # free proxies (ip:port); swap in live ones, these are unlikely to still work
    proxyList = [
        '121.193.143.249:80', '112.126.65.193:80', '122.96.59.104:82', '115.29.98.139:9999',
        '117.131.216.214:80', '116.226.243.166:8118', '101.81.22.21:8118', '122.96.59.107:843',
    ]

    def process_request(self, request, spider):
        # attach a randomly chosen proxy to every outgoing request
        p = random.choice(self.proxyList)
        request.meta['proxy'] = 'http://' + p
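
The DOWNLOADER_MIDDLEWARES block below also enables scrapyTest.middlewares.RandomUserAgentMidderware, which the original post never shows; note that USER_AGENT = UserAgent().random in settings.py only picks a single UA once at startup, so a per-request middleware is what actually rotates it. A minimal sketch of what that class could look like, assuming fake_useragent is installed and keeping the original (misspelled) class name so the settings entry still matches:

from fake_useragent import UserAgent

class RandomUserAgentMidderware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # overwrite the User-Agent header with a fresh random one for every request
        request.headers['User-Agent'] = self.ua.random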

Then register the middlewares in settings.py and that's it:


DOWNLOADER_MIDDLEWARES = {
    'scrapyTest.middlewares.ScrapytestDownloaderMiddleware': None,  # disable the auto-generated project middleware
    'scrapyTest.middlewares.RandomUserAgentMidderware': 543,
    'scrapyTest.middlewares.ProxyMiddleware': 100,  # the proxy middleware defined above
}
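
For downloader middlewares a lower number means the middleware sits closer to the engine, and process_request is called in increasing order of these numbers, so the proxy (100) is attached before the user-agent middleware (543) runs; setting an entry to None disables that middleware entirely.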

The spider

# attributes of the spider class in lagou.py
name = "lagou"  # spider name used by `scrapy crawl lagou` at the bottom of the file
allowed_domains = ['lagou.com']
page = 1
sum_pages = 20
url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="  # list page, used by get_cookie() and as Referer
json_url = "https://www.lagou.com/jobs/positionAjax.json"
start_urls = [json_url]
header = {
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': url,
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'X-Anit-Forge-Token': 'None',
}
# free proxy IPs
proxy_list = [
    'http://140.224.76.21:808',
    'http://60.178.14.90:8081',
    'http://121.232.146.13:9000',
]
proxy_ip = random.choice(proxy_list)
data = {
    "pn": "1",       # page number
    "kd": "Python",  # search keyword
}
# query-string parameters of positionAjax.json (defined here but never attached to the requests below)
params = (
    ('px', 'new'),
    ('needAddtionalResult', 'false'),
)
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.FormRequest(url, formdata=self.data, headers=self.header,
                                 cookies=self.get_cookie(), callback=self.parse)

def parse(self, response):
    info = json.loads(response.text)
    jobs = info["content"]["positionResult"]["result"]
    # json.dump(jobs, self.f)
    self.parse_details(jobs)  # pull the useful fields out of each job dict
    time.sleep(random.randint(1, 3))
    try:
        while self.page < self.sum_pages:
            time.sleep(random.randint(1, 3))
            self.page += 1
            self.data["pn"] = str(self.page)
            # POST the next page with a freshly built cookie set
            yield scrapy.FormRequest(self.json_url, formdata=self.data, headers=self.header,
                                     cookies=self.get_cookie(), callback=self.parse)
    except Exception as e:
        print(e)
    finally:
        self.save_jobsList()  # write jobs_list to disk once the loop is done
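
As far as I can tell from the snippet, the finally branch runs whenever a parse generator finishes, so save_jobsList() rewrites the CSV after more or less every response (which is also why partial data survives when the crawl gets blocked). An alternative the original does not use is Scrapy's closed() hook, which fires exactly once when the spider shuts down:

def closed(self, reason):
    # called by Scrapy once, after the spider has finished or been stopped
    self.save_jobsList()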

Parsing the data and saving it as CSV

# more attributes of the spider class
jobs_list = []  # accumulates every job row; written out by save_jobsList() at the end
csv_path = "D:\\lagou_jobs.csv"
f = codecs.open("D:\\test.json", "a", "utf-8")  # raw dump, appended to as rows come in

def parse_details(self, jobs):
    for job in jobs:
        row = [job["companyFullName"], job["financeStage"], job["positionName"],
               job["positionLables"], job["salary"], job["city"], job["education"],
               job["workYear"], job["jobNature"], job["createTime"]]
        print("*****", row)
        self.f.write(str(row))
        self.jobs_list.append(row)

def save_jobsList(self):
    p = pd.DataFrame(data=self.jobs_list,
                     columns=["公司全名", "融资情况", "职位名", "标签", "薪水", "城市",
                              "学历要求", "经验要求", "工作类型", "发布时间"])
    p.to_csv(self.csv_path, index=True)
    print("Saved!")

Last lines of lagou.py: no need for the command line, just run lagou.py directly

if __name__=="__main__":
    from scrapy import cmdline
    cmdline.execute("scrapy crawl lagou".split())

Word cloud of the job titles
The crawl got blocked after only 73 records, and with that little data the word cloud comes out pretty ugly.


Sample mask image: cloud.png (my little sun)

Where does the ttf come from? On most Windows machines simhei.ttf sits under C:\Windows\Fonts; copy it next to the script (or to the desktop) or point font_path at its full path.


# word_cloud.py
import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from imageio import imread
from wordcloud import WordCloud

f = pd.read_csv("D:\\lagou_jobs.csv", encoding="utf-8")["职位名"]
print(f.describe())

def get_word(line):
    # pull quoted items out of a stringified list (useful for columns like 标签);
    # the 职位名 column holds plain strings, so those lines come back unchanged
    quoted = re.findall("'(.*?)'", line)
    if quoted:
        return "".join(quoted)
    return line

def paint_wordcloud():
    label_str = ""
    for line in f:
        label_str += get_word(line)
    print(label_str)
    cut_text = " ".join(jieba.cut(label_str))  # space-separated tokens so WordCloud counts jieba's words
    word_cloud = WordCloud(
        font_path='simhei.ttf',  # without a Chinese font the characters render as empty boxes
        background_color='white',
        mask=imread("D:\\Documents\\Pictures\\cloud.png"),
        max_words=1000,
        max_font_size=100,
    ).generate(cut_text)
    word_cloud.to_file('D:\\Documents\\Pictures\\word_cloud.jpg')
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

if __name__ == "__main__":
    paint_wordcloud()