Scraping Zhilian (zhaopin.com) job listings with the Scrapy framework plus a simulated browser
Zhilian's pages are rendered dynamically by JavaScript, so an ordinary request only gets the page as it is before the scripts have run. To obtain the fully loaded page, a simulated (Selenium-driven) browser is used to fetch the complete HTML.
The code below is only a simple implementation that scrapes part of the Zhilian listing page; implement the rest yourself as needed. A quick standalone check of the idea follows.
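Before wiring Selenium into Scrapy, the idea can be verified on its own: open the search page in a driven browser, give the scripts time to run, and read page_source afterwards. A minimal sketch in the same Selenium 3 style used in this article (the geckodriver path is just a placeholder):

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time

options = Options()
options.add_argument('-headless')  # run without opening a window
browser = webdriver.Firefox(executable_path=r"D:\tools\geckodriver.exe",  # placeholder path
                            firefox_options=options)
browser.get('https://sou.zhaopin.com/?jl=489&kw=python&kt=3')
time.sleep(5)                      # give the JavaScript time to render the listings
html = browser.page_source         # now contains the rendered job list
print(len(html))
browser.quit()

If the printed length is noticeably larger than what a plain Scrapy request returns for the same URL, the page is indeed JS-rendered and the middleware below is needed.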
Middleware (middlewares.py) code:
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import time
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options


class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        # self.options.add_argument('-headless')
        # self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe",
        #                                 chrome_options=self.options)
        self.browser = webdriver.Firefox(executable_path=r"D:\python_others\Spider\code\day06\tools\geckodriver.exe",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 2:
            # scroll to the bottom of the page so the lazily loaded results are rendered
            self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            div = self.browser.find_element_by_css_selector(".soupager")
            # find_elements (plural) is needed so the second button ("next page") can be indexed
            next_page = div.find_elements_by_tag_name("button")
            next_page[1].click()
            # page = self.browser.find_element_by_xpath('//*[@id="pagination_content"]/div/button[2]')
            # page.click()
            # time.sleep(10)
        else:
            if int(request.meta['page']) == 0:
                try:
                    print("url is ::::", request.url)
                    self.browser.get(request.url)
                except TimeoutException:
                    # Selenium raises TimeoutException (not the built-in TimeoutError) on page-load timeouts
                    print("page load timed out")
        time.sleep(5)
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8",
                            request=request)

    # If you still want some requests to go through the normal downloader,
    # simply don't return a response from the middleware for them.
    # If a page keeps loading forever (the spinner never stops), stopping the load
    # is enough to make the already-rendered content available:
    # self.browser.execute_script('window.stop()')
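For Scrapy to actually route requests through this middleware, it must be enabled in the project's settings.py. A minimal sketch; the dotted path assumes the project package is called zhilian, so adjust it to your own project name:

# settings.py (excerpt)
DOWNLOADER_MIDDLEWARES = {
    # dotted path is an assumption -- use your project's real package name
    'zhilian.middlewares.SeleniumMiddleware': 543,
}

With this in place, every request the spider yields is handed to SeleniumMiddleware.process_request, and the HtmlResponse it returns is what arrives in the spider's callbacks.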
Spider file (spider.py) code:
# -*- coding: utf-8 -*-
import time

import lxml.html
import scrapy
from scrapy import Request


class JobDes(object):
    def __init__(self):
        self.detail_url = ""
        self.title = ""


def parse_lxml_zhilian(html_str):
    # parse one search-result item and return the detail-page links found in it
    tree = lxml.html.fromstring(html_str)
    job_url = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_name = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')
    print(job_url)
    print(job_name)
    return job_url


# global counter used to check whether paging runs far ahead of the detail downloads
count = 0


class ZhaopinSpider(scrapy.Spider):
    name = 'zhaopin'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        url_str = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'
        yield Request(url=url_str, callback=self.parse, meta={"page": "0"})

    def parse(self, response):
        # the Selenium middleware loads the ajax page and clicks through to the next one;
        # the markup changes over time, so be careful with numeric selectors such as nth-child(1),
        # and debug them on a simple, reproducible page first (Selenium also works for UI testing)
        rs = response.css('#listContent > div:nth-child(1)')
        page_next = response.xpath('//*[@id="pagination_content"]/div/button[2]')
        # other selectors seen on the page:
        # listContent > div:nth-child(1)
        # pagination_content > div > button:nth-child(7)
        # button.btn:nth-child(8)
        print("rs is :::::", rs)
        print("page_next is :::::", page_next)
        # every listing page adds 60 detail-download tasks
        global count
        count += 60
        for r in rs:
            # r is a Selector; pass its HTML to lxml and yield one request per detail link
            for job_url in parse_lxml_zhilian(r.extract()):
                yield Request(url=job_url, callback=self.parse_detail, meta={"page": "3"}, dont_filter=True)
        if len(page_next) > 0:
            # when more than 300 detail pages are still pending, pause paging until downloads catch up
            # (note: this sleep blocks the reactor, so it is only a rough throttle)
            while count > 300:
                time.sleep(0.5)
            yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)

    def parse_detail(self, response):
        pass
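parse_detail is left as a stub above. One way to fill it in, so that the global counter goes back down and the paging loop in parse can continue, might look like this (the CSS selector is a placeholder, not Zhilian's actual detail-page markup):

    def parse_detail(self, response):
        # every finished detail page frees one slot for the pager in parse()
        global count
        count -= 1
        yield {
            'url': response.url,
            # placeholder selector -- inspect the real detail page to pick the right one
            'title': response.css('h1::text').extract_first(),
        }

Run the spider as usual with scrapy crawl zhaopin from the project directory.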