selenium爬取lagou

程序员文章站 2022-04-03 16:08:51

from selenium import webdriver import time from lxml import etree import re class LagouSpider(object): def __init__(self): self.driver = webdriver.Chr ......

from selenium import webdriver
import time
from lxml import etree
import re


class lagouspider(object):
    """Selenium-driven crawler for the Lagou "python" job listing.

    Walks the paginated job list in a Chrome window, opens every job's detail
    page in a second browser window, extracts the posting fields with lxml
    XPath queries and prints each posting as a dict.
    """

    def __init__(self):
        """Start a Chrome session and remember the job-list URL."""
        # The Selenium driver class is ``Chrome`` (capital C).
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%e5%85%a8%e5%9b%bd#filterbox"

    @staticmethod
    def _strip_separators(text):
        """Return *text* with every "/" and whitespace character removed.

        ``\\s`` on a ``str`` pattern already covers the non-breaking space
        (``\\xa0``), so no extra character-class entry is needed for it.
        """
        return re.sub(r'[/\s]', '', text)

    def run(self):
        """Crawl every page of the job list, then shut the browser down.

        The loop stops when the last ``span`` of the pager carries the
        ``pager_next_disabled`` class. The driver is quit even if parsing
        raises, so no browser process is left behind.
        """
        try:
            self.driver.get(self.url)
            while True:
                source = self.driver.page_source
                self.parse_page_list(source)
                # The string locator "xpath" works on Selenium 3 and 4;
                # find_element_by_xpath was removed in Selenium 4.
                next_btn = self.driver.find_element(
                    "xpath", "//div[@class='pager_container']/span[last()]"
                )
                # get_attribute may return None when the attribute is absent.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                # Give the list a moment to re-render; reading page_source
                # right after the click can still return the previous page.
                time.sleep(1)
        finally:
            self.driver.quit()

    def parse_page_list(self, source):
        """Extract the detail-page links from a list page and visit each one.

        Args:
            source: HTML text of a job-list page.
        """
        html = etree.HTML(source)
        detail_urls = html.xpath("//div/a[@class='position_link']/@href")
        for detail_url in detail_urls:
            self.get_detail_page(detail_url)
            # Pause between detail pages to throttle the request rate.
            time.sleep(1)

    def get_detail_page(self, detail_url):
        """Open *detail_url* in a new window, parse it, and return to the list.

        Args:
            detail_url: URL of a single job posting.
        """
        # Open a new window. The URL is passed as a script argument instead of
        # being formatted into the JavaScript source, so quotes in the URL
        # cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            source = self.driver.page_source
            self.parse_datail_page(source)
        finally:
            # Close the detail window and switch back to the job-list window
            # even when parsing fails; otherwise the stale window would still
            # be window_handles[1] for the next posting.
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_datail_page(self, source):
        """Extract the posting fields from a detail page and print them.

        Args:
            source: HTML text of a job detail page.

        Raises:
            IndexError: if an expected element is missing from the page.
        """
        html = etree.HTML(source)
        job_name = html.xpath("//div[@class='job-name']/h2/text()")[0].strip()
        # The job_request spans are read positionally:
        # salary, city, work experience, education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        job_salary = job_request_spans[0].xpath("./text()")[0].strip()
        city = self._strip_separators(job_request_spans[1].xpath("./text()")[0])
        work_year = self._strip_separators(job_request_spans[2].xpath("./text()")[0])
        education = self._strip_separators(job_request_spans[3].xpath("./text()")[0])
        company_name = html.xpath("//h3[@class='fl']//text()")[0].strip()
        desc = "".join(html.xpath("//dl[@id='job_detail']/dd[@class='job_bt']//text()"))
        # The previous pattern [/\s\\xa] also deleted backslashes and the
        # letters "x" and "a" from the description text.
        desc = self._strip_separators(desc)
        position = {
            "name": job_name,
            "job_salary": job_salary,
            "city": city,
            "work_year": work_year,
            "education": education,
            "company_name": company_name,
            "desc": desc,
        }
        print(position)


# Script entry point: build the spider and start crawling right away.
# NOTE: these statements run unconditionally at module level (there is no
# ``if __name__ == "__main__"`` guard), so importing this file also runs them.
lagou = lagouspider()
lagou.run()

上一篇： Intellij IDEA导入JAVA项目并启动(图文教程)

下一篇： jquery处理页面弹出层查询数据等待操作实例教程

selenium爬取lagou

利用Python爬取可用的代理IP

教你用python3根据关键词爬取百度百科的内容

c#爬虫爬取京东的商品信息

Okhttp3实现爬取验证码及获取Cookie的示例

Python爬取王者荣耀全英雄全皮肤图片

requests+xpath+json爬取糗事百科

Java爬虫实现爬取京东上的手机搜索页面 HttpClient+Jsoup

利用python爬虫爬取斗鱼图片(简单详细)

python scrapy框架爬取80s保存mysql

几行Python代码爬取3000+上市公司的信息