拉勾网爬虫之利用selenium控制谷歌浏览器爬取职位信息
程序员文章站
2022-03-02 23:24:41
...
拉勾网爬虫之利用selenium控制谷歌浏览器爬取职位信息
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
# Module-level accumulator: one dict per scraped job posting is appended here
# by Lagou_Spider.parse_detail_url.
JOB_LIST: list = []
class Lagou_Spider(object):
    """Scrape job postings from a Lagou search-result listing.

    A Chrome browser driven by Selenium loads the listing page by page.
    Every posting linked from a page is opened in a new browser window,
    parsed with lxml, printed, and appended to the module-level JOB_LIST.
    """

    # Path of the chromedriver binary handed to webdriver.Chrome.
    driver_path = r"C:\ChromeDriver\chromedriver.exe"

    def __init__(self):
        """Start Chrome and store the search-results URL to crawl."""
        # NOTE(review): ``executable_path`` is the Selenium 3 keyword; newer
        # Selenium releases expect a Service object instead -- confirm which
        # Selenium version this script is pinned to before changing it.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        # The path segment is the URL-encoded search keyword "供应链".
        self.url = 'https://www.lagou.com/jobs/list_%E4%BE%9B%E5%BA%94%E9%93%BE?labelWords=&fromSearch=true&suginput='

    def run(self):
        """Crawl every result page, then shut the browser down.

        The browser is quit when the crawl finishes or fails, so the driver
        cannot be reused after this method returns.
        """
        try:
            self.driver.get(self.url)
            while True:
                self.parse_page_url(self.driver.page_source)
                next_btn = self.driver.find_element(By.CLASS_NAME, "pager_next")
                # On the last page the "next" button also carries the
                # pager_next_disabled class: the crawl is complete.
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                next_btn.click()
                # Pause before reading page_source of the following page.
                time.sleep(1)
        finally:
            # Always release the browser and the chromedriver process.
            self.driver.quit()

    def parse_page_url(self, source):
        """Visit every posting linked from one listing page.

        Args:
            source: HTML text of a search-result page.
        """
        html = etree.HTML(source)
        for detail_url in html.xpath('//a[@class="position_link"]/@href'):
            self.parse_detail_url(detail_url)
            # Pause between consecutive detail pages.
            time.sleep(2)

    def parse_detail_url(self, detail_url):
        """Open one posting in a new window, scrape it, and record the result.

        The new window is always closed and focus returned to the listing
        window, even when reading or parsing the page fails. A page without
        a job title is reported and skipped instead of being recorded.

        Args:
            detail_url: URL of the job-detail page.

        Raises:
            RuntimeError: if no new browser window appeared for the URL.
        """
        driver = self.driver
        listing_handle = driver.current_window_handle
        known_handles = set(driver.window_handles)

        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the call.
        driver.execute_script("window.open(arguments[0])", detail_url)

        new_handles = [h for h in driver.window_handles if h not in known_handles]
        if not new_handles:
            raise RuntimeError('no new window was opened for %s' % detail_url)
        driver.switch_to.window(new_handles[0])

        try:
            # NOTE(review): page_source is read right after the window is
            # opened, with no explicit wait for the page to finish loading.
            job_dic = self._extract_job(etree.HTML(driver.page_source))
        finally:
            driver.close()
            driver.switch_to.window(listing_handle)

        if job_dic is None:
            print('skipped (no job title found): %s' % detail_url)
            return
        print(job_dic)
        JOB_LIST.append(job_dic)

    @staticmethod
    def _extract_job(html):
        """Build the job dict from a parsed detail page, or None if it has no title.

        Fields other than the title that are absent from the page are
        stored as empty strings.
        """
        title = Lagou_Spider._pick(html.xpath('//h1[@class="name"]/text()'))
        if not title:
            return None

        company = Lagou_Spider._pick(html.xpath('//h4[@class="company"]/text()'))
        salary = Lagou_Spider._pick(html.xpath('//h3/span[@class="salary"]/text()'))

        # The header spans are read by position: 1 = address, 2 = experience,
        # 3 = education, 4 = full/part-time. The first three have their '/'
        # separators removed.
        header = html.xpath('//h3/span/text()')
        address = Lagou_Spider._pick(header, 1, drop_slash=True)
        years = Lagou_Spider._pick(header, 2, drop_slash=True)
        education = Lagou_Spider._pick(header, 3, drop_slash=True)
        status = Lagou_Spider._pick(header, 4)

        description = ''.join(html.xpath('//div[@class="job-detail"]//p/text()')).strip()
        # Join the label list into a single '|'-separated string.
        labels = '|'.join(html.xpath('//ul[@class="position-label clearfix"]/li/text()'))

        return {'职位': title,
                '标签': labels,
                '公司': company,
                '薪资': salary,
                '地址': address,
                '工作经验': years,
                '学历': education,
                '兼全职': status,
                '详细要求': description}

    @staticmethod
    def _pick(texts, index=0, drop_slash=False):
        """Return ``texts[index]`` stripped, or '' when that position is absent.

        Args:
            texts: list of strings (e.g. an XPath ``text()`` result).
            index: non-negative position to read.
            drop_slash: when true, remove every '/' before stripping.
        """
        if not 0 <= index < len(texts):
            return ''
        value = texts[index]
        if drop_slash:
            value = value.replace('/', '')
        return value.strip()
# Script entry point: build the spider (which launches the browser) and
# start crawling.
if __name__ == '__main__':
    spider = Lagou_Spider()
    spider.run()