欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

拉钩爬虫

程序员文章站 2022-07-11 16:17:38
#!/usr/bin/env python# -*- coding:utf-8 -*-import jsonimport reimport timeimport lxml.htmlfrom selenium import webdriverfrom selenium.webdriver.common ......
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import re
import time

import lxml.html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from redis_cache import rediscache


class lagouspider(object):
    """Selenium crawler for Python job listings on lagou.com.

    Walks the paginated search results, opens every job's detail page in a
    new browser tab, extracts the job fields with lxml XPath queries, and
    stores each record in a redis-backed cache keyed by the detail-page URL.
    """

    def __init__(self):
        # Launch Chrome through Selenium. The original called
        # webdriver.chrome() — that is the package, not the driver class —
        # which raises TypeError; webdriver.Chrome() is the driver.
        self.driver = webdriver.Chrome()
        # Entry point: the "python" keyword search listing.
        self.url = 'https://www.lagou.com/jobs/list_python?labelwords=&fromsearch=true&suginput='
        # Detail-page URL currently being processed; set per link and used
        # as the cache key when the parsed record is stored.
        self.detail_url = None

    def run(self):
        """Open the listing page and crawl every results page.

        Stops when the pager's "next" button carries the
        ``pager_next_disabled`` class, i.e. there is no further page.
        """
        self.driver.get(self.url)
        while True:
            # Wait until the pager element exists — the listing is rendered
            # by JavaScript, so page_source is only useful after this.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="pager_container"]/span[last()]')
                )
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath(
                    '//div[@class="pager_container"]/span[last()]'
                )
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                next_btn.click()
            except Exception:
                # Best-effort: the original swallowed every error and dumped
                # the page source for debugging; keep that behavior but do
                # not use a bare except.
                print(source)
            time.sleep(1)  # throttle between result pages

    def parse_list_page(self, source):
        """Parse one results page and crawl each job's detail page.

        :param source: HTML of the listing page.
        :return: None
        """
        html = lxml.html.fromstring(source)
        # Detail-page links of every posting on this results page.
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.detail_url = link
            self.requests_detail_page(link)
            time.sleep(1)  # throttle between detail requests

    def requests_detail_page(self, url):
        """Open *url* in a new tab, parse it, close the tab.

        :param url: detail-page URL.
        :return: None
        """
        self.driver.execute_script("window.open('%s')" % url)
        # The new tab is the last handle; switch to it, wait for the
        # job-name element so the page is fully rendered.
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="job-name"]//span[@class="name"]')
            )
        )
        source = self.driver.page_source
        # NOTE(review): method name keeps the original "datail" typo so any
        # external caller of this public method keeps working.
        self.parse_datail_page(source)
        self.driver.close()
        # Back to the listing tab.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_datail_page(self, source):
        """Extract job fields from a detail page and cache the record.

        :param source: HTML of the detail page.
        :return: None
        """
        html = lxml.html.fromstring(source)

        job_name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
        job_salary = html.xpath('//dd[@class="job_request"]/p//span[1]/text()')[0]
        job_city = html.xpath('//dd[@class="job_request"]/p//span[2]/text()')[0]
        # Strip whitespace and the "/" separators the site places around
        # the request-field values.
        job_city = re.sub(r"[\s/]", "", job_city)
        experience = html.xpath('//dd[@class="job_request"]/p//span[3]/text()')[0].strip()
        experience = re.sub(r"[\s/]", "", experience)
        education = html.xpath('//dd[@class="job_request"]/p//span[4]/text()')[0]
        education = re.sub(r"[\s/]", "", education)
        job_time = html.xpath('//dd[@class="job_request"]/p//span[5]/text()')[0]
        job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')[0]
        desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
        job_address = "".join(html.xpath('//div[@class="work_addr"]//text()'))
        # Drops the last 4 characters — presumably the trailing "view map"
        # link text on the address block; TODO confirm against the page.
        job_address = re.sub(r"[\s/]", "", job_address)[0:-4]

        position = {
            'job_name': job_name,
            'job_salary': job_salary,
            'job_city': job_city,
            'experience': experience,
            'education': education,
            'job_advantage': job_advantage,
            'desc': desc,
            'job_address': job_address,
            'job_time': job_time,
        }

        # Store under the detail URL; read back (stored as JSON) and print
        # to confirm the write round-trips through the cache.
        rc = rediscache()
        rc[self.detail_url] = position
        position_print = json.loads(rc[self.detail_url])
        print(self.detail_url)
        print(position_print)
        print('=' * 40)


if __name__ == '__main__':
    # Build the crawler and start walking the listing pages.
    lagouspider().run()