使用 Selenium 爬取拉勾网 Python 职位信息
程序员文章站
2022-05-09 17:41:22
...
直接上代码，主要爬取的是拉勾网上广州地区的 Python 职位信息：
from selenium import webdriver
import time
from lxml import etree
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import xlwt
import csv
class LagouSpider(object):
    """Crawl Python job postings from lagou.com with a Selenium-driven Chrome.

    The spider loads the search-result list page, opens every posting found
    on it in a second browser tab, extracts the fields of interest, and then
    clicks "next page" until the pager reports that it is on the last page.
    All collected postings are finally written to ``positons.csv``.

    Class attributes:
        driver_path: Filesystem path of the chromedriver executable.
        chromeOptions: Chrome options shared by every spider instance.
    """

    driver_path = r"H:\python\chromedriver.exe"
    chromeOptions = webdriver.ChromeOptions()
    # The original passed a PhantomJS-style ``service_args = [...]`` string
    # here, which Chrome does not understand.  Chrome's own switch for
    # tolerating certificate errors is used instead.
    chromeOptions.add_argument("--ignore-certificate-errors")
    # Drop Chrome's "enable-automation" switch.  The switch name must not
    # contain spaces, otherwise nothing is excluded.
    chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Route traffic through an HTTP proxy (no trailing whitespace in the URL).
    chromeOptions.add_argument("--proxy-server=http://47.100.7.167:8989")

    def __init__(self):
        """Start Chrome and initialise the crawl state."""
        # NOTE(review): ``chrome_options`` / ``executable_path`` are the
        # keyword names this script was written against; newer Selenium
        # releases expect ``options=`` / ``service=`` -- confirm against the
        # installed Selenium version.
        self.driver = webdriver.Chrome(chrome_options=LagouSpider.chromeOptions,executable_path=LagouSpider.driver_path,)
        # Search-result list page for "python"; per the accompanying article
        # the city filter in this URL selects Guangzhou.
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_213?px=default#filterBox'
        # One dict per successfully parsed posting.
        self.positions = []

    def run(self):
        """Crawl every result page, then quit the browser and save the data.

        The browser is shut down and the postings collected so far are
        written to disk even when the crawl is interrupted by an exception,
        so a late failure neither leaks a Chrome process nor discards the
        work already done.
        """
        try:
            self.driver.get(self.url)
            while True:
                source = self.driver.page_source
                self.parse_list_page(source)
                # The pager marks its "next" button as disabled on the last page.
                if re.search(r'action="next" class="pager_next pager_next_disabled"', self.driver.page_source):
                    break
                self.next_page()  # click through to the next result page
        finally:
            try:
                self.driver.quit()  # shut the browser down
            finally:
                self.write_to_csv()  # persist whatever was collected

    def parse_list_page(self, source):
        """Visit every posting linked from one result page.

        Args:
            source: HTML source of a search-result list page.

        Each posting is opened in a temporary second tab so that the list
        page stays loaded in the original tab.  Postings whose detail page
        does not have the expected layout are skipped instead of aborting
        the crawl.
        """
        html = etree.HTML(source)
        if html is None:  # lxml returns None for empty input
            return
        links = html.xpath("//a[@class='position_link']/@href")
        if not links:
            return

        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)
        self.driver.execute_script("window.open()")  # open a new tab
        new_handles = [h for h in self.driver.window_handles if h not in known_handles]
        self.driver.switch_to.window(new_handles[0])  # switch to the new tab
        try:
            for url in links:  # walk through the detail pages
                self.driver.get(url)  # load the detail page
                position = self._parse_detail_page(self.driver.page_source)
                if position is None:
                    print('skipped (unexpected page layout): {}'.format(url))
                else:
                    self.positions.append(position)
                    print(position)
                time.sleep(5)  # throttle requests between detail pages
        finally:
            # Always return to the list tab, even if a detail page failed.
            self.driver.close()  # close the detail tab
            self.driver.switch_to.window(list_handle)  # back to the list tab

    def _parse_detail_page(self, source):
        """Extract one posting from a detail page; return None if it is malformed.

        Args:
            source: HTML source of a job detail page.

        Returns:
            A dict with the keys ``title``, ``company``, ``salary``, ``city``,
            ``work_year``, ``education`` and ``desc``, or ``None`` when any
            of the expected nodes is missing.
        """
        html = etree.HTML(source)
        if html is None:
            return None
        titles = html.xpath('//h4[@class="company"]/text()')
        job_request_span = html.xpath('//dd[@class ="job_request"]/h3/span/text()')
        companies = html.xpath('//em[@class="fl-cn"]/text()')
        if not titles or not companies or len(job_request_span) < 4:
            return None
        # The first four spans are read as salary, city, experience and
        # education; whitespace and the "/" separators are stripped.
        salary, city, work_year, education = (
            re.sub(r"[\s/]", "", text) for text in job_request_span[:4]
        )
        desc = ''.join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
        return {
            'title': titles[0],
            'company': companies[0].strip(),
            'salary': salary,
            'city': city,
            'work_year': work_year,
            'education': education,
            'desc': desc,
        }

    def next_page(self):
        """Click the pager's "next" button and give the page a moment to load."""
        # Wait until the button can actually be clicked, not merely until it
        # exists in the DOM.
        element = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "pager_next")))
        element.click()  # go to the next result page
        time.sleep(1)

    def write_to_csv(self):
        """Write all collected postings to ``positons.csv`` (UTF-8, with header)."""
        # 'city' must be listed: DictWriter raises ValueError for dict keys
        # that are missing from the field names.
        header = ['title', 'company', 'desc', 'salary', 'city', 'work_year', 'education']
        with open('positons.csv', 'w', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, header)
            writer.writeheader()
            writer.writerows(self.positions)
if __name__ == '__main__':
    # Script entry point: build the spider (its constructor launches Chrome)
    # and run the full crawl.
    LagouSpider().run()
上一篇: Codeforces 3B
下一篇: python爬虫之爬取拉勾网职位信息