Selenium 高效爬取拉勾网职位信息并保存至MongoDB
程序员文章站
2022-05-09 17:59:51
...
主程序:
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import time
from config import *
import pymongo
class LagouSpider(object):
    """Crawl Lagou "python" search results with Selenium and store them in MongoDB.

    The spider walks the paginated result list in one browser window, opens
    every posting in a temporary second window, extracts its fields with
    PyQuery and inserts one document per posting into
    ``self.db[MONGO_TABLE]``. Every parsed posting is also kept in
    ``self.positions``.

    An instance is single-use: ``run()`` releases the browser and the MongoDB
    client when it finishes or fails.
    """

    # CSS selector of the "next page" control below the result list.
    NEXT_PAGE_SELECTOR = (
        '#s_position_list > div.item_con_pager > div > span.pager_next'
    )

    # Whitespace and "/" characters removed from the city / experience /
    # education spans. Raw string: "\s" is not a valid str escape.
    _SEPARATORS = re.compile(r'[\s/]')

    def __init__(self):
        """Connect to MongoDB, start Chrome and set up a 10 second explicit wait."""
        # Build the Mongo handles first so a broken config fails before a
        # browser process has been launched.
        self.client = pymongo.MongoClient(MONGO_URL)
        self.db = self.client[MONGO_DB]
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 10)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def close(self):
        """Quit the browser and close the MongoDB client."""
        try:
            self.browser.quit()
        finally:
            self.client.close()

    def run(self):
        """Crawl every result page until the "next" control is disabled.

        Raises:
            TimeoutException: if the pager or a detail page does not show up
                within the explicit wait.
        """
        locator = (By.CSS_SELECTOR, self.NEXT_PAGE_SELECTOR)
        try:
            self.browser.get(self.url)
            while True:
                # Wait for the pager first and only then snapshot the DOM, so
                # the parsed source is the rendered list.
                self.wait.until(EC.element_to_be_clickable(locator))
                self.parse_list_page(self.browser.page_source)

                # find_element(By, ...) works on Selenium 3 and 4; the
                # find_element_by_* helpers no longer exist in Selenium 4.
                next_button = self.browser.find_element(*locator)
                css_classes = next_button.get_attribute('class') or ''
                if 'pager_next_disabled' in css_classes:
                    break
                next_button.click()
                # Fixed pause kept from the original; presumably gives the
                # next page of results time to render.
                time.sleep(2)
        finally:
            self.close()

    def parse_list_page(self, source):
        """Open the detail page of every posting link found in ``source``.

        Args:
            source: HTML of one result-list page.
        """
        doc = pq(source)
        for anchor in doc('.position .p_top .position_link').items():
            href = anchor.attr('href')
            if not href:
                # An anchor without a target cannot be opened; skip it.
                continue
            self.request_detail_page(href)
            # Fixed pause between detail pages, kept from the original.
            time.sleep(2)

    def request_detail_page(self, url):
        """Load ``url`` in a temporary window, parse it, then close the window.

        Args:
            url: Absolute URL of a posting detail page.

        Raises:
            TimeoutException: if the job-request block does not appear in time.
        """
        list_window = self.browser.current_window_handle
        # Open a blank window and navigate with get(): the page is requested
        # once, and the URL is never spliced into a JavaScript string.
        self.browser.execute_script('window.open("")')
        self.browser.switch_to.window(self.browser.window_handles[-1])
        try:
            self.browser.get(url)
            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.job_request span')))
            # Snapshot the DOM only after the awaited element exists.
            self.parse_detail_page(self.browser.page_source)
        finally:
            # Always return to the list window, even on a timeout, so the
            # window handles stay consistent for the next posting.
            self.browser.close()
            self.browser.switch_to.window(list_window)

    @classmethod
    def _clean_field(cls, text):
        """Return ``text`` with every whitespace and "/" character removed."""
        return cls._SEPARATORS.sub('', text or '')

    def parse_detail_page(self, source):
        """Extract one posting from detail-page HTML, record it and save it.

        Args:
            source: HTML of a posting detail page.
        """
        doc = pq(source)
        # The first four spans are read as salary, city, experience and
        # education. Pad with '' so a shorter block does not raise IndexError.
        spans = list(doc('.job_request span'))[:4]
        summary = [(doc(span).text() or '').strip() for span in spans]
        summary += [''] * (4 - len(summary))
        salary, city, work_years, education = summary

        position = {
            'position_name': doc('.job-name .name').text(),
            'salary': salary,
            'city': self._clean_field(city),
            "work_years": self._clean_field(work_years),
            "education": self._clean_field(education),
            "desc": (doc(".job_bt").text() or '').strip(),
            'company': (doc('.fl .fl-cn').text() or '').strip()
        }
        self.positions.append(position)
        self.save_to_mongo(position)

    def save_to_mongo(self, result):
        """Insert ``result`` into the MONGO_TABLE collection and report the outcome.

        Failures are printed, not raised, so one bad insert does not stop the
        crawl.

        Args:
            result: Dict describing one posting.
        """
        try:
            # insert_one replaces Collection.insert, which PyMongo 4 removed.
            if self.db[MONGO_TABLE].insert_one(result):
                print('保存到MONGODB成功', result)
        except Exception as exc:
            print('保存到MONGODB失败', result, exc)
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling right away.
    LagouSpider().run()
配置文件:
config.py
# MongoDB settings; the spider pulls these in via `from config import *`.
MONGO_URL: str = 'localhost'  # host handed to pymongo.MongoClient
MONGO_DB: str = 'lagou'  # database selected on the client
MONGO_TABLE: str = 'position'  # collection that receives the postings
上一篇: 世界上七个最“疯狂”的计划!真是异想天开匪夷所思!
下一篇: 求二叉树的层序遍历 python版本