Scraping Python engineer job postings from Zhilian Zhaopin (zhaopin.com) with Scrapy
# -*- coding:utf-8 -*-
'''
import sys
import codecs
import chardet
import scrapy
from scrapy import log
from scrapy.selector import Selector
from tutorial.items import DmozItem
'''
import pymongo
import scrapy
from scrapy.http import Request
#import sys
import codecs
import re
#from tutorial.items import DmozItem
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["zhaopin.com"]
    start_urls = ["http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&sm=0&sg=3e5d326a89a44a8594d2904a3a207b9e&p=1"]
    def parse2(self, response):
        f = codecs.open(r'F:\test\tutorial\a', 'a', 'utf-8')
        # Initialize the fields up front so the MongoDB insert at the end
        # still works even if one of the extraction steps fails.
        pay = ''
        require_data_middle = ''
        company_data = ''
        # Salary and benefits
        try:
            pay_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/ul[@class="terminal-ul clearfix"]/li')
            for sel in pay_data:
                pay_middle_data = sel.xpath('strong').extract()
                # Strip HTML tags, keep the plain text.
                pay = pay + re.sub(r'<.*?>', '', pay_middle_data[0]) + ' '
            f.write(pay + '\n')
        except Exception:
            pass
        # Job requirements
        try:
            require_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/div[@class="terminalpage-main clearfix"]/div[@class="tab-cont-box"]/div[1]/p').extract()
            for i in require_data:
                # re.S must be passed as flags=; as a bare positional argument
                # it would be treated as re.sub's count parameter.
                i_middle = re.sub(r'<.*?>', r'', i, flags=re.S)
                require_data_middle = require_data_middle + re.sub(r'\s*', r'', i_middle, flags=re.S)
            f.write(require_data_middle + '\n')
        except Exception:
            pass
        # Company address
        try:
            company_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/div[@class="terminalpage-main clearfix"]/div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]/h2').extract()
            company_data = re.search(r'<h2>\s*(.*?)\s*<a', company_data[0], re.S).group(1)
            f.write(company_data + '\n')
        except Exception:
            pass
        # URL of the job posting
        company_url = response.url
        f.write(company_url + '\n\n')
        f.close()
        conn = pymongo.MongoClient('192.168.1.5', 27017)
        db = conn.test
        employee = db.employee
        employee.insert_one({"url": company_url, "pay": pay, "require_data": require_data_middle, "company_data": company_data})
    def parse(self, response):
        # Collect the detail-page links for every job listed on this page.
        url_data = response.xpath('//div[@id="newlist_list_content_table"]//td[@class="zwmc"]//a/@href').extract()
        # f = codecs.open(r'F:\test\tutorial\a', 'a', 'utf-8')
        # for company_url in url_data:
        #     f.write(company_url + '\n')
        # f.close()
        for company_url in url_data:
            yield Request(company_url, callback=self.parse2)
        # Follow the "next page" link, if there is one.
        next_page_url = response.xpath('//body/div[@class="main"]/div[@class="search_newlist_main"]/div[@class="newlist_main"]/form[@name="frmMain"]/div[@class="clearfix"]/div[@class="newlist_wrap fl"]/div[@class="pagesDown"]/ul/li[@class="pagesDown-pos"]/a/@href').extract()
        if next_page_url:
            yield Request(next_page_url[0], callback=self.parse)
'''
    # Earlier version of parse: extracts the same fields but only writes them
    # to the local file, without the MongoDB insert.
    def parse(self, response):
        f = codecs.open(r'F:\test\tutorial\a', 'a', 'utf-8')
        # Salary and benefits
        pay_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/ul[@class="terminal-ul clearfix"]/li')
        pay = ''
        for sel in pay_data:
            try:
                pay_middle_data = sel.xpath('strong').extract()
                pay = pay + re.sub(r'<.*?>', '', pay_middle_data[0]) + ' '
            except Exception:
                pass
        f.write(pay + '\n')
        # Job requirements
        require_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/div[@class="terminalpage-main clearfix"]/div[@class="tab-cont-box"]/div[1]/p').extract()
        require_data_middle = ''
        for i in require_data:
            i_middle = re.sub(r'<.*?>', r'', i, flags=re.S)
            require_data_middle = require_data_middle + re.sub(r'\s*', r'', i_middle, flags=re.S)
        f.write(require_data_middle + '\n')
        # Company address
        company_data = response.xpath('//body/div[@class="terminalpage clearfix"]/div[@class="terminalpage-left"]/div[@class="terminalpage-main clearfix"]/div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]/h2').extract()
        company_data = re.search(r'<h2>\s*(.*?)\s*<a', company_data[0], re.S).group(1)
        f.write(company_data + '\n')
        # URL of the job posting
        company_url = response.url
        f.write(company_url + '\n\n')
        f.close()
'''
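
The spider above opens a new MongoClient for every detail page it parses. A common Scrapy alternative is to move the database write into an item pipeline so one connection is shared for the whole crawl. Below is a minimal sketch, assuming a pipelines.py in the same tutorial project and the same MongoDB host as the spider (192.168.1.5:27017); the class name MongoPipeline is illustrative and not part of the original code.

# pipelines.py -- minimal sketch, not part of the original post
import pymongo

class MongoPipeline(object):
    def open_spider(self, spider):
        # One MongoDB connection per crawl instead of one per parsed page.
        self.conn = pymongo.MongoClient('192.168.1.5', 27017)
        self.employee = self.conn.test.employee

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # item can be a plain dict yielded from parse2, e.g.
        # {"url": ..., "pay": ..., "require_data": ..., "company_data": ...}
        self.employee.insert_one(dict(item))
        return item

With this in place, parse2 would yield the dict instead of calling pymongo directly, and the pipeline would be enabled in settings.py with ITEM_PIPELINES = {'tutorial.pipelines.MongoPipeline': 300}. In either variant the spider is started from the project directory with scrapy crawl dmoz.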
Reposted from: https://blog.51cto.com/8588103/1761043