Python爬取前程无忧网站上python的招聘信息
程序员文章站
2022-06-22 10:29:43
前言 本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。 作者: 我姓刘却留不住你的心 PS:如有需要Python学习资料的小伙伴可以点击下方链接自行获取 http://note.youdao.com/noteshare?id=30 ......
本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。
作者: 我姓刘却留不住你的心
PS:如有需要Python学习资料的小伙伴可以点击下方链接自行获取
本文获取的字段有:职位名称、公司名称、公司地点、薪资、发布时间
创建爬虫项目
# Create the Scrapy project, enter its directory, then generate a spider
# named "qcwy" from the "crawl" (CrawlSpider) template.
scrapy startproject qianchengwuyou
cd qianchengwuyou
scrapy genspider -t crawl qcwy www.xxx.com
items中定义爬取的字段
import scrapy


class qianchengwuyouitem(scrapy.Item):
    """One job posting scraped from the 51job search-result list.

    The fields are filled in by the spider's ``parse_item`` callback.

    NOTE: the class name is kept in lowercase because the spider module
    imports it under exactly this name.
    """

    # define the fields for your item here like:
    job_title = scrapy.Field()        # job title
    company_name = scrapy.Field()     # company name
    company_address = scrapy.Field()  # company location
    salary = scrapy.Field()           # salary text as shown on the page
    release_time = scrapy.Field()     # publication date text as shown on the page
qcwy.py文件内写主程序
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qianchengwuyou.items import qianchengwuyouitem


class qcwyspider(CrawlSpider):
    """Crawl the 51job search-result pages for "python" and yield one item per posting.

    Starts from result page 1 and follows every link that matches the
    paginated search URL, so all result pages reachable through the pager
    are visited and parsed by ``parse_item``.
    """

    name = 'qcwy'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?']
    # Example of a later result page (the number before ".html" is the page index):
    # https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,7.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
    rules = (
        Rule(
            # Dots are escaped so they only match literal "." characters; the
            # pattern is searched within the URL, so any query string after
            # ".html" is still accepted.
            LinkExtractor(allow=r'https://search\.51job\.com/list/000000,000000,0000,00,9,99,python,2,(\d+)\.html'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract every job row on a result page.

        Args:
            response: the downloaded search-result page.

        Yields:
            qianchengwuyouitem: one item per job row; a field is ``None``
            when its XPath matches nothing (``extract_first`` default).
        """
        # XPath attribute values are case-sensitive.
        # NOTE(review): the id is restored to "resultList" from memory of the
        # old 51job markup, not from this file - confirm against the live page.
        # position()>1 skips the first "el" row - presumably the table header.
        list_job = response.xpath('//div[@id="resultList"]/div[@class="el"][position()>1]')
        for job in list_job:
            item = qianchengwuyouitem()
            item['job_title'] = job.xpath('./p/span/a/@title').extract_first()
            item['company_name'] = job.xpath('./span[1]/a/@title').extract_first()
            item['company_address'] = job.xpath('./span[2]/text()').extract_first()
            item['salary'] = job.xpath('./span[3]/text()').extract_first()
            item['release_time'] = job.xpath('./span[4]/text()').extract_first()
            yield item
pipelines.py文件中写下载规则
import pymysql


class qianchengwuyoupipeline(object):
    """Item pipeline that inserts every scraped job posting into the MySQL table ``qcwy``.

    The connection is opened once when the spider starts and closed when it
    finishes; each item is written and committed individually.
    """

    # Both are assigned in open_spider and stay None until then.
    conn = None
    mycursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and a cursor when the spider starts."""
        print('链接数据库...')
        # NOTE(review): no charset is passed; if the server/driver default is
        # not UTF-8 capable, Chinese text may fail to insert - confirm.
        self.conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
        self.mycursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row of ``qcwy`` and commit.

        Args:
            item: mapping with the keys job_title, company_name,
                company_address, salary and release_time.
            spider: the running spider (unused).

        Returns:
            The same item, so later pipelines / feed exports still receive it.
        """
        print('正在写数据库...')
        # Parameterized query: the driver escapes the scraped values, so
        # quotes in a job title cannot break or alter the statement, and a
        # missing value (None) is stored as SQL NULL instead of the text "None".
        sql = 'insert into qcwy values (null,%s,%s,%s,%s,%s)'
        params = (
            item['job_title'],
            item['company_name'],
            item['company_address'],
            item['salary'],
            item['release_time'],
        )
        self.mycursor.execute(sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        print('写入数据库完成...')
        # Guard against the connection never having been opened.
        if self.mycursor is not None:
            self.mycursor.close()
        if self.conn is not None:
            self.conn.close()
settings.py文件中打开下载管道和请求头
# Scrapy only reads UPPERCASE names from the settings module.

# Enable the MySQL pipeline; 300 is its order value (pipelines with lower
# values run first).
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.qianchengwuyoupipeline': 300,
}

# Identify as a desktop browser instead of Scrapy's default User-Agent.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
运行爬虫,同时写入.json文件
# Run the "qcwy" spider, export the scraped items to qcwy.json, and suppress log output.
scrapy crawl qcwy -o qcwy.json --nolog
查看数据库是否写入成功。