Scrapy in practice: scraping crawler job listings from 51job
For the data analysis of these 51job crawler-related listings, see the companion post: 51job crawler job data analysis in practice.
Approach:
- First crawl the links to all detail pages and store them in a database (a minimal sketch for creating that database follows this list)
- Then create a second Scrapy spider that crawls those stored links
- Modules required: scrapy, urllib, pymysql
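The code below assumes a local MySQL database named 51job, reachable as root/123456 (the credentials used throughout this post). If it does not exist yet, a minimal sketch to create it with pymysql (connection parameters are taken from the code below; the rest is plain pymysql usage):

import pymysql

# Connect to the MySQL server without selecting a database yet;
# host/user/password match the values used in the spiders and pipelines below
conn = pymysql.connect(host="localhost", user="root", password="123456")
try:
    with conn.cursor() as cursor:
        cursor.execute("create database if not exists `51job` default character set utf8mb4")
    conn.commit()
finally:
    conn.close()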
Crawling the detail-page links
- This uses Scrapy's generic CrawlSpider
- Create it with: scrapy genspider -t crawl [name] [domains]
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from urllib.parse import quote


class Job51LinksSpider(CrawlSpider):
    name = 'job51_links'
    allowed_domains = ['51job.com']
    # The search keyword has to be URL-encoded twice for 51job's search URL
    position = quote(quote("爬虫"))
    start_urls = [f"https://search.51job.com/list/000000,000000,0000,00,9,99,{position},2,1.html?"]

    rules = (
        # Follow the pagination links on every result page
        Rule(LinkExtractor(restrict_xpaths="//div[@class='p_in']//li/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Some result rows use the class "t1 " (with a trailing space) instead of "t1",
        # so fall back to the second XPath when the first one matches nothing
        t1 = response.xpath("//p[@class='t1']//span//a/@href").extract()
        links = t1 if t1 else response.xpath("//p[@class='t1 ']//span//a/@href").extract()
        # Shape the rows before handing them to the MySQL pipeline:
        #   first 0  -> id (the column is auto_increment, so any placeholder works)
        #   url      -> the detail-page link
        #   second 0 -> status flag: 0 = not yet crawled, set to 1 once the page has been scraped
        links = [(0, url, 0) for url in links]
        # Hand over to the pipeline
        yield {"links": links}
Crawling the job detail pages
- This uses a regular Scrapy spider
- Create it with: scrapy genspider [name] [domains]
# -*- coding: utf-8 -*-
import scrapy, pymysql


class Job51ContentSpider(scrapy.Spider):
    name = 'job51_content'
    allowed_domains = ['51job.com']
    # MySQL connection and cursor shared by all requests of this spider
    mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
    cursor = mysql_cli.cursor()

    def start_requests(self):
        # Only fetch links with status = 0 (not yet crawled); the pipeline
        # flips the flag to 1 once a page has been stored
        self.cursor.execute("select id,url from links where status = 0")
        urls = self.cursor.fetchall()
        # i is the row id of the link; it is passed along via meta so the
        # pipeline can update the right row's status later
        for i, url in urls:
            yield scrapy.Request(url, callback=self.parse, meta={"id": i})

    def parse(self, response):
        # For demonstration purposes only the title, salary, company, city
        # and job requirements (demand) are extracted
        title = response.xpath("//div[@class='cn']//h1/text()").extract_first()
        salary = response.xpath("//div[@class='cn']//strong/text()").extract_first()
        company = response.xpath("//p[@class='cname']//a[@class='catn']/text()").extract_first()
        # Note: extract_first() can return None here; see the helper sketched after this block
        city = response.xpath("//p[@class='msg ltype']/text()").extract_first().strip()
        demand = response.xpath("string(//div[@class='bmsg job_msg inbox'])").extract_first()
        data = {
            "title": title,
            "salary": salary,
            "company": company,
            "city": city,
            "demand": demand,
            "url": response.url,
            "id": response.meta["id"]
        }
        # Hand over to the pipeline
        yield data
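One caveat with parse() above: extract_first() returns None when an XPath matches nothing (expired postings, layout changes), so the .strip() on city can raise an AttributeError and lose the item. A small defensive helper, assuming the same XPaths as above (safe_extract is a hypothetical name, not part of the original code):

def safe_extract(response, xpath_expr, default=""):
    # Return the stripped first match, or a default when nothing matches
    value = response.xpath(xpath_expr).extract_first()
    return value.strip() if value else default

# e.g. inside parse():
# city = safe_extract(response, "//p[@class='msg ltype']/text()")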
Item pipelines
These process the data yielded by the two spiders.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


# Stores the detail-page links collected by job51_links
class Job51_link_Pipeline(object):
    def open_spider(self, spider):
        # Called once when the spider starts
        self.mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
        self.cursor = self.mysql_cli.cursor()
        # Check whether the links table already exists
        exists = self.cursor.execute("show tables like 'links'")
        if not exists:
            # Create the table on first run
            try:
                sql = '''
                    create table links(
                        id int primary key auto_increment,
                        url varchar(150),
                        status int
                    )
                '''
                self.cursor.execute(sql)
                self.mysql_cli.commit()
            except Exception:
                self.mysql_cli.rollback()
                print("create table links failed")

    def process_item(self, item, spider):
        # Insert the collected links into MySQL
        try:
            sql = "insert into links values (%s,%s,%s)"
            # executemany performs a batch insert; the parameter looks like [(0, url1, 0), (0, url2, 0), ...]
            self.cursor.executemany(sql, item["links"])
            self.mysql_cli.commit()
        except Exception:
            self.mysql_cli.rollback()
            print("insert links failed")
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mysql_cli.close()


# Stores the scraped page content; works much like the pipeline above
class Job51_content_Pipeline(object):
    def open_spider(self, spider):
        self.mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
        self.cursor = self.mysql_cli.cursor()
        exists = self.cursor.execute("show tables like 'information'")
        if not exists:
            try:
                sql = '''
                    create table information(
                        id int primary key auto_increment,
                        title varchar(100),
                        salary varchar(10),
                        company varchar(100),
                        city varchar(50),
                        demand text(512),
                        url varchar(255)
                    )
                '''
                self.cursor.execute(sql)
                self.mysql_cli.commit()
            except Exception:
                self.mysql_cli.rollback()
                print("create table information failed")

    def process_item(self, item, spider):
        # Store one job posting per item
        try:
            info = "%s," * 6 + "%s"
            sql = f"insert into information values ({info})"
            # Rows are inserted one by one
            self.cursor.execute(sql, (
                0,
                item["title"],
                item["salary"],
                item["company"],
                item["city"],
                item["demand"],
                item["url"]
            ))
            # Mark the source link as crawled
            self.cursor.execute("update links set status=1 where id = %s", (item["id"],))
            self.mysql_cli.commit()
        except Exception as err:
            self.mysql_cli.rollback()
            print("insert information failed:", err)
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mysql_cli.close()
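A side note on the table checks: the "show tables like ..." query followed by a create works, but MySQL can do the existence check itself with "create table if not exists". A minimal sketch for the links table (ensure_links_table is a hypothetical helper name; the same idea applies to information):

def ensure_links_table(cursor):
    # "if not exists" makes the separate "show tables like ..." check unnecessary
    cursor.execute('''
        create table if not exists links(
            id int primary key auto_increment,
            url varchar(150),
            status int
        )
    ''')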
Scrapy settings
- Crawler-related settings; if anything is unclear, you can simply overwrite your settings.py with the block below
BOT_NAME = 'job_51'

SPIDER_MODULES = ['job_51.spiders']
NEWSPIDER_MODULE = 'job_51.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = "DEBUG"

ITEM_PIPELINES = {
    # When running the link spider, enable this line and comment out the next one
    # 'job_51.pipelines.Job51_link_Pipeline': 300,
    # Content scraping is enabled by default
    'job_51.pipelines.Job51_content_Pipeline': 300,
}
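Toggling ITEM_PIPELINES by hand works, but Scrapy also lets each spider pick its own pipeline through the custom_settings class attribute, so both spiders can live in the project without editing settings.py between runs. A sketch, reusing the class paths defined above:

import scrapy
from scrapy.spiders import CrawlSpider


class Job51LinksSpider(CrawlSpider):
    name = 'job51_links'
    # Route this spider's items to the link pipeline only
    custom_settings = {
        "ITEM_PIPELINES": {"job_51.pipelines.Job51_link_Pipeline": 300},
    }
    # ... start_urls, rules and parse_item stay exactly as shown earlier ...


class Job51ContentSpider(scrapy.Spider):
    name = 'job51_content'
    # Route this spider's items to the content pipeline only
    custom_settings = {
        "ITEM_PIPELINES": {"job_51.pipelines.Job51_content_Pipeline": 300},
    }
    # ... start_requests and parse stay exactly as shown earlier ...

Either way, the two phases run one after the other: scrapy crawl job51_links first to fill the links table, then scrapy crawl job51_content to scrape and store the postings.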
Original article: https://blog.csdn.net/weixin_44345359/article/details/107430338