Python crawler 07: defining and using scrapy items, and getting an MD5 hash
Defining and using scrapy items
When there are too many fields to scrape, you need a way to bundle them together.
scrapy provides the Item class for this.
An items.py was already generated when the spider project was created.
Item is an enhanced version of dict.
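A quick sketch (not from the original post) of what "enhanced dict" means in practice: an Item only accepts fields declared on the class, so typos fail fast instead of silently creating new keys.

import scrapy

class DemoItem(scrapy.Item):  # hypothetical item, for illustration only
    title = scrapy.Field()

item = DemoItem()
item['title'] = 'hello'   # fine: 'title' is a declared field
# item['titel'] = 'typo'  # raises KeyError; a plain dict would accept it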
Disabling the robots protocol:
A site's robots protocol can be viewed in a browser at domain + /robots.txt.
To ignore the robots protocol:
search for ROBOTSTXT_OBEY = True in settings.py
and change True to False.
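So the relevant line in settings.py ends up as:

# settings.py
ROBOTSTXT_OBEY = False  # True makes scrapy fetch robots.txt first and skip disallowed urls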
While you're in settings.py, also search for pipelines and uncomment the default ITEM_PIPELINES entry:
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}
The 300 is a priority: the smaller the number, the earlier that pipeline runs.
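For example, if a second pipeline were registered (JsonExportPipeline here is hypothetical, only to show the ordering), items would pass through the lower number first:

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.JsonExportPipeline': 200,     # hypothetical, runs first
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,  # runs second
}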
Adding an MD5 utility:
Create a new Python package folder named utils (under the ArticleSpider package, so that the import ArticleSpider.utils.common used below resolves), create a common.py file inside it, and copy the following code in.
import hashlib

def get_md5(url):
    # hashlib works on bytes, so encode a str url before hashing
    if isinstance(url, str):
        url = url.encode('utf8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__ == '__main__':
    print(get_md5('https://cnblogs.com'))
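The digest is always a 32-character hex string, so url_object_id gives each article a fixed-length primary key, which is easier to index in a database than raw URLs of varying length.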
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JobBoleArticleItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()    # md5 of the url
    title = scrapy.Field()
    create_date = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field() # where the image is stored after download
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()
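scrapy.Field() carries no type information; internally it is just a dict used to hold optional per-field metadata, so every field is declared the same way.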
jobbole.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from urllib import parse
import json
import re
from ArticleSpider.utils.common import get_md5
from ArticleSpider.items import JobBoleArticleItem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        """
        parse usually holds the crawl strategy; page parsing is written elsewhere.
        1. Grab the article urls from the news list page, hand them to scrapy to
           download, then call the matching parse method.
        2. Also grab the next list page and keep looping.
        :param response: the downloaded result this method is called with
        :return:
        """
        post_nodes = response.css('#news_list .news_block')[:1]  # a list of selectors; sliced to [:1] for easier debugging
        for post_node in post_nodes:
            image_url = post_node.css('.entry_summary a img::attr(src)').extract_first("")  # cover image url (src, not href)
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail
                          )  # urljoin keeps the code simple and handles relative urls
        '''
        Extract the next page and hand it to scrapy to download:
        next_url = response.css("div.pager a:last-child::text").extract_first("")
        if next_url == "Next >":
            next_url = response.css("div.pager a:last-child::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
        or:
        next_url = response.xpath("//a[contains(text(), 'Next >')]/@href").extract_first("")
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
        '''
    def parse_detail(self, response):
        '''Parse a downloaded detail page.'''
        match_id = re.match(r".*?(\d+)", response.url)  # extract the article id with a regex; this pattern is not very robust
        if match_id:
            article_item = JobBoleArticleItem()
            # xpath versions, with the css equivalents kept as comments
            # title = response.css("#news_title a::text").extract_first("")
            title = response.xpath("//*[@id='news_title']//a/text()").extract_first("")
            # create_date = response.css("#news_info .time::text").extract_first("")
            create_date = response.xpath("//*[@id='news_info']//*[@class='time']/text()").extract_first("")
            match_re = re.match(r".*?(\d+.*)", create_date)
            if match_re:
                create_date = match_re.group(1)
            # content = response.css("#news_content").extract()[0]  # usually extract the html block that contains the article
            content = response.xpath("//*[@id='news_content']").extract()[0]
            # tag_list = response.css(".news_tags a::text").extract()
            tag_list = response.xpath("//*[@class='news_tags']//a/text()").extract()
            tags = ", ".join(tag_list)
            post_id = match_id.group(1)
            article_item['url'] = response.url
            article_item['url_object_id'] = get_md5(article_item['url'])
            article_item['title'] = title
            article_item['create_date'] = create_date
            article_item['content'] = content
            article_item['tags'] = tags
            article_item['front_image_url'] = response.meta.get("front_image_url", "")  # without .get a missing key raises an exception
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={'article_item': article_item},
                          callback=self.parse_page_nums,)
    # everything is callback-based, even if that is a bit more cumbersome
    def parse_page_nums(self, response):
        article_item = response.meta.get('article_item', '')
        j_data = json.loads(response.text)
        praise_nums = j_data["DiggCount"]
        fav_nums = j_data["TotalView"]
        comment_nums = j_data["CommentCount"]
        print(j_data)
        article_item['praise_nums'] = praise_nums
        article_item['fav_nums'] = fav_nums
        article_item['comment_nums'] = comment_nums
        yield article_item  # both Item and Request can be yielded at any time; Items go through the pipelines.py logic
        print('stop')
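Note how article_item is threaded through the second Request's meta and recovered in parse_page_nums: this is the standard way to fill in one item across several requests.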
pipelines.py
Set a breakpoint on the last line, run the main.py configured in the earlier posts, and you can watch execution jump into pipelines.py.
class ArticlespiderPipeline:
    def process_item(self, item, spider):
        return item
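The default pipeline just passes the item through. As a minimal sketch (not part of the original post) of what a real pipeline might do, here is one that appends each item to a JSON Lines file; it would need to be registered in ITEM_PIPELINES to run:

import codecs
import json

class JsonExportPipeline:
    # hypothetical pipeline: write each item as one JSON line
    def open_spider(self, spider):
        self.file = codecs.open('articles.jl', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # return the item so lower-priority pipelines still receive it

    def close_spider(self, spider):
        self.file.close()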