Scrapy 有ItemLoader | items及spider格式
程序员文章站
2022-05-05 15:10:07
...
使用 ItemLoader 后，爬虫（spider）代码中不必再写数据清洗逻辑：
在 items.py 中定义一个继承 ItemLoader 的 xxxLoader，
并把清洗逻辑写在 items.py 各字段的 processor 中。
# -*- coding: utf-8 -*-
import scrapy
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5
from scrapy.loader import ItemLoader
class JobboleSpider(scrapy.Spider):
    """Crawl jobbole list pages and parse each article with an ItemLoader.

    All field cleaning lives in the per-field processors in items.py, so
    this spider only selects nodes and feeds raw values into the loader.
    """
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Yield a request per article (carrying its cover-image URL) and follow pagination."""
        post_nodes = response.xpath('//div[@class="post floated-thumb"]/div[@class="post-thumb"]/a')
        for post_node in post_nodes:
            article_url = post_node.xpath('./@href').get()
            cover_url = post_node.xpath('./img/@src').get()
            # Hand the absolute cover-image URL to the article callback via meta.
            yield response.follow(
                article_url,
                callback=self.parse_article,
                meta={'front_img_url': response.urljoin(cover_url)},
            )
        # Follow the "next page" link when present.
        next_href = response.xpath('//a[contains(@class,"next")]/@href').get()
        if next_href is not None:
            yield response.follow(next_href, self.parse)

    def parse_article(self, response):
        """Populate a JobBoleArticleItem through ArticleItemLoader and yield it."""
        cover_urls = [response.meta.get('front_img_url', '')]
        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath('title', '//h1/text()')
        loader.add_css('content', '.entry')
        loader.add_xpath('pub_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_xpath('tags', '//p[@class="entry-meta-hide-on-mobile"]//a[not(contains(@href,"comment"))]/text()')
        loader.add_xpath('fav_num', '//h10/text()')
        loader.add_xpath('book_num', '//span[contains(@class,"bookmark-btn")]/text()')
        loader.add_xpath('comment_num', '//a[@href="#article-comment"]/span/text()')
        loader.add_value('url_md5_id', get_md5(response.url))
        loader.add_value('front_img_url', cover_urls)
        loader.add_value('url', response.url)
        yield loader.load_item()
以下是 items.py 的内容：
# -*- coding: utf-8 -*-
import scrapy
import datetime
import re
def date_convert(value):
    """Parse a 'YYYY/MM/DD' date string into a ``datetime.date``.

    Falls back to today's date when *value* is malformed or not a string,
    so a dirty page never breaks the item pipeline.
    """
    try:
        return datetime.datetime.strptime(value, '%Y/%m/%d').date()
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: value is not a string.
        # (Original caught bare Exception, which would also hide real bugs.)
        return datetime.datetime.now().date()
# Compiled once at import time; matches the first run of digits in a string.
_NUM_PATTERN = re.compile(r'\d+')


def get_nums(value):
    """Extract the first integer embedded in *value* (e.g. '收藏 8' -> 8).

    Returns 0 when the string contains no digits.
    """
    # Original used a non-raw pattern ('.*?(\\d+).*') recompiled on every
    # call; a hoisted, raw-string r'\d+' search is equivalent and cleaner.
    match = _NUM_PATTERN.search(value)
    return int(match.group(0)) if match else 0
def return_value(value):
    """Identity output processor.

    Assigning this as a field's ``output_processor`` overrides the loader's
    default ``TakeFirst``, so the field keeps the full extracted list
    instead of collapsing to its first element.
    """
    return value
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
class ArticleItemLoader(ItemLoader):
    """Custom loader used by the spiders.

    Overrides the default output processor so every field yields the first
    element of its extracted list rather than the list itself.
    """
    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """One jobbole article.

    Fields: title, pub_date, content, url, url_md5_id, front_img_url,
    front_img_path, tags, fav_num, book_num, comment_num.
    All cleaning lives in the per-field input/output processors so the
    spider stays free of parsing code.
    """
    title = scrapy.Field()
    # Parsed from 'YYYY/MM/DD'; falls back to today's date on bad input.
    pub_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    content = scrapy.Field()
    url = scrapy.Field()
    # MD5 of the article URL, used as a stable unique id.
    url_md5_id = scrapy.Field()
    # BUGFIX: the original put a bare string literal inside Field(...) with
    # no comma before output_processor= — a SyntaxError. The identity
    # processor keeps the value as a list (the images pipeline expects a
    # list of URLs) instead of TakeFirst's single scalar.
    front_img_url = scrapy.Field(
        output_processor=MapCompose(return_value),
    )
    # Local path where the images pipeline stored the cover image.
    front_img_path = scrapy.Field()
    # Tag list is joined into one comma-separated string on output.
    tags = scrapy.Field(
        output_processor=Join(','),
    )
    fav_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    book_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
input_processor 接收到的值是一个列表；MapCompose 会把其中的每个函数依次作用到该列表的每一个元素上。
def remove_comment_tags(value):
    """Drop the comment-count entry ("N 评论") scraped alongside real tags."""
    return "" if "评论" in value else value
# Alternative `tags` field definition: strip comment-count entries on
# input, then join the remaining tags into one comma-separated string.
tags = scrapy.Field(
input_processor=MapCompose(remove_comment_tags),
output_processor=Join(",")
)
转载于:https://www.jianshu.com/p/076c17ca0609
上一篇: 极简Scrapy爬虫4:items包装
下一篇: 攒了10条经得起考验的道理