欢迎您访问程序员文章站！本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

Scrapy 有ItemLoader | items及spider格式

程序员文章站 2022-05-05 15:10:07
...

使用 ItemLoader 后，爬虫代码中不用再写清洗逻辑：
在 items.py 中定义一个 xxxLoader 类，继承 ItemLoader，
并把清洗逻辑（input/output processor）写在 items.py 中。

# -*- coding: utf-8 -*-
import scrapy


from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5
from scrapy.loader import ItemLoader

class JobboleSpider(scrapy.Spider):
    """Crawl the jobbole all-posts listing and extract every article.

    The listing page (``parse``) yields one request per article plus a
    request for the next listing page; the detail page (``parse_article``)
    fills a JobBoleArticleItem through the custom ArticleItemLoader, so no
    cleaning logic lives in the spider itself.
    """
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Each thumbnail anchor carries both the detail-page link and the
        # cover-image URL, so extract them from the same node.
        post_nodes = response.xpath(
            '//div[@class="post floated-thumb"]/div[@class="post-thumb"]/a'
        )
        for post_node in post_nodes:
            detail_url = post_node.xpath('./@href').get()
            cover_src = post_node.xpath('./img/@src').get()
            # Pass the (absolutized) cover URL along via request meta.
            yield response.follow(
                detail_url,
                callback=self.parse_article,
                meta={'front_img_url': response.urljoin(cover_src)},
            )

        # Follow pagination until there is no "next" link left.
        next_page = response.xpath('//a[contains(@class,"next")]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_article(self, response):
        """Populate one JobBoleArticleItem from an article detail page."""
        # Wrapped in a list because the image pipeline expects a list of URLs.
        cover_urls = [response.meta.get('front_img_url', '')]

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_xpath('title', '//h1/text()')
        loader.add_css('content', '.entry')
        loader.add_xpath('pub_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_xpath('tags', '//p[@class="entry-meta-hide-on-mobile"]//a[not(contains(@href,"comment"))]/text()')
        loader.add_xpath('fav_num', '//h10/text()')
        loader.add_xpath('book_num', '//span[contains(@class,"bookmark-btn")]/text()')
        loader.add_xpath('comment_num', '//a[@href="#article-comment"]/span/text()')
        loader.add_value('url_md5_id', get_md5(response.url))
        loader.add_value('front_img_url', cover_urls)
        loader.add_value('url', response.url)
        yield loader.load_item()

items.py

# -*- coding: utf-8 -*-

import scrapy
import datetime
import re

def date_convert(value):
    """Parse a 'YYYY/MM/DD' string into a ``datetime.date``.

    Falls back to today's date when the value cannot be parsed (wrong
    format, surrounding junk text, or a non-string input).
    """
    try:
        return datetime.datetime.strptime(value, '%Y/%m/%d').date()
    except (ValueError, TypeError):
        # ValueError: string does not match the format;
        # TypeError: value is not a string. Narrower than the original
        # bare `except Exception`, which would also hide real bugs.
        return datetime.datetime.now().date()


def get_nums(value):
    """Extract the first run of digits from *value* as an int; 0 if none.

    Used to pull counts out of scraped text such as '2 评论' or ' 收藏 3 '.
    """
    # Raw string avoids the invalid-escape warning the original '\d+'
    # pattern triggers on modern Python; the leading '.*?' / trailing '.*'
    # were redundant with re.search.
    match = re.search(r'\d+', value)
    return int(match.group()) if match else 0

'''这样做就会返回原来的值 原来的值是一个列表 不会使用TakeFirst来拿第一个数据'''
def return_value(value):
    """Identity processor: return *value* unchanged.

    Plugged in as ``output_processor=MapCompose(return_value)`` so a field
    keeps its full list value instead of being collapsed to the first
    element by the loader-wide ``TakeFirst`` default output processor.
    """
    return value

from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
'''自定义Loader 并在spider中使用'''
class ArticleItemLoader(ItemLoader):
    """Custom ItemLoader used by the spiders.

    Overrides the default output processor so that, unless a field says
    otherwise, loading yields the first extracted value rather than the
    raw list of matches.
    """
    default_output_processor = TakeFirst()

class JobBoleArticleItem(scrapy.Item):
    """Item holding one jobbole article.

    Fields: title, pub_date, content, url, url_md5_id, front_img_url,
    front_img_path, tags, fav_num, book_num, comment_num.
    """
    title = scrapy.Field()
    pub_date = scrapy.Field(
        # Parse the 'YYYY/MM/DD' meta text into a date object.
        input_processor=MapCompose(date_convert),
    )
    content = scrapy.Field()
    url = scrapy.Field()
    # MD5 digest of the URL, used as a stable unique id.
    url_md5_id = scrapy.Field()
    front_img_url = scrapy.Field(
        # Keep the full list of URLs: MapCompose(return_value) overrides the
        # loader-wide TakeFirst output processor (the image pipeline needs a
        # list). BUG FIX: the original passed a bare '''...''' string as a
        # positional argument to Field(), which raises at import time —
        # Field is a dict subclass and cannot be built from a plain string.
        output_processor=MapCompose(return_value),
    )
    # Local filesystem path of the downloaded cover image.
    front_img_path = scrapy.Field()
    tags = scrapy.Field(
        # Merge the extracted tag strings into one comma-separated value.
        output_processor=Join(','),
    )
    fav_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    book_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )

input_processor 接收的是该字段提取出来的值列表，MapCompose 会对列表中的每个元素依次运行传入的各个函数。

def remove_comment_tags(value):
    """Drop comment-count entries (e.g. '2 评论') from extracted tag values.

    Returns None for comment entries so that MapCompose removes the value
    entirely. The original returned '' instead, and that empty string
    survived to the Join(',') output processor, producing stray empty
    segments such as 'python,,linux'.
    """
    if "评论" in value:
        return None
    return value

# Example: the tags field wired with both processors — each extracted
# value is cleaned by remove_comment_tags, then Join(",") merges the
# remaining tags into a single comma-separated string.
tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )

转载于:https://www.jianshu.com/p/076c17ca0609