欢迎您访问程序员文章站！本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

Scrapy 无ItemLoader items及spider格式

程序员文章站 2022-05-05 18:00:07
...

在爬虫页面有大量的数据清洗代码

spider.py

# -*- coding: utf-8 -*-
import scrapy
import re
import datetime

from ArticleSpider.items import JobBoleArticleItem

class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com list pages and extract article metadata.

    ``parse`` walks the post listing, scheduling one request per article
    (carrying the thumbnail URL in ``meta``) and following pagination.
    ``parse_article`` scrapes a single article page into a
    ``JobBoleArticleItem``.
    """
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # Compiled once at class-creation time; extracts the first run of
    # digits (e.g. "3" out of " 3 收藏"). Raw string avoids the invalid
    # escape-sequence warning the original '.*?(\d+).*' literal produced.
    _NUM_RE = re.compile(r'.*?(\d+).*')

    def parse(self, response):
        """Yield article-page requests plus a request for the next list page."""
        nodes = response.xpath('//div[@class="post floated-thumb"]/div[@class="post-thumb"]/a')
        for node in nodes:
            link = node.xpath('./@href').get()
            front_img_url = node.xpath('./img/@src').get()
            meta = {
                'front_img_url': response.urljoin(front_img_url)
            }
            yield response.follow(link, callback=self.parse_article, meta=meta)

        next_page = response.xpath('//a[contains(@class,"next")]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    @classmethod
    def _first_int(cls, text, default=0):
        """Return the first integer embedded in *text*, or *default*.

        Tolerates ``None`` and digit-free strings — the original code
        crashed with TypeError/ValueError when the xpath returned no node.
        """
        if not text:
            return default
        match = cls._NUM_RE.match(text)
        return int(match.group(1)) if match else default

    @staticmethod
    def _url_md5(url):
        """Hex MD5 digest of *url*.

        The original code called ``get_md5`` which was never imported in
        this file (NameError at runtime); this inlines the standard
        implementation. Assumes the project's get_md5 produced a plain
        hexdigest of the UTF-8 bytes — TODO confirm against the project's
        utils module.
        """
        import hashlib
        if isinstance(url, str):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()

    def parse_article(self, response):
        """Scrape one article page into a JobBoleArticleItem."""
        front_img_url = [response.meta.get('front_img_url', '')]
        title = response.xpath('//h1/text()').get()
        content = response.css('.entry').get()
        # Guard against a missing meta paragraph; original crashed on
        # `.strip()` of None.
        raw_meta = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').get() or ''
        pub_date = raw_meta.strip().split(' ')[0]
        tags = ','.join(response.xpath('//p[@class="entry-meta-hide-on-mobile"]//a[not(contains(@href,"comment"))]/text()').getall())
        # All three counters may be absent or wrapped in text such as
        # " 3 收藏"; _first_int handles both, defaulting to 0.
        fav_num = self._first_int(response.xpath('//h10/text()').get())
        book_num = self._first_int(response.xpath('//span[contains(@class,"bookmark-btn")]/text()').get())
        comment_num = self._first_int(response.xpath('//a[@href="#article-comment"]/span/text()').get())

        article_item = JobBoleArticleItem()
        article_item['title'] = title
        article_item['front_img_url'] = front_img_url
        article_item['url'] = response.url
        article_item['content'] = content
        try:
            pub_date = datetime.datetime.strptime(pub_date, '%Y/%m/%d').date()
        except Exception:
            # Date missing or malformed: fall back to today, as the
            # original did.
            pub_date = datetime.datetime.now().date()
        article_item['pub_date'] = pub_date
        article_item['tags'] = tags
        article_item['fav_num'] = fav_num
        article_item['book_num'] = book_num
        article_item['comment_num'] = comment_num
        article_item['url_md5_id'] = self._url_md5(response.url)
        yield article_item

items.py

import scrapy
class JobBoleArticleItem(scrapy.Item):
    """Scraped fields for one Jobbole article.

    Fields: title, pub_date, content, url, url_md5_id, front_img_url,
    front_img_path, tags, fav_num, book_num, comment_num.

    Note: the original listing had this docstring at column 0, which is
    an IndentationError inside a class body; it must be indented.
    """
    title = scrapy.Field()
    pub_date = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
    # MD5 hexdigest of `url`, used as a stable primary key.
    url_md5_id = scrapy.Field()
    front_img_url = scrapy.Field()
    # Local filesystem path of the downloaded thumbnail.
    front_img_path = scrapy.Field()
    tags = scrapy.Field()
    fav_num = scrapy.Field()
    book_num = scrapy.Field()
    comment_num = scrapy.Field()

转载于:https://www.jianshu.com/p/606c9c011a3f