Scrapy 无ItemLoader items及spider格式
程序员文章站
2022-05-05 18:00:07
...
在爬虫页面有大量的数据清洗代码
spider.py
# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from ArticleSpider.items import JobBoleArticleItem
class JobboleSpider(scrapy.Spider):
    """Crawl the jobbole all-posts listing and parse each article page.

    This version keeps all data cleaning in the spider (no ItemLoader),
    but hoists the repeated regex into a compiled class attribute and
    guards every extraction that can legitimately come back empty.
    """
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # Compiled once at class creation; extracts the first run of digits
    # from strings such as " 2 收藏". Raw string avoids the invalid
    # escape-sequence DeprecationWarning of the original '.*?(\d+).*'.
    _num_pattern = re.compile(r'.*?(\d+).*')

    def parse(self, response):
        """Yield one request per article thumbnail link, then follow pagination.

        :param response: listing-page response.
        """
        nodes = response.xpath(
            '//div[@class="post floated-thumb"]/div[@class="post-thumb"]/a')
        for node in nodes:
            link = node.xpath('./@href').get()
            front_img_url = node.xpath('./img/@src').get()
            # urljoin(None) raises TypeError, so fall back to '' when the
            # thumbnail has no src attribute.
            meta = {'front_img_url': response.urljoin(front_img_url or '')}
            if link:  # response.follow(None) would raise ValueError
                yield response.follow(link, callback=self.parse_article, meta=meta)
        next_page = response.xpath('//a[contains(@class,"next")]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    @classmethod
    def _extract_int(cls, text):
        """Return the first integer embedded in *text*, or 0 when absent.

        Centralizes the pattern the original repeated (and evaluated twice
        per value) for bookmark and comment counts; also applied to the
        favour count, which the original passed to int() unguarded.
        """
        match = cls._num_pattern.match(text or '')
        return int(match.group(1)) if match else 0

    def parse_article(self, response):
        """Clean one article detail page into a JobBoleArticleItem.

        :param response: article-page response; reads 'front_img_url'
            from response.meta (set by :meth:`parse`).
        """
        # Local import: the original called an undefined get_md5() helper
        # (never imported in this file). hashlib md5-hexdigest is the
        # conventional implementation of that helper — TODO confirm it
        # matches the project's ArticleSpider.utils version.
        import hashlib

        front_img_url = [response.meta.get('front_img_url', '')]
        title = response.xpath('//h1/text()').get()
        content = response.css('.entry').get()

        # The meta paragraph may be missing; the original chained
        # .get().strip() and crashed with AttributeError on None.
        raw_date = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()').get() or ''
        date_text = raw_date.strip().split(' ')[0]
        try:
            pub_date = datetime.datetime.strptime(date_text, '%Y/%m/%d').date()
        except ValueError:
            # Unparseable/empty date: fall back to today, as before.
            pub_date = datetime.datetime.now().date()

        tags = ','.join(response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]'
            '//a[not(contains(@href,"comment"))]/text()').getall())

        fav_num = self._extract_int(response.xpath('//h10/text()').get())
        book_num = self._extract_int(response.xpath(
            '//span[contains(@class,"bookmark-btn")]/text()').get())
        comment_num = self._extract_int(response.xpath(
            '//a[@href="#article-comment"]/span/text()').get())

        article_item = JobBoleArticleItem()
        article_item['title'] = title
        article_item['front_img_url'] = front_img_url
        article_item['url'] = response.url
        article_item['content'] = content
        article_item['pub_date'] = pub_date
        article_item['tags'] = tags
        article_item['fav_num'] = fav_num
        article_item['book_num'] = book_num
        article_item['comment_num'] = comment_num
        article_item['url_md5_id'] = hashlib.md5(
            response.url.encode('utf-8')).hexdigest()
        yield article_item
items.py
import scrapy
class JobBoleArticleItem(scrapy.Item):
    """One scraped jobbole article.

    Declared fields: title, pub_date, content, url, url_md5_id,
    front_img_url, front_img_path, tags, fav_num, book_num, comment_num.
    """

    # Core article content.
    title = scrapy.Field()
    content = scrapy.Field()
    pub_date = scrapy.Field()
    tags = scrapy.Field()

    # Source location and its stable identifier.
    url = scrapy.Field()
    url_md5_id = scrapy.Field()  # md5 hex digest of `url`

    # Cover image: remote URL plus the local path after download.
    front_img_url = scrapy.Field()
    front_img_path = scrapy.Field()

    # Engagement counters scraped from the page.
    fav_num = scrapy.Field()
    book_num = scrapy.Field()
    comment_num = scrapy.Field()
转载于:https://www.jianshu.com/p/606c9c011a3f
下一篇: C#多线程Task的用法