21天打造分布式爬虫-Crawl爬取小程序社区(八)
程序员文章站
2022-07-23 08:17:27
8.1.Crawl的用法实战 新建项目 wxapp_spider.py items.py pipelines.py settings.py start.py ......
8.1 CrawlSpider 的用法实战
新建项目
scrapy startproject wxapp
cd wxapp
scrapy genspider -t crawl wxapp_spider "wxapp-union.com"
wxapp_spider.py
# -*- coding: utf-8 -*-
"""Crawl spider for the wxapp-union.com article listing.

Starts from the first list page of category 2, follows list-page links,
and extracts one ``WxappItem`` per article page.
"""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    """Follow list pages and scrape title, author, date and body of articles."""

    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # List pages: no callback, only keep following the links they contain.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=\d'), follow=True),
        # Article pages: hand over to parse_detail and do not follow further.
        Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback="parse_detail", follow=False),
    )

    def parse_detail(self, response):
        """Build a ``WxappItem`` from a single article page.

        Args:
            response: The downloaded article page.

        Returns:
            A ``WxappItem`` with ``title``, ``author``, ``pub_time`` and
            ``content`` populated from the page's XPath matches.
        """
        title = response.xpath("//h1[@class='ph']/text()").get()

        # Author and publication time both live inside the same <p> element.
        author_p = response.xpath("//p[@class='authors']")
        author = author_p.xpath(".//a/text()").get()
        pub_time = author_p.xpath(".//span/text()").get()

        # Collect every text node under the article cell and glue them together.
        article_content = response.xpath("//td[@id='article_content']//text()").getall()
        content = "".join(article_content).strip()

        item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
        return item
items.py
# -*- coding: utf-8 -*-
"""Item definitions for the wxapp project."""
import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped article."""

    # Article headline.
    title = scrapy.Field()
    # Author name as shown on the article page.
    author = scrapy.Field()
    # Publication time string, stored exactly as scraped.
    pub_time = scrapy.Field()
    # Concatenated text of the article body.
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
"""Item pipeline that dumps every scraped item to ``wxapp.json``."""
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline(object):
    """Write items to ``wxapp.json``, one JSON object per line."""

    def __init__(self):
        """Open the output file and prepare the JSON Lines exporter."""
        # Binary mode: the exporter encodes to bytes itself (encoding='utf-8').
        self.fp = open('wxapp.json', 'wb')
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # Exporter protocol: signal the start before the first export_item().
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export ``item`` and pass it on unchanged.

        Args:
            item: The scraped item to serialise.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Always release the file handle, even if finalisation fails.
            self.fp.close()
settings.py
# Scrapy settings overridden for the wxapp project.

# Do not filter requests through the target site's robots.txt.
ROBOTSTXT_OBEY = False

# Delay between consecutive requests, to stay polite to the site.
DOWNLOAD_DELAY = 1

# Headers sent with every request; the User-Agent mimics a desktop Chrome.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}
start.py
# Launcher script: run the spider without typing the scrapy command by hand.
from scrapy import cmdline

# Equivalent to running `scrapy crawl wxapp_spider` in the project directory.
cmdline.execute("scrapy crawl wxapp_spider".split())
上一篇: 捕捉苍蝇新技能