Build a Distributed Crawler in 21 Days - Scrapy Framework (7)
程序员文章站
2022-06-30 11:55:44
7.1. Qiushibaike
Installation
On Windows, install pypiwin32 and a pre-built Twisted wheel that matches your Python build (the cp36 / win_amd64 in the filename below means CPython 3.6, 64-bit) before installing Scrapy itself:

pip install pypiwin32
pip install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
pip install scrapy
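As a quick sanity check (not part of the original steps), you can confirm the install succeeded before creating the project:

scrapy version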
Create and run the project

scrapy startproject qsbk                          # create the project
scrapy genspider qsbk_spider "qiushibaike.com"    # create the spider
scrapy crawl qsbk_spider                          # run the spider
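For reference, scrapy startproject qsbk generates roughly the following layout (a sketch of the default template; minor files such as middlewares.py vary by Scrapy version, and qsbk_spider.py only appears after running genspider):

qsbk/
    scrapy.cfg                # deployment configuration
    qsbk/
        __init__.py
        items.py              # item definitions (QsbkItem below)
        middlewares.py
        pipelines.py          # item pipelines (export logic below)
        settings.py           # project settings
        spiders/
            __init__.py
            qsbk_spider.py    # created by scrapy genspider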
Code
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        duanzidivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanzidivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        # follow the "next page" link and crawl the remaining pages
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
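Before running the full crawl, the XPath expressions above can be tried out interactively in the Scrapy shell. A rough sketch, assuming the page still matches the structure used in the spider:

scrapy shell "https://www.qiushibaike.com/8hr/page/1/"
>>> divs = response.xpath("//div[@id='content-left']/div")     # one selector per joke
>>> divs[0].xpath(".//h2/text()").get()                          # author of the first joke
>>> response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()   # next-page href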
items.py

import scrapy


class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import json

# 1. Manually convert each item dict to JSON
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('spider started')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('spider finished')

# 2. Use JsonItemExporter, suitable when the amount of data is small
# from scrapy.exporters import JsonItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('spider started')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('spider finished')

# 3. Use JsonLinesItemExporter, suitable when the amount of data is large
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('spider finished')
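JsonLinesItemExporter writes each item as a standalone JSON object on its own line, so the output file stays valid line by line and needs no surrounding JSON array, which is why it suits large crawls. A minimal sketch of reading the exported duanzi.json back, assuming the pipeline above has already run:

import json

with open('duanzi.json', encoding='utf-8') as fp:
    for line in fp:
        item = json.loads(line)              # one item per line
        print(item['author'], item['content'][:20])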
settings.py
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
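The value 300 is the pipeline's priority: Scrapy runs enabled pipelines in ascending order of this number, conventionally in the 0-1000 range. If a second pipeline were added later, for example a hypothetical MongoDB writer (not part of this project), the ordering would look like this:

ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,    # runs first (lower number)
    'qsbk.pipelines.MongoPipeline': 400,   # hypothetical second pipeline, runs after
}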
start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl qsbk_spider".split())
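start.py simply invokes the scrapy command so the crawl can be launched from an IDE instead of the terminal. An equivalent approach, sketched here only as an alternative and not part of the original code, is Scrapy's CrawlerProcess API:

# Alternative sketch: run the spider through Scrapy's Python API
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # loads the project's settings.py
process.crawl('qsbk_spider')                        # spider name, not the class
process.start()                                     # blocks until the crawl finishes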