scrapy中item和pipeline和yield和callback用法
程序员文章站
2023-12-30 15:16:40
...
配置
要将settings.py中的ITEM_PIPELINES = { 'mingyan.pipelines.MingyanPipeline': 300, }
的注释取消,这样pipeline才会被启用。
最好将ROBOTSTXT_OBEY设置为False。
代码
spider中的代码使用了item和pipeline
class itemSpider(scrapy.Spider):
    """Crawl the Sina military news roll pages, emitting one item per headline.

    Each listing page produces a ``MingyanItem`` (``link`` and ``title``) for
    every entry in the news list, then schedules the next listing page with
    ``parse`` as its callback.
    """

    name = 'itemSpider'
    start_urls = ['http://mil.news.sina.com.cn/roll/index.d.html?cid=57918']

    def parse(self, response):
        """Yield the headline items of one listing page, then the next-page request.

        Args:
            response: the downloaded listing page.

        Yields:
            ``MingyanItem`` objects, followed by at most one ``scrapy.Request``
            for the next page.
        """
        page_links = response.css('.sub_page li a::attr(href)').extract()

        # Scrape only when the pagination bar points into the roll section.
        # The check runs once per page: looping over every pagination link and
        # scraping inside that loop would emit each headline several times.
        in_roll_section = any(
            href.startswith('http://mil.news.sina.com.cn/roll/')
            for href in page_links
        )
        if in_roll_section:
            for row in response.css('.fixList .linkNews li'):
                # A fresh item per row: reusing one shared instance would make
                # every yielded item alias the same, later-overwritten object.
                item = MingyanItem()
                item['link'] = row.css('a::attr(href)').extract_first()
                item['title'] = row.css('a::text').extract_first()
                yield item

        next_page = response.css('.pagebox_next a::attr(href)').extract_first()
        if next_page is not None:
            print(next_page)
            # Yield the Request on its own; a callback may only yield items or
            # requests, so yielding an (item, Request) tuple is rejected and
            # the next page would never be crawled.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
item
import scrapy
class MingyanItem(scrapy.Item):
    """One scraped news entry; fields are read and written like dict keys."""

    # Headline text of the entry (stored under the key 'title').
    title = scrapy.Field()
    # URL of the entry (stored under the key 'link').
    link = scrapy.Field()
pipelines
spider中每yield一个item,Scrapy就会调用pipeline的process_item方法,用于存储信息
import os
import json
class MingyanPipeline(object):
    """Append every scraped item to a JSON-lines file, one object per line.

    The output file is opened once when the spider starts and closed when it
    finishes, instead of being reopened for every item. If ``process_item`` is
    called without ``open_spider`` having run, it falls back to opening and
    closing the file for that single item.
    """

    # Output path, resolved against the process working directory.
    file_name = './junshi.json'
    # Handle opened by open_spider(); None while no crawl is in progress.
    _file = None

    def open_spider(self, spider):
        """Open the output file in append mode when the spider starts."""
        # Report the resolved location once per crawl rather than once per item.
        print(os.path.abspath(self.file_name))
        self._file = open(self.file_name, 'a', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes; safe to call twice."""
        if self._file is not None:
            self._file.close()
            self._file = None

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False, sort_keys=False) + '\n'
        if self._file is not None:
            self._file.write(line)
        else:
            # No open handle (open_spider was not called): append and close
            # immediately so the line is on disk when this method returns.
            with open(self.file_name, 'a', encoding='utf-8') as f:
                f.write(line)
        return item
yield类似于return,但不会结束函数:使用yield后parse成为生成器,可以依次产出多个item或Request,交给Scrapy处理
callback是回调函数:Request对应的页面下载完成后,Scrapy会用得到的response调用它(例如callback=self.parse)
知识点:cls清空终端命令行