Saving data with pipelines in a Scrapy crawler
Saving to MongoDB
import logging

import pymongo
from scrapy.exceptions import DropItem

logger = logging.getLogger(__name__)


class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from settings.py
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def process_item(self, item, spider):
        # Upsert the item into a collection named after its item class
        if spider.name == 'yg':
            name = item.__class__.__name__
            self.db[name].update_one(
                {'_id': item.get('_id')},
                {'$set': dict(item)},
                upsert=True
            )
            logger.warning('%s saved to MongoDB', item['title'])
            return item
        else:
            raise DropItem('not an item from the yg spider')

    def open_spider(self, spider):
        # Open the connection and select the database when the spider starts
        if spider.name == 'yg':
            self.client = pymongo.MongoClient(self.mongo_url)
            self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        if spider.name == 'yg':
            self.client.close()
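For from_crawler to find MONGO_URL and MONGO_DB, they must be defined in settings.py, and the pipeline itself has to be enabled in ITEM_PIPELINES. A minimal sketch, assuming a hypothetical project package named myproject and a local MongoDB instance (both names are placeholders, not from the original code):

# settings.py -- "myproject" and the database name are placeholder assumptions
MONGO_URL = 'mongodb://localhost:27017'
MONGO_DB = 'scrapy_data'

ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}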
Saving to a file
import json


class JsonWriterPipeline(object):
    def open_spider(self, spider):
        # Open the output file; the path comes from the SAVE_FILE setting
        self.file = open(spider.settings.get('SAVE_FILE', './temp.json'), 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # Serialize the item dict as one JSON object per line (JSON Lines)
        line = json.dumps(dict(item)) + '\n'
        self.file.write(line)
        return item
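This pipeline is enabled the same way, by adding it to ITEM_PIPELINES (lower priority numbers run first when several pipelines are active) and optionally setting SAVE_FILE. Because the output is JSON Lines, one object per line, it can be loaded back with a plain loop. A minimal sketch, assuming the default ./temp.json path:

import json

# Load every item written by JsonWriterPipeline (one JSON object per line)
with open('./temp.json') as f:
    items = [json.loads(line) for line in f]

print(len(items), 'items loaded')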