Scrapy爬虫框架-Pipeline保存数据
程序员文章站
2022-03-02 22:45:56
...
Pipeline:
class ToscrapPipeline(object):
    """Item pipeline that appends each item's 'text' field to quotes.txt."""

    def process_item(self, item, spider):
        """Persist one item.

        item: the item returned by the spider; spider: the spider instance.
        Must return the item so later pipelines keep receiving it.
        """
        data = item['text']
        # Append mode so successive items accumulate across the crawl.
        with open('quotes.txt', 'a', encoding='utf-8') as f:
            f.write(data + '\n')
        return item  # process_item must always return the item
class ToscrapPipeline2(object):
    """Second demo pipeline: appends the 'text' field of each item to quotes2.txt."""

    def process_item(self, item, spider):
        """Write one item's text to quotes2.txt, then pass the item along."""
        text = item['text']
        line = text + '\n'
        with open('quotes2.txt', 'a', encoding='utf-8') as out:
            out.write(line)
        return item
'''保存到MongoDB'''
'''setting.py添加两个变量:'''
''' MONGO_URI = 'localhost' '''
''' MONGO_DB = 'images360' '''
import pymongo
class MongoPipeline(object):
    """Store items in MongoDB.

    Requires two settings in settings.py:
        MONGO_URI = 'localhost'
        MONGO_DB = 'images360'
    """

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this factory with the crawler so the pipeline can
        # read its configuration from the project settings.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # One client per spider run; the database handle is reused per item.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Collection.insert() was deprecated and removed in PyMongo 4;
        # insert_one() is the supported API for a single document.
        # NOTE(review): assumes the item class defines a `collection`
        # attribute naming the target collection — confirm in items.py.
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
'''保存到MySQL'''
'''setting.py添加五个变量:'''
''' MYSQL_HOST = 'localhost' '''
''' MYSQL_DATABASE = 'images360' '''
''' MYSQL_PORT = 3306 '''
''' MYSQL_USER = 'root' '''
''' MYSQL_PASSWORD = '123456' '''
import pymysql
class MysqlPipeline():
    """Store items in MySQL.

    Requires five settings in settings.py:
        MYSQL_HOST = 'localhost'
        MYSQL_DATABASE = 'images360'
        MYSQL_PORT = 3306
        MYSQL_USER = 'root'
        MYSQL_PASSWORD = '123456'
    """

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: pull connection parameters from settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # PyMySQL 1.0 removed positional connect() arguments, so every
        # parameter must be passed by keyword.
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build "insert into <table> (k1,k2) values(%s,%s)" from the item's
        # fields.  Values go through parameter binding; only the table and
        # column names (which come from the item definition, not user input)
        # are interpolated directly.
        # NOTE(review): assumes the item class defines a `table` attribute —
        # confirm in items.py.
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values(%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
'''保存到Redis'''
import redis
class ToscrapPipelineRedis(object):
    """Push each item's 'text' field onto a Redis list named 'scrapyw'."""

    def __init__(self):
        # Connect to the local Redis instance, logical database 1.
        self.redisCli = redis.StrictRedis(host='127.0.0.1', port=6379, db=1)

    def process_item(self, item, spider):
        """LPUSH the item's text and hand the item to the next pipeline."""
        text = item['text']
        self.redisCli.lpush('scrapyw', text)
        return item
管道文件:
# In settings.py, uncomment/enable the ITEM_PIPELINES dict.
# Key format: '<project>.<module>.<pipeline class>'.
# The value is the priority weight — lower numbers run first.
ITEM_PIPELINES = {
    'toscrap.pipelines.ToscrapPipeline': 200,
    'toscrap.pipelines.ToscrapPipeline2': 300,
    'toscrap.pipelines.MysqlPipeline': 400,
    'toscrap.pipelines.MongoPipeline': 500,
    'toscrap.pipelines.ToscrapPipelineRedis': 600,
}
有问题请联系博主:
微信:hrvrap
qq:2580419087
推荐阅读
-
Laravel框架使用monolog_mysql实现将系统日志信息保存到mysql数据库的方法
-
使用Python的Scrapy框架编写web爬虫的简单示例
-
爬虫(十四):Scrapy框架(一) 初识Scrapy、第一个案例
-
爬虫之scrapy框架
-
Python爬虫入门教程 31-100 36氪(36kr)数据抓取 scrapy
-
scrapy自定义pipeline类实现将采集数据保存到mongodb的方法
-
Python爬虫框架Scrapy基本用法入门教程
-
爬虫(十六):Scrapy框架(三) Spider Middleware、Item Pipeline、对接Selenium
-
爬虫框架-Scrapy
-
python爬虫框架scrapy实现模拟登录操作示例