
Scrapy Crawler Framework: Saving Data with Pipelines


Pipelines: a pipeline class receives every item the spider yields through its process_item(self, item, spider) method, and must return the item so that the pipelines after it can keep processing it.

class ToscrapPipeline(object):
    def process_item(self, item, spider):
        # item: the item yielded by the spider; spider: the spider that produced it
        data = item['text']
        with open('quotes.txt', 'a', encoding='utf-8') as f:
            f.write(data + '\n')
        return item  # process_item must return the item

class ToscrapPipeline2(object):
    def process_item(self, item, spider):
        data = item['text']
        with open('quotes2.txt', 'a', encoding='utf-8') as f:
            f.write(data + '\n')
        return item
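
The file pipelines above assume an item with a 'text' field. A minimal sketch of such an item (the class name is an assumption, not taken from the original project):

import scrapy

class QuoteItem(scrapy.Item):
    # Hypothetical item: the pipelines above only read item['text'].
    text = scrapy.Field()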

# Saving to MongoDB
# Add two variables to settings.py:
#   MONGO_URI = 'localhost'
#   MONGO_DB = 'images360'
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db[item.collection].insert_one(dict(item))  # insert_one: Collection.insert() was removed in PyMongo 4
        return item

    def close_spider(self, spider):
        self.client.close()
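
MongoPipeline reads the collection name from item.collection, so the item class must define that attribute. A minimal sketch, assuming an item for the images360 project (the class and field names are assumptions):

import scrapy

class ImageItem(scrapy.Item):
    collection = 'images'  # assumed collection name read by MongoPipeline
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()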

# Saving to MySQL
# Add five variables to settings.py:
#   MYSQL_HOST = 'localhost'
#   MYSQL_DATABASE = 'images360'
#   MYSQL_PORT = 3306
#   MYSQL_USER = 'root'
#   MYSQL_PASSWORD = '123456'
import pymysql

class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values(%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
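
To make the dynamic INSERT concrete, here is a worked illustration of the SQL it builds (the table name 'images' and the field names are assumptions for the example; item.table supplies the real name):

# Hypothetical item data; assumes item.table == 'images'.
data = {'id': '1', 'url': 'http://example.com/1.jpg', 'title': 'demo'}
keys = ','.join(data.keys())            # -> 'id,url,title'
values = ','.join(['%s'] * len(data))   # -> '%s,%s,%s'
sql = 'insert into %s (%s) values(%s)' % ('images', keys, values)
# -> 'insert into images (id,url,title) values(%s,%s,%s)'
# cursor.execute(sql, tuple(data.values())) then binds the values safely.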

# Saving to Redis
import redis

class ToscrapPipelineRedis(object):
    def __init__(self):
        self.redisCli = redis.StrictRedis(
            host='127.0.0.1',
            port=6379,
            db=1
        )

    def process_item(self, item, spider):
        self.redisCli.lpush('scrapyw', item['text'])
        return item                                
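
A quick way to check what the Redis pipeline stored, assuming the same connection parameters as above:

import redis

cli = redis.StrictRedis(host='127.0.0.1', port=6379, db=1)
print(cli.llen('scrapyw'))           # number of stored texts
print(cli.lrange('scrapyw', 0, 4))   # five most recent entries (lpush prepends), returned as bytes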

Registering the pipelines:

# Enable ITEM_PIPELINES in settings.py (it is commented out by default):
ITEM_PIPELINES = {
    'toscrap.pipelines.ToscrapPipeline': 200,  # project.pipelines_module.class_name: priority (smaller runs first)
    'toscrap.pipelines.ToscrapPipeline2': 300,
    'toscrap.pipelines.MysqlPipeline': 400,
    'toscrap.pipelines.MongoPipeline': 500,
    'toscrap.pipelines.ToscrapPipelineRedis': 600,
}
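
Items flow through these pipelines in ascending order of the number (200 first, 600 last); each process_item must return the item, otherwise the pipelines with larger numbers never receive it.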
