
Scrapy: Common Pipelines


Saving to a JSON file

import codecs
import json


class JsonEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open("job_info.json", 'w', encoding='utf-8')
        self.file.write('[')
        self.first_item = True

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese and other non-ASCII text readable
        # instead of escaping it.
        line = json.dumps(dict(item), ensure_ascii=False)
        # Write the separator before every item after the first, so the file
        # stays valid JSON (no trailing comma before the closing ']').
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # close_spider() is called by Scrapy when the spider finishes;
        # close the JSON array and the file here.
        self.file.write(']')
        self.file.close()

Using Scrapy's built-in JsonItemExporter

from scrapy.exporters import JsonItemExporter


class JsonExporterPipeline(object):
    def __init__(self):
        # JsonItemExporter expects a file opened in binary mode.
        self.file = open('job_info1.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # The exporter writes the closing bracket; then release the file.
        self.exporter.finish_exporting()
        self.file.close()
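
To turn either JSON pipeline on, register it in ITEM_PIPELINES in settings.py. A minimal sketch, assuming the project package is named myproject (replace the dotted path with your own project's):

# settings.py -- the module path "myproject.pipelines" is an assumption;
# use your project's actual package name. Lower numbers run earlier.
ITEM_PIPELINES = {
    'myproject.pipelines.JsonExporterPipeline': 300,
}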

Storing to MySQL

Asynchronous inserts based on the Twisted framework

Scrapy itself runs on the Twisted event loop, so a blocking MySQL insert inside process_item would stall the whole crawl. twisted.enterprise.adbapi.ConnectionPool runs each database operation in a thread pool and hands back a Deferred, keeping the reactor free.

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # Twisted's asynchronous connection pool, backed by the MySQLdb driver.
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction runs the insert in a thread pool, turning the
        # blocking MySQL call into an asynchronous one.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Attach an errback to report any failure from the insert.
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
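        # Runs inside a transaction managed by adbapi: committed on success,
        # rolled back if this function raises.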

        insert_sql = """
        insert into job_info(
                company_name,
                company_addr,
                company_info,
                job_name,
                job_addr,
                job_info,
                salary_min,
                salary_max,
                url,
                time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """

        cursor.execute(insert_sql, (
            item['company_name'],
            item['company_addr'],
            item['company_info'],
            item['job_name'],
            item['job_addr'],
            item['job_info'],
            item['salary_min'],
            item['salary_max'],
            item['url'],
            item['time'],
        ))
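
For reference, here is a minimal sketch of the Item these pipelines expect. The field names are taken from the insert statement above; the class name JobInfoItem itself is an assumption:

import scrapy


class JobInfoItem(scrapy.Item):
    # Fields mirror the columns of the job_info table; the class name
    # is hypothetical.
    company_name = scrapy.Field()
    company_addr = scrapy.Field()
    company_info = scrapy.Field()
    job_name = scrapy.Field()
    job_addr = scrapy.Field()
    job_info = scrapy.Field()
    salary_min = scrapy.Field()
    salary_max = scrapy.Field()
    url = scrapy.Field()
    time = scrapy.Field()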

Configuration in settings.py

MysqlTwistedPipeline reads the connection parameters below through from_settings(); remember to add the pipeline itself to ITEM_PIPELINES as well.

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'crawler'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'xxxxx'
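
The insert statement implies a job_info table roughly like the following. This schema is reconstructed from the column list alone; the column types and lengths are assumptions, not taken from the original article:

-- Hypothetical schema matching insert_sql above; all types are assumptions.
CREATE TABLE job_info (
    company_name VARCHAR(255),
    company_addr VARCHAR(255),
    company_info TEXT,
    job_name VARCHAR(255),
    job_addr VARCHAR(255),
    job_info TEXT,
    salary_min VARCHAR(32),
    salary_max VARCHAR(32),
    url VARCHAR(512),
    time VARCHAR(64)
) DEFAULT CHARSET=utf8;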