
Scrapy Crawling: Storing Data in MySQL


In this example we crawl the 5i5j (我爱我家) new-home listings at http://fang.5i5j.com/bj/loupan/t17n.

Create the MySQL database and the required table

DROP DATABASE IF EXISTS `cc`; /* `` is the backquote key at the top left of the keyboard, not the single quote ' */
CREATE DATABASE `cc`
    default character set=utf8;
use `cc`;
DROP table IF EXISTS Home;
create table Home
(Id int(4) PRIMARY KEY auto_increment,  # Id auto-increments
Name1 Varchar(50),
location Varchar(100),
averagePrice Varchar(20),
broker Varchar(20)
)auto_increment=1;  # auto-increment starts at 1
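To confirm the database and table were created, a quick check with pymysql works. This is a minimal sketch, assuming MySQL is running locally with the credentials used throughout this post:

import pymysql

# sanity check: list the columns of cc.Home
conn = pymysql.connect(host="127.0.0.1", port=3306, db="cc",
                       user="root", password="密码")  # replace with your password
with conn.cursor() as cur:
    cur.execute("DESCRIBE Home")
    for column in cur.fetchall():
        print(column)  # (Field, Type, Null, Key, Default, Extra)
conn.close()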

The spider file

import scrapy
import logging  # logging module
from doubanFilm.items import DoubanfilmItem

logger = logging.getLogger(__name__)

## 5i5j crawl - MySQL storage
class LovehomeSpider(scrapy.Spider):
    name = 'lovehome'
    allowed_domains = ['fang.5i5j.com']
    # pagination: generates t17n0 .. t17n12
    start_urls = ['http://fang.5i5j.com/bj/loupan/t17n' + str(i) for i in range(13)]

    def parse(self, response):  # follow each listing to its detail page
        for row in response.xpath('/html/body/div[6]/div[1]/ul[1]/li/div[2]/div[1]/a/@href').extract():  # link from every li
            url = 'https://fang.5i5j.com' + row
            yield scrapy.Request(url, callback=self.parseHome)

    def parseHome(self, response):
        item = DoubanfilmItem()  # instantiate the item
        item["Name1"] = response.xpath('/html/body/div[5]/div[1]/div[1]/h1/text()').get()  # name
        item["location"] = response.xpath('/html/body/div[5]/div[2]/ul/li[5]/span[1]/text()').get()  # location
        item["averagePrice"] = response.xpath('string(/html/body/div[5]/div[2]/ul/li[1]/span[1])').get()  # average price
        item["broker"] = response.xpath('/html/body/div[5]/div[2]/ul/li[7]/span[1]/text()').get()  # broker
        logger.warning(item)  # WARNING level so it shows under LOG_LEVEL="WARNING"
        yield item
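These absolute XPaths are brittle: if 5i5j changes its page layout, .get() silently returns None. Before running the full crawl you can verify them interactively with scrapy shell (the page number 1 below is just an example):

scrapy shell "https://fang.5i5j.com/bj/loupan/t17n1"
>>> response.xpath('/html/body/div[6]/div[1]/ul[1]/li/div[2]/div[1]/a/@href').extract()
# should print a list of detail-page paths; an empty list means the XPath no longer matches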

items.py

class DoubanfilmItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    ## 5i5j crawl - MySQL storage
    Name1 = scrapy.Field()
    location = scrapy.Field()
    averagePrice = scrapy.Field()
    broker = scrapy.Field()
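Scrapy items behave like dicts, which is why the spider assigns fields with item["Name1"] = .... A quick illustration with hypothetical values:

item = DoubanfilmItem()
item["Name1"] = "Example Estate"  # fields are set and read like dict keys
print(item.get("location"))       # None: not assigned yet
print(dict(item))                 # {'Name1': 'Example Estate'}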

pipelines.py

import pymysql
class DoubanfilmPipeline():
    def process_item(self, item, spider):
        return item
## 5i5j crawl - MySQL storage
class MySQLPipeline:
    # called once before crawling starts:
    # read the settings, initialise the connection and the cursor
    def open_spider(self, spider):
        host = spider.settings.get("MYSQL_DB_HOST", "127.0.0.1")  # server IP
        port = spider.settings.get("MYSQL_DB_PORT", 3306)  # port
        dbname = spider.settings.get("MYSQL_DB_NAME", "cc")  # database name
        user = spider.settings.get("MYSQL_DB_USER", "root")  # username
        pwd = spider.settings.get("MYSQL_DB_PASSWORD", "密码")  # password placeholder
        self.db_conn = pymysql.connect(host=host, port=port,
                                       db=dbname, user=user, password=pwd)
        self.db_cur = self.db_conn.cursor()

    # called once per parsed item: insert the row
    def process_item(self, item, spider):
        values=(item["Name1"],
                item["location"],
                item["averagePrice"],
                item["broker"],)
        sql="insert into Home(Name1,location,averagePrice,broker) values(%s,%s,%s,%s)"
        self.db_cur.execute(sql, values)
        return item
    # called after the whole crawl finishes:
    # commit the data and release the connection
    def close_spider(self, spider):
        self.db_conn.commit()
        self.db_cur.close()
        self.db_conn.close()
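Note that process_item above never commits: every row is written in one batch when close_spider runs, so a crash mid-crawl loses all the data. A safer drop-in replacement for process_item in MySQLPipeline commits each row and rolls back on failure. This is a sketch, not part of the original post:

    def process_item(self, item, spider):
        sql = ("insert into Home(Name1,location,averagePrice,broker) "
               "values(%s,%s,%s,%s)")
        try:
            self.db_cur.execute(sql, (item["Name1"], item["location"],
                                      item["averagePrice"], item["broker"]))
            self.db_conn.commit()  # persist this row immediately
        except pymysql.MySQLError:
            self.db_conn.rollback()  # discard the failed insert, keep crawling
            spider.logger.error("insert failed for item: %s", item.get("Name1"))
        return item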

settings.py

DOWNLOAD_DELAY = 2  # delay between requests, in seconds
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = True

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'  # request header

ROBOTSTXT_OBEY = False  # ignore robots.txt, otherwise many pages cannot be crawled
LOG_LEVEL = "WARNING"  # only show logs at WARNING level and above

MYSQL_DB_HOST = "127.0.0.1"  # server IP
MYSQL_DB_PORT = 3306  # port
MYSQL_DB_NAME = "cc"  # database name
MYSQL_DB_USER = "root"  # username
MYSQL_DB_PASSWORD = "密码"  # password placeholder

ITEM_PIPELINES = {
   'doubanFilm.pipelines.MySQLPipeline': 1  # priority 0-1000; lower values run earlier
}

Finally

Run the following in a terminal (lovehome is the spider name):

scrapy crawl lovehome 

Open MySQL and you can see the scraped rows.
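For a quick check without the MySQL client, the same pymysql connection can count and sample the rows. A minimal sketch using the settings from above:

import pymysql

conn = pymysql.connect(host="127.0.0.1", port=3306, db="cc",
                       user="root", password="密码")  # replace with your password
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM Home")
    print("rows:", cur.fetchone()[0])  # total number of scraped listings
    cur.execute("SELECT Id, Name1, averagePrice FROM Home LIMIT 5")
    for row in cur.fetchall():
        print(row)  # sample a few rows
conn.close()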
