Scrapy scraping: storing to MySQL
As a worked example, we scrape the 我爱我家 (5i5j) new-home listings at http://fang.5i5j.com/bj/loupan/t17n.
Create a database and the required table in MySQL
DROP DATABASE IF EXISTS `cc`; /* `` are backticks (the key at the far left of the number row), not single quotes */
CREATE DATABASE `cc`
default character set=utf8;
use `cc`;
DROP table IF EXISTS Home;
create table Home
(Id int(4) PRIMARY KEY auto_increment, # Id auto-increments
Name1 Varchar(50),
location Varchar(100),
averagePrice Varchar(20),
broker Varchar(20)
)auto_increment=1; # start counting from 1
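Before writing any Scrapy code, it is worth confirming that the table exists and has the right columns. A minimal check with pymysql (the host, user, and password here are assumptions; substitute your own):
import pymysql

# Connect to the new database and list the columns of Home.
# Connection parameters are assumptions; adjust them to your setup.
conn = pymysql.connect(host="127.0.0.1", port=3306, db="cc",
                       user="root", password="your_password", charset="utf8")
try:
    with conn.cursor() as cur:
        cur.execute("DESCRIBE Home")
        for column in cur.fetchall():
            print(column)  # (Field, Type, Null, Key, Default, Extra)
finally:
    conn.close()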
The spider file
import scrapy
import logging  # logging module
from doubanFilm.items import DoubanfilmItem

logger = logging.getLogger(__name__)

## 5i5j scraping with MySQL storage
class LovehomeSpider(scrapy.Spider):
    name = 'lovehome'
    allowed_domains = ['fang.5i5j.com']
    # pagination: t17n0 .. t17n12
    start_urls = ['http://fang.5i5j.com/bj/loupan/t17n' + str(i) for i in range(13)]

    def parse(self, response):  # follow each listing to its detail page
        # collect the detail-page link from every <li> on the listing page
        for row in response.xpath('/html/body/div[6]/div[1]/ul[1]/li/div[2]/div[1]/a/@href').extract():
            url = 'https://fang.5i5j.com' + row
            yield scrapy.Request(url, callback=self.parseHome)

    def parseHome(self, response):
        item = DoubanfilmItem()  # instantiate the item
        item["Name1"] = response.xpath('/html/body/div[5]/div[1]/div[1]/h1/text()').get()  # name
        item["location"] = response.xpath('/html/body/div[5]/div[2]/ul/li[5]/span[1]/text()').get()  # location
        item["averagePrice"] = response.xpath('string(/html/body/div[5]/div[2]/ul/li[1]/span[1])').get()  # average price
        item["broker"] = response.xpath('/html/body/div[5]/div[2]/ul/li[7]/span[1]/text()').get()  # broker
        logger.warning(item)  # LOG_LEVEL is WARNING, so this shows in the console
        yield item
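The absolute XPaths above (starting from /html/body/...) are brittle: any change to the page layout breaks them. Before running the full crawl, you can test them interactively in the Scrapy shell; this is a quick sanity check, not part of the project code:
scrapy shell "http://fang.5i5j.com/bj/loupan/t17n"
# inside the shell, test the listing-link XPath:
response.xpath('/html/body/div[6]/div[1]/ul[1]/li/div[2]/div[1]/a/@href').extract()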
items.py
import scrapy

class DoubanfilmItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    ## 5i5j scraping with MySQL storage
    Name1 = scrapy.Field()
    location = scrapy.Field()
    averagePrice = scrapy.Field()
    broker = scrapy.Field()
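A DoubanfilmItem behaves like a dict restricted to the declared fields, which is why the spider can write item["Name1"] = ... while a typo in a field name fails immediately. A small illustration (the values are made up):
from doubanFilm.items import DoubanfilmItem

item = DoubanfilmItem()
item["Name1"] = "Example Estate"  # fine: Name1 is declared above
item.get("broker")                # returns None: declared but not yet set
item["price"] = "50000"           # raises KeyError: 'price' was never declared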
pipelines.py
import pymysql

class DoubanfilmPipeline:
    def process_item(self, item, spider):
        return item

## 5i5j scraping with MySQL storage
class MySQLPipeline:
    # called once before crawling starts:
    # read the settings, open the connection and cursor
    def open_spider(self, spider):
        host = spider.settings.get("MYSQL_DB_HOST", "127.0.0.1")  # server IP
        port = spider.settings.get("MYSQL_DB_PORT", 3306)  # port
        dbname = spider.settings.get("MYSQL_DB_NAME", "cc")  # database name
        user = spider.settings.get("MYSQL_DB_USER", "root")  # user
        pwd = spider.settings.get("MYSQL_DB_PASSWORD", "your_password")  # password
        self.db_conn = pymysql.connect(host=host, port=port,
                                       db=dbname, user=user, password=pwd,
                                       charset="utf8")  # utf8 to match the database
        self.db_cur = self.db_conn.cursor()

    # called for every scraped item: insert one row
    def process_item(self, item, spider):
        values = (item["Name1"],
                  item["location"],
                  item["averagePrice"],
                  item["broker"])
        sql = "insert into Home(Name1,location,averagePrice,broker) values(%s,%s,%s,%s)"
        self.db_cur.execute(sql, values)
        return item

    # called after the whole crawl finishes:
    # commit the data and release the connection
    def close_spider(self, spider):
        self.db_conn.commit()
        self.db_cur.close()
        self.db_conn.close()
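Note that this pipeline only commits once, in close_spider, so if the crawl is interrupted every inserted row is lost. A more defensive variant of process_item, committing per item and rolling back failed inserts, might look like this (the error handling is my addition, not part of the original):
    def process_item(self, item, spider):
        values = (item["Name1"], item["location"],
                  item["averagePrice"], item["broker"])
        sql = "insert into Home(Name1,location,averagePrice,broker) values(%s,%s,%s,%s)"
        try:
            self.db_cur.execute(sql, values)
            self.db_conn.commit()  # commit each row so partial crawls keep their data
        except pymysql.MySQLError:
            self.db_conn.rollback()  # drop the failed insert and keep crawling
            spider.logger.warning("insert failed for %s", item.get("Name1"))
        return item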
settings.py
DOWNLOAD_DELAY = 2  # delay between requests, in seconds
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'  # request header
ROBOTSTXT_OBEY = False  # ignore robots.txt; otherwise many pages cannot be crawled
LOG_LEVEL = "WARNING"  # only show warnings and above
MYSQL_DB_HOST = "127.0.0.1"  # server IP
MYSQL_DB_PORT = 3306  # port
MYSQL_DB_NAME = "cc"  # database name
MYSQL_DB_USER = "root"  # user
MYSQL_DB_PASSWORD = "your_password"  # password
ITEM_PIPELINES = {
    'doubanFilm.pipelines.MySQLPipeline': 1
}
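Because the pipeline reads these values through spider.settings.get(), any of them can also be overridden from the command line with Scrapy's -s flag, which is handy for keeping the real password out of settings.py:
scrapy crawl lovehome -s MYSQL_DB_PASSWORD=real_password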
Finally
Run this in the terminal (lovehome is the spider name, not the project name):
scrapy crawl lovehome
Open MySQL and you will see the scraped data in the Home table.
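To check the results from Python rather than the MySQL client, a short query script (same hypothetical connection parameters as before):
import pymysql

conn = pymysql.connect(host="127.0.0.1", port=3306, db="cc",
                       user="root", password="your_password", charset="utf8")
try:
    with conn.cursor() as cur:
        cur.execute("select Name1, location, averagePrice, broker from Home limit 5")
        for row in cur.fetchall():
            print(row)  # one tuple per listing
        cur.execute("select count(*) from Home")
        print("total rows:", cur.fetchone()[0])
finally:
    conn.close()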