A Python Scraping Project: Crawling New Homes in Lianjia's Popular Cities
This walkthrough uses a crawler to scrape new-home listings from Lianjia (disclaimer: the content is for learning and exchange only; do not use it for commercial purposes).
Environment
Windows 8, Python 3.7, PyCharm
Walkthrough
1. Analyze the target site
Through analysis, identify the relevant URLs, determine the request method, and check whether any JavaScript encryption or obfuscation is involved.
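A quick way to confirm those conclusions is to request one listing page directly and check that the data is rendered server-side (no JS decryption needed) and that the total listing count is exposed in the HTML. This is only a sketch; it assumes the Beijing listing URL https://bj.fang.lianjia.com/loupan/ and the data-total-count attribute observed on the page, which may change over time.

import requests
from lxml import etree

# Fetch the first page of Beijing new-home listings (assumed URL pattern).
headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('https://bj.fang.lianjia.com/loupan/pg1/', headers=headers)
print(resp.status_code)   # 200 means a plain GET works, no JS challenge

html = etree.HTML(resp.text)
# The total number of listings is embedded in the page; the spider later
# uses this value to compute how many pages to request.
print(html.xpath('//div[@class="page-box"]/@data-total-count'))
# Each li under resblock-list-wrapper is one property card.
print(len(html.xpath('//ul[@class="resblock-list-wrapper"]/li')))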
2. Create the Scrapy project
1. In a cmd window, run the following command to create the lianjia project:
scrapy startproject lianjia
2. Still in cmd, change into the lianjia directory and create the spider file:
cd lianjia
scrapy genspider -t crawl xinfang lianjia.com
This generates a CrawlSpider, a spider class well suited to crawling pages in bulk.
3. Create a main.py file, which will be used to launch the Scrapy project.
With that, the project skeleton is complete (the resulting layout is sketched below); next we write the actual code.
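If the commands above succeeded, the directory tree should look roughly like this (main.py is the file we add by hand at the project root, next to scrapy.cfg):

lianjia/
├── scrapy.cfg
├── main.py              # added manually, used to launch the spider from PyCharm
└── lianjia/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── xinfang.py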
3. Define the item fields
Define the fields to be scraped in items.py:
import scrapy
from scrapy.item import Item, Field


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = Field()            # city name
    name = Field()            # development (楼盘) name
    type = Field()            # property type
    status = Field()          # sale status
    region = Field()          # district
    street = Field()          # street
    address = Field()         # detailed address
    area = Field()            # floor area
    average_price = Field()   # average price
    total_price = Field()     # total price
    tags = Field()            # tags
4. The spider
Write the main spider logic in xinfang.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lianjia.items import LianjiaItem


class XinfangSpider(CrawlSpider):
    name = 'xinfang'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://bj.fang.lianjia.com/']
    # Crawling rules: LinkExtractor extracts links (allow restricts the URL pattern,
    # restrict_xpaths restricts where in the page the links may appear);
    # follow=True means the extracted links are followed, and callback names
    # the function that parses the response.
    rules = (
        Rule(LinkExtractor(allow=r'\.fang.*com/$',
                           restrict_xpaths='//div[@class="footer"]//div[@class="link-list"]/div[2]/dd'),
             follow=True),
        Rule(LinkExtractor(allow=r'.*loupan/$',
                           restrict_xpaths='//div[@class="xinfang-all"]/div/a'),
             callback='parse_item', follow=True)
    )

    def parse_item(self, response):
        '''Request every listing page of the current city'''
        counts = response.xpath('//div[@class="page-box"]/@data-total-count').extract_first()
        # Each page lists 10 properties, and the site shows at most 100 pages.
        pages = int(counts) // 10 + 2
        if pages > 100:
            pages = 101
        for page in range(1, pages):
            url = response.url + "pg" + str(page)
            yield scrapy.Request(url, callback=self.parse_detail, dont_filter=False)

    def parse_detail(self, response):
        '''Parse the page content'''
        item = LianjiaItem()
        item["city"] = response.xpath('//div[@class="resblock-have-find"]/span[3]/text()').extract_first()[1:]
        infos = response.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for info in infos:
            item["name"] = info.xpath('div/div[1]/a/text()').extract_first()
            item["type"] = info.xpath('div/div[1]/span[1]/text()').extract_first()
            item["status"] = info.xpath('div/div[1]/span[2]/text()').extract_first()
            item["region"] = info.xpath('div/div[2]/span[1]/text()').extract_first()
            item["street"] = info.xpath('div/div[2]/span[2]/text()').extract_first()
            item["address"] = info.xpath('div/div[2]/a/text()').extract_first().replace(",", "")
            item["area"] = info.xpath('div/div[@class="resblock-area"]/span/text()').extract_first()
            item["average_price"] = "".join(info.xpath('div//div[@class="main-price"]//text()').extract()).replace(" ", "")
            item["total_price"] = info.xpath('div//div[@class="second"]/text()').extract_first()
            item["tags"] = ";".join(info.xpath('div//div[@class="resblock-tag"]//text()').extract()).replace(" ", "").replace("\n", "")
            yield item
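Before wiring up the MySQL pipeline, you can smoke-test the spider from the project root and dump the scraped items to a feed file instead (assuming the settings from step 7 are in place):

scrapy crawl xinfang -o sample.json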
5. Save to a MySQL database
Edit pipelines.py as follows:
import pymysql


class LianjiaPipeline(object):
    def __init__(self):
        # Create the database connection
        self.db = pymysql.connect(
            host="localhost",
            user="root",
            password="1234",
            port=3306,
            db="lianjia",
            charset="utf8"
        )
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Store the item in the database
        sql = "insert into xinfang(city, name, type, status, region, street, address, area, average_price, total_price, tags) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        data = (item["city"], item["name"], item["type"], item["status"], item["region"],
                item["street"], item["address"], item["area"], item["average_price"],
                item["total_price"], item["tags"])
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except:
            self.db.rollback()
        finally:
            return item
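The pipeline assumes a lianjia database containing an xinfang table already exists. The original post does not show the table definition, so the one-off helper below is only a sketch that creates a schema matching the eleven columns used in the INSERT statement; the column types and lengths are assumptions you may want to adjust.

import pymysql

# One-off helper: create the database and table the pipeline writes into.
# Column types are assumptions; the article does not show the real schema.
db = pymysql.connect(host="localhost", user="root", password="1234",
                     port=3306, charset="utf8")
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS lianjia DEFAULT CHARACTER SET utf8")
cursor.execute("USE lianjia")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS xinfang (
        id INT AUTO_INCREMENT PRIMARY KEY,
        city VARCHAR(32),
        name VARCHAR(128),
        type VARCHAR(32),
        status VARCHAR(32),
        region VARCHAR(64),
        street VARCHAR(64),
        address VARCHAR(255),
        area VARCHAR(64),
        average_price VARCHAR(64),
        total_price VARCHAR(64),
        tags VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()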
6. Anti-anti-scraping measures
Since this is a bulk crawl, some counter-measures against anti-scraping are advisable; here I use free proxy IPs. Edit middlewares.py as follows:
import logging

import requests
from scrapy import signals


class ProxyMiddleware(object):
    def __init__(self, proxy):
        self.logger = logging.getLogger(__name__)
        self.proxy = proxy

    @classmethod
    def from_crawler(cls, crawler):
        '''Read the random-proxy API endpoint from the settings'''
        settings = crawler.settings
        return cls(
            proxy=settings.get('RANDOM_PROXY')
        )

    def get_random_proxy(self):
        '''Fetch a random proxy from the API'''
        try:
            response = requests.get(self.proxy)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except:
            return False

    def process_request(self, request, spider):
        '''Send the request through the randomly obtained proxy'''
        proxy = self.get_random_proxy()
        if proxy:
            url = 'http://' + str(proxy)
            self.logger.debug('Using proxy ' + proxy)
            request.meta['proxy'] = url
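The middleware only assumes that a GET on the RANDOM_PROXY URL (configured in step 7) returns a plain host:port string, which is how typical free proxy-pool services behave. If you just want to test the middleware locally without a real proxy pool, a minimal stand-in service could look like the sketch below; the proxy addresses in it are placeholders, not working proxies.

import random
from http.server import BaseHTTPRequestHandler, HTTPServer

# Placeholder proxies for illustration only; replace with real ones.
PROXIES = ["1.2.3.4:8080", "5.6.7.8:3128"]

class RandomProxyHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Reply to any GET (e.g. /random) with one proxy as plain text,
        # which is the format ProxyMiddleware.get_random_proxy expects.
        body = random.choice(PROXIES).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("localhost", 6686), RandomProxyHandler).serve_forever()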
7. Configure the settings file
import random

RANDOM_PROXY = "http://localhost:6686/random"

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = random.random() * 2

COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'lianjia.middlewares.ProxyMiddleware': 543
}

ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
8. Run the project
Put the following in main.py and run it:
from scrapy import cmdline

cmdline.execute('scrapy crawl xinfang'.split())
The Scrapy project then starts running; in the end it scraped more than 14,000 records.
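A quick way to verify the run is to count the rows that actually landed in MySQL, using the same connection parameters as the pipeline:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="1234",
                     port=3306, db="lianjia", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM xinfang")
print(cursor.fetchone()[0])   # number of scraped records
db.close()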