Scrapy spider code for fetching 58.com real-estate community (xiaoqu) listings
Without further ado, here is the code.
1. To deal with anti-scraping measures, first set the default request headers
DEFAULT_REQUEST_HEADERS = {
    # HTTP/2 pseudo-headers (':authority', ':method', ':scheme') are not
    # regular headers and are not set here; a hard-coded 'if-modified-since'
    # is also avoided, since it can trigger empty 304 responses.
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    'referer': 'https://sh.58.com/xiaoqu/',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
}
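These headers go in settings.py. A minimal sketch of companion settings that also help avoid blocking; the delay and concurrency values are illustrative assumptions, not from the original post:

# settings.py -- illustrative anti-blocking settings
ROBOTSTXT_OBEY = False               # only if you accept ignoring robots.txt
DOWNLOAD_DELAY = 1.0                 # pause between requests to reduce ban risk
CONCURRENT_REQUESTS_PER_DOMAIN = 2   # keep the request rate low
DEFAULT_REQUEST_HEADERS = {
    # ... the header dict shown above ...
}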
2. The spider code
import scrapy

from sinanewsroll.items import SinanewsrollItem


class SinanewsrollSpider(scrapy.Spider):
    name = 'sinanewsroll'
    allowed_domains = ["sh.58.com"]
    start_urls = [
        'https://sh.58.com/xiaoqu/11352/pn_1/'
    ]

    def parse(self, response):
        # Number of listings on the current page.
        num = len(response.xpath('//*[@class="xq-list-wrap"]/li').extract())
        print('=-=-------' + str(num) + '---=-=-=')
        if num > 0:
            # Build the next page URL by incrementing the pn_<n> segment.
            parts = str(response.url).split('/')
            newurl = parts[0] + "//" + parts[2] + "/" + parts[3] + "/" + parts[4]
            newurlnum = int(parts[5].split('_')[1]) + 1
            newurl = newurl + "/pn_" + str(newurlnum) + "/"
            print('=-=-next url--' + newurl + '---=-=-=')
            yield scrapy.Request(url=newurl, dont_filter=False, callback=self.parse)
        # XPath positions are 1-based, so iterate 1..num inclusive.
        for i in range(1, num + 1):
            item = SinanewsrollItem()
            item['main_url'] = response.url
            # Image URL, title text, and detail-page link for this listing.
            item['article_imageurl'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="pic"]/a/img/@data-src').extract()
            item['article_title'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="list-info"]/h2/a/text()').extract()
            item['article_url'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="list-info"]/h2/a/@href').extract()
            # Two rows of basic listing info, plus the price unit text.
            item['article_excerpt'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="list-info"]//*[@class="baseinfo"][1]//span/text()').extract()
            item['article_author'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="list-info"]//*[@class="baseinfo"][2]//span/text()').extract()
            item['article_date'] = response.xpath('//*[@class="xq-list-wrap"]/li[' + str(i) + ']//*[@class="price"]//*[@class="unit"]/text()').extract()
            item['article_content'] = " "
            yield item

    # Parse the detail page content (not reached as written; see the sketch below).
    def parse_detail(self, response):
        print('--URL---' + str(response.url) + '------')
        item = response.meta['item']
        item['article_content'] = response.xpath('//*[@class="detail-text-cont simditor-body"]').extract()
        return item
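Note that parse_detail is never invoked, because parse never issues a request for a detail page. A minimal sketch of how it could be wired up inside the item loop, assuming the href collected into article_url is the detail-page link:

# inside the for-loop in parse(), replacing "yield item":
detail_urls = item['article_url']
if detail_urls:
    # Hand the half-filled item to parse_detail via request meta;
    # parse_detail fills in article_content and returns the item.
    yield scrapy.Request(url=response.urljoin(detail_urls[0]),
                         meta={'item': item},
                         callback=self.parse_detail)
else:
    yield item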
3. The pipeline code

def list2str(value):
    # Works for both a plain string and a list of strings from .extract().
    new = ''.join(value).strip()
    return new
class SinanewsrollPipeline(object):
    def process_item(self, item, spider):
        # Each field arrives as either a string or a list of strings from
        # .extract(); flatten every one into a single trimmed string.
        for field in ('main_url', 'article_title', 'article_date',
                      'article_content', 'article_url', 'article_excerpt',
                      'article_imageurl', 'article_author'):
            item[field] = list2str(item[field])
        return item
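For the pipeline to run, it must be registered in settings.py, assuming the class lives in sinanewsroll/pipelines.py (300 is the conventional default priority):

ITEM_PIPELINES = {
    'sinanewsroll.pipelines.SinanewsrollPipeline': 300,
}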
4. The item code
import scrapy


class SinanewsrollItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    main_url = scrapy.Field()
    article_title = scrapy.Field()
    article_date = scrapy.Field()
    article_content = scrapy.Field()
    article_url = scrapy.Field()
    article_excerpt = scrapy.Field()
    article_imageurl = scrapy.Field()
    article_author = scrapy.Field()
    article_categories = scrapy.Field()
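With the headers, spider, pipeline, and item in place, run the spider from the project root; the -o flag exports the scraped items, for example:

scrapy crawl sinanewsroll -o xiaoqu.json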