Scrapy spider code for fetching 58.com housing-community (xiaoqu) listings


Without further ado, here is the code.

1. Request headers: to reduce the chance of being blocked by 58.com's anti-scraping measures, we first set default request headers in settings.py.

DEFAULT_REQUEST_HEADERS = {
    # HTTP/2 pseudo-headers (:authority, :method, :scheme) must not go here;
    # Scrapy would send them as literal, invalid header names.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    # a hard-coded If-Modified-Since invites empty 304 responses, so it is omitted
    'Referer': 'https://sh.58.com/xiaoqu/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
}
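
Headers alone are often not enough against rate limiting. Below is a minimal settings.py sketch of throttling options commonly paired with custom headers; the specific delay and concurrency values are illustrative assumptions, not something the original setup specifies:

# settings.py (sketch; values are assumptions, tune for your target)
ROBOTSTXT_OBEY = False            # 58.com's robots.txt disallows most crawlers
DOWNLOAD_DELAY = 2                # pause between requests to stay under rate limits
CONCURRENT_REQUESTS_PER_DOMAIN = 1
RETRY_ENABLED = True
RETRY_TIMES = 3                   # retry transient failures a few times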

2. The spider code

import scrapy

# the item container defined in items.py (see section 4)
from sinanewsroll.items import SinanewsrollItem

class SinanewsrollSpider(scrapy.Spider):
    name = 'sinanewsroll'
    allowed_domains = ["sh.58.com"]
    start_urls = [
                'https://sh.58.com/xiaoqu/11352/pn_1/'
                ]
    def parse(self, response):
        # number of listing rows on the current page
        num = len(response.xpath('//*[@class="xq-list-wrap"]/li').extract())
        print('=-=-------' + str(num) + '---=-=-=')

        # while the page still has listings, queue the next page, e.g.
        # https://sh.58.com/xiaoqu/11352/pn_1/ -> https://sh.58.com/xiaoqu/11352/pn_2/
        if num > 0:
            parts = response.url.split('/')
            base = parts[0] + '//' + parts[2] + '/' + parts[3] + '/' + parts[4]
            next_page = int(parts[5].split('_')[1]) + 1
            newurl = base + '/pn_' + str(next_page) + '/'
            print('=-=-next url--' + newurl + '---=-=-=')
            yield scrapy.Request(url=newurl, dont_filter=False, callback=self.parse)

        # XPath li[] indices are 1-based, so iterate from 1 to num inclusive
        for i in range(1, num + 1):
            item = SinanewsrollItem()
            row = '//*[@class="xq-list-wrap"]/li[' + str(i) + ']'

            item['main_url'] = response.url
            item['article_imageurl'] = response.xpath(row + '//*[@class="pic"]/a/img/@data-src').extract()
            item['article_title'] = response.xpath(row + '//*[@class="list-info"]/h2/a/text()').extract()
            item['article_url'] = response.xpath(row + '//*[@class="list-info"]/h2/a/@href').extract()
            item['article_excerpt'] = response.xpath(row + '//*[@class="list-info"]//*[@class="baseinfo"][1]//span/text()').extract()
            item['article_author'] = response.xpath(row + '//*[@class="list-info"]//*[@class="baseinfo"][2]//span/text()').extract()
            item['article_date'] = response.xpath(row + '//*[@class="price"]//*[@class="unit"]/text()').extract()
            item['article_content'] = "  "

            yield item

    # parse the detail page; expects the partially filled item to be passed
    # in via the request meta (see the follow-up request sketch below)
    def parse_detail(self, response):
        print('--URL---' + str(response.url) + '------')
        item = response.meta['item']

        item['article_content'] = response.xpath('//*[@class="detail-text-cont simditor-body"]').extract()

        return item
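
As written, parse yields each item directly and parse_detail is never invoked. A minimal sketch of wiring it up, replacing the final yield item inside the loop; it assumes article_url holds a usable link (response.urljoin resolves relative paths):

            # instead of `yield item`, follow the listing's detail page and
            # carry the partially filled item along in the request meta
            if item['article_url']:
                detail_url = response.urljoin(item['article_url'][0])
                yield scrapy.Request(url=detail_url, meta={'item': item},
                                     callback=self.parse_detail)
            else:
                yield item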

3. The pipeline code (pipelines.py)

def list2str(value):
    """Join an extracted list (or pass a plain string through) and strip whitespace."""
    new = ''.join(value).strip()
    return new

class SinanewsrollPipeline(object):
    # every field scraped above comes back from .extract() as a list
    # and needs flattening to a plain string
    TEXT_FIELDS = (
        'main_url', 'article_title', 'article_date', 'article_content',
        'article_url', 'article_excerpt', 'article_imageurl', 'article_author',
    )

    def process_item(self, item, spider):
        for field in self.TEXT_FIELDS:
            item[field] = list2str(item[field])
        return item
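
The pipeline only runs once it is registered in settings.py. A short sketch; the priority value 300 is the conventional default from the Scrapy project template, an assumption rather than something the original post specifies:

# settings.py
ITEM_PIPELINES = {
    'sinanewsroll.pipelines.SinanewsrollPipeline': 300,
}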

4. The item definition (items.py)

import scrapy

class SinanewsrollItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    main_url = scrapy.Field()
    article_title = scrapy.Field()
    article_date = scrapy.Field()
    article_content = scrapy.Field()
    article_url = scrapy.Field()
    article_excerpt = scrapy.Field()
    article_imageurl = scrapy.Field()
    article_author = scrapy.Field()
    article_categories = scrapy.Field()
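
With all four pieces in place, the spider can be run from the project root and the flattened items exported via Scrapy's feed exports, for example:

scrapy crawl sinanewsroll -o xiaoqu.csv

Here xiaoqu.csv is just an example filename; a .json or .jl extension works equally well, and each exported row corresponds to one community listing.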
