Scraping Dianping (大众点评)
Create the Scrapy project from the command line, as shown below.
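For a project named dianping with a spider called ping (the names the code below assumes), the usual commands are:

scrapy startproject dianping
cd dianping
scrapy genspider ping dianping.com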
1、ping.py
# -*- coding: utf-8 -*-
import os
import random
import re

import requests
import scrapy
# from fake_useragent import UserAgent
# ua = UserAgent()
from lxml import etree

from dianping.items import shop
from dianping.settings import user_agent
from fontTools.ttLib import TTFont


class PingSpider(scrapy.Spider):
    name = 'ping'
    # allowed_domains takes bare domains, not URLs
    allowed_domains = ['dianping.com']
    start_urls = ['https://www.dianping.com/beijing/ch10']
    custom_settings = {
        'ITEM_PIPELINES': {'dianping.pipelines.shopPipelines': 301},
    }
    headers = {
        'lgtoken': '0897479c0-088d-4343-a98f-82bff8f7bf23',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': '_lxsdk_cuid=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _lxsdk=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _hc.v=32cef2f6-b6e1-4f3f-52cc-f7f971670e42.1583627422; s_ViewType=10; ua=dpuser_8153713552; ctu=307ee90b2640a3fdfb7d5160e160f22434ac9fe3475d11d27394f1c12f34fba0; cy=3592; cye=zhongningxian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=170c46808f8-d46-74c-84b%7C%7C65',
        'Host': 'www.dianping.com',
        'Referer': 'http://www.dianping.com/zhongningxian/ch10',
        'User-Agent': random.choice(user_agent),
        'X-Requested-With': 'XMLHttpRequest'
    }
    cookies = {
        's_ViewType': '10',
        '_lxsdk_cuid': '16e48a565e1c8-04659e09b895fe-e343166-144000-16e48a565e373',
        '_lxsdk': '170c3969300-358-fb-b16%7C%7C140',
        '_hc.v': '43db2bd6-f5d5-fee9-59df-981a1d70742c.1573176830'
    }
    fontDir = {}
    def get_font(self, filePath):
        # Handle the obfuscated digits
        font = TTFont(filePath)
        font_names = font.getGlyphOrder()
        # These are the glyph names you see after opening the font file in a
        # font editor such as FontEditor; the first two entries are placeholders
        texts = ['', '', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        font_name = {}
        # Map each glyph's HTML entity (e.g. &#xf5e2;) to the digit it renders as
        for index, value in enumerate(texts):
            a = font_names[index].replace('uni', '&#x').lower() + ";"
            font_name[a] = value
        # print("wenzi", font_name)
        self.fontDir = font_name
    def getCss(self, css_url):
        # Fetch the CSS file and pull out the woff font it references
        try:
            r = requests.get(css_url)
            # print(r.text)
            r.encoding = 'utf-8'
            # The @font-face rule for the digits used in review counts
            numwoff = re.findall(r'@font-face{font-family: "PingFangSC-Regular-shopNum(.*?).woff', r.text, re.S)
            print(numwoff)
            # Save the font file locally
            self.savewoff(numwoff, "num.woff")
        except Exception as e:
            print("error", e)
    # Save the downloaded woff file to disk
    def savewoff(self, woff, filename):
        headers = {
            'User-Agent': random.choice(user_agent),
            'Connection': "keep-alive",
            'Host': "s3plus.meituan.net",
            'referer': 'http://www.dianping.com/',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        }
        result = ''
        for tmp in woff:
            result = result + tmp
        woff2 = re.findall(r'url\("//(.*?);', result, re.S)
        resultb = ''
        for a in woff2:
            resultb = resultb + str(a).replace('eot', 'woff').replace(')', '').replace('"', '')
        # print(resultb)
        url = "https://" + resultb
        response_woff = requests.get(url, headers=headers).content
        path = "static/woff/"
        # Make sure the target directory exists, then write the font file
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + filename, 'wb') as f:
            f.write(response_woff)
        self.get_font(path + filename)
    def start_requests(self):
        '''
        Override start_requests so the first request carries our headers and cookies
        :return:
        '''
        start_urls = 'http://www.dianping.com/beijing/ch10'
        yield scrapy.Request(url=start_urls, headers=self.headers, cookies=self.cookies, callback=self.parse)
    def parse(self, response):
        svgtextcss = re.search(r'href="([^"]+svgtextcss[^"]+)"', response.text, re.M)
        print(svgtextcss)
        if svgtextcss:
            css_url = "http:" + svgtextcss.group(1)
            print("extracted CSS link", css_url)
        else:
            print("no CSS link found, using a known fallback")
            css_url = "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/eb22f63af49ed95fc4b2b41cd112b91f.css"
        self.getCss(css_url)
        print("font mapping", self.fontDir)
        data = response.text
        # print(data)
        for key in self.fontDir:
            if key in data:
                # The entity from the raw page matches a key in the font dict,
                # so swap it for the real digit
                data = data.replace(key, str(self.fontDir[key]))
        htmldata = etree.HTML(data)
        li = htmldata.xpath("/html/body/div[@class='section Fix J-shop-search']/div[@class='content-wrap']/div[@class='shop-wrap']/div[@class='content']/div[@id='shop-all-list']/ul/li")
        print(li)
        for item in li:
            shopList = shop()
            # Shop name; xpath('.//div[@class="tit"]/a/@title')[0] also works
            shopList["shopName"] = item.xpath(".//a//h4/text()")[0]
            print(shopList["shopName"])
            # Shop photo
            shopList["shopThumb"] = item.xpath(".//div[@class='pic']//a//img/@data-src")[0]
            print(shopList["shopThumb"])
            # Shop link
            shopList["shopUrl"] = item.xpath(".//a/@href")[0]
            print("shop link", shopList["shopUrl"])
            # Review count
            shopList["shopComment"] = "".join(
                item.xpath(".//div[@class='txt']//div[@class='comment']//a[@class='review-num']/b//text()"))
            print("review count", shopList["shopComment"])
            # Average spend per person
            shopList["shopAverage"] = "".join(
                item.xpath(".//div[@class='txt']//div[@class='comment']/a[last()]/b//text()"))
            print("average spend", shopList["shopAverage"])
            # Taste score
            shopList["shopTaste"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[1]//b//text()"))
            print("taste", shopList["shopTaste"])
            # Environment score
            shopList["shopView"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[2]//b//text()"))
            print("environment", shopList["shopView"])
            # Service score
            shopList["shopService"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[3]//b//text()"))
            print("service", shopList["shopService"])
            # Hand the item to the pipeline for storage
            yield shopList
        # Check whether there is a next page
        next_page = htmldata.xpath("//div[@class='page']//a[@class='next']")
        if next_page:
            url = response.urljoin(next_page[0].xpath(".//@href")[0])
            print("next page", url)
            yield scrapy.Request(url, self.parse, headers=self.headers, cookies=self.cookies, dont_filter=True)
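Note that ping.py imports user_agent from dianping.settings, which the original post does not show. It is presumably just a pool of User-Agent strings for random.choice to pick from; a minimal sketch (the actual strings here are assumptions):

# settings.py (excerpt; sketch of the user_agent pool the spider expects)
user_agent = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
]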
2、items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DianpingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class shop(scrapy.Item):
    # Target table name, read by the pipeline
    table = "shop"
    # Shop name
    shopName = scrapy.Field()
    # Shop photo
    shopThumb = scrapy.Field()
    # Shop link
    shopUrl = scrapy.Field()
    # Review count
    shopComment = scrapy.Field()
    # Average spend per person
    shopAverage = scrapy.Field()
    # Taste score
    shopTaste = scrapy.Field()
    # Environment score
    shopView = scrapy.Field()
    # Service score
    shopService = scrapy.Field()
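The pipeline below leans on scrapy.Item behaving like a dict: keys() and values() return only the fields that have been populated, in matching order, and table is an ordinary class attribute. A quick illustration (the field values are made up):

item = shop(shopName='Some Restaurant', shopComment='1024')
print(item.table)           # 'shop'
print(list(item.keys()))    # ['shopName', 'shopComment']
print(list(item.values()))  # ['Some Restaurant', '1024']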
3、pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class DianpingPipeline(object):
    def process_item(self, item, spider):
        return item


class shopPipelines(object):
    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='39.106.8.123',   # MySQL server address
            port=3306,             # MySQL server port
            user='root',           # username
            passwd='[email protected]',  # password
            db='dzdp',             # database name
            charset='utf8'         # connection encoding
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Build the INSERT statement dynamically from the item's fields
        tablename = item.table
        print(tablename)
        keys = list(item.keys())
        keystr = ",".join(keys)
        values = [str(x) for x in item.values()]
        # One "%s" placeholder per column
        tag = ("%s," * len(keys))[0:-1]
        print(tag)
        sql = "INSERT INTO %s(%s) VALUES(%s)" % (tablename, keystr, tag)
        print(sql)
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.conn.close()
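The pipeline assumes a shop table already exists in the dzdp database, with one column per item field. A hypothetical one-off script that would create a matching table (column names are dictated by the item; the types and lengths are assumptions):

import pymysql

conn = pymysql.Connect(host='39.106.8.123', port=3306, user='root',
                       passwd='[email protected]', db='dzdp', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS shop (
            id INT AUTO_INCREMENT PRIMARY KEY,
            shopName VARCHAR(255),
            shopThumb VARCHAR(512),
            shopUrl VARCHAR(512),
            shopComment VARCHAR(32),
            shopAverage VARCHAR(32),
            shopTaste VARCHAR(32),
            shopView VARCHAR(32),
            shopService VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()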