Scraping Dianping (大众点评)


Create the Scrapy project from the command line.
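Assuming the project is named dianping and the spider is named ping (the names used by the imports and the spider class below), the project can be scaffolded with the standard Scrapy CLI:

scrapy startproject dianping
cd dianping
scrapy genspider ping dianping.com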
1、ping.py

# -*- coding: utf-8 -*-
import os
import random
import re

import requests
import scrapy
# from fake_useragent import UserAgent
# ua = UserAgent()
from lxml import etree

from dianping.items import shop
from dianping.settings import user_agent
from fontTools.ttLib import TTFont

class PingSpider(scrapy.Spider):
    name = 'ping'
    # allowed_domains must contain domains, not full URLs
    allowed_domains = ['dianping.com']
    start_urls = ['https://www.dianping.com/beijing/ch10']
    custom_settings = {
        'ITEM_PIPELINES': {'dianping.pipelines.shopPipelines': 301},
    }
    headers = {
        'lgtoken': '0897479c0-088d-4343-a98f-82bff8f7bf23',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': '_lxsdk_cuid=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _lxsdk=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _hc.v=32cef2f6-b6e1-4f3f-52cc-f7f971670e42.1583627422; s_ViewType=10; ua=dpuser_8153713552; ctu=307ee90b2640a3fdfb7d5160e160f22434ac9fe3475d11d27394f1c12f34fba0; cy=3592; cye=zhongningxian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=170c46808f8-d46-74c-84b%7C%7C65',
        'Host': 'www.dianping.com',
        'Referer': 'http://www.dianping.com/zhongningxian/ch10',
        'User-Agent': random.choice(user_agent),
        'X-Requested-With': 'XMLHttpRequest'
    }
    cookies = {
        "s_ViewType": "10",
        "_lxsdk_cuid": "16e48a565e1c8-04659e09b895fe-e343166-144000-16e48a565e373",
        "_lxsdk": "170c3969300-358-fb-b16%7C%7C140",
        "_hc.v": "43db2bd6-f5d5-fee9-59df-981a1d70742c.1573176830"
    }
    fontDir = {}
    def get_font(self, filePath):
        # Build the mapping for the obfuscated digit glyphs
        font = TTFont(filePath)
        font_names = font.getGlyphOrder()
        # These are the characters you see when the font file is opened in a font editor
        texts = ['', '', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        font_name = {}
        # Map each HTML entity (derived from the glyph name) to the digit it represents
        for index, value in enumerate(texts):
            a = font_names[index].replace('uni', '&#x').lower() + ";"
            font_name[a] = value
        # print("font mapping", font_name)
        self.fontDir = font_name
    def getCss(self, css_url):
        # Fetch the CSS file that references the obfuscation fonts
        try:
            r = requests.get(css_url)
            # print(r.text)
            r.encoding = 'utf-8'
            # Pull out the @font-face rule for the review numbers (shopNum)
            numwoff = re.findall('@font-face{font-family: "PingFangSC-Regular-shopNum(.*?).woff', r.text, re.S)
            print(numwoff)
            # Save the woff file locally
            self.savewoff(numwoff, "num.woff")
        except Exception as e:
            print("error", e)
    # Save the fetched woff file locally
    def savewoff(self, woff, filename):
        headers = {'User-Agent': random.choice(user_agent),
                   'Connection': "keep-alive",
                   'Host': "s3plus.meituan.net",
                   'referer': 'http://www.dianping.com/',
                   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                   }
        result = ''
        for tmp in woff:
            result = result + tmp
        # Extract the font URL from the @font-face src
        woff2 = re.findall(r'url\("//(.*?);', result, re.S)
        resultb = ''
        for a in woff2:
            resultb = resultb + str(a).replace('eot', 'woff').replace(')', '').replace('"', '')
        # print(resultb)
        url = "https://" + resultb
        response_woff = requests.get(url, headers=headers).content
        path = "static/woff/"
        # Create the directory if needed, then always write the font file
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + filename, 'wb') as f:
            f.write(response_woff)
        self.get_font(path + filename)


    def start_requests(self):
        '''
        Override start_requests so the custom headers and cookies go out with the first request
        :return:
        '''
        start_urls = 'http://www.dianping.com/beijing/ch10'
        yield scrapy.Request(url=start_urls, headers=self.headers, cookies=self.cookies, callback=self.parse)
    def parse(self, response):
        # Locate the svgtextcss stylesheet that points at the obfuscation fonts
        svgtextcss = re.search(r'href="([^"]+svgtextcss[^"]+)"', response.text, re.M)
        print(svgtextcss)
        if svgtextcss:
            css_url = "http:" + svgtextcss.group(1)
            print("css url found", css_url)
        else:
            print("css url not found, falling back to a known copy")
            css_url = "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/eb22f63af49ed95fc4b2b41cd112b91f.css"

        self.getCss(css_url)
        print("font mapping", self.fontDir)
        data = response.text
        for key in self.fontDir:
            if key in data:
                # The encoded entity appears in the raw page; replace it with the real digit
                data = data.replace(key, str(self.fontDir[key]))
        htmldata=etree.HTML(data)
        li = htmldata.xpath("/html/body/div[@class='section Fix J-shop-search']/div[@class='content-wrap']/div[@class='shop-wrap']/div[@class='content']/div[@id='shop-all-list']/ul/li")
        print(li)
        for item in li:
            shopList = shop()
            # Shop name (alternatively: item.xpath('.//div[@class="tit"]/a/@title')[0])
            shopList["shopName"] = item.xpath(".//a//h4/text()")[0]
            print(shopList["shopName"])
            # Shop photo
            shopList["shopThumb"] = item.xpath(".//div[@class='pic']//a//img/@data-src")[0]
            print(shopList["shopThumb"])
            # Shop detail page link
            shopList["shopUrl"] = item.xpath(".//a/@href")[0]
            print("shop url", shopList["shopUrl"])
            # Number of reviews
            shopList["shopComment"] = "".join(
                item.xpath(".//div[@class='txt']//div[@class='comment']//a[@class='review-num']/b//text()"))
            print("review count", shopList["shopComment"])
            # Average price per person
            shopList["shopAverage"] = "".join(item.xpath(".//div[@class='txt']//div[@class='comment']/a[last()]/b//text()"))
            print("average price", shopList["shopAverage"])
            # Taste score
            shopList["shopTaste"] = "".join(item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[1]//b//text()"))
            print("taste", shopList["shopTaste"])
            # Environment score
            shopList["shopView"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[2]//b//text()"))
            print("environment", shopList["shopView"])
            # Service score
            shopList["shopService"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[3]//b//text()"))
            print("service", shopList["shopService"])
            yield shopList
        # Check whether there is a next page
        next_page = htmldata.xpath("//div[@class='page']//a[@class='next']")
        if next_page:
            url = next_page[0].xpath(".//@href")[0]
            print("next page", url)
            yield scrapy.Request(url, self.parse, headers=self.headers, cookies=self.cookies, dont_filter=True)
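
The spider imports user_agent from dianping.settings, which the article never shows. A minimal sketch of what that part of settings.py might look like (the UA strings and the extra throttling settings are assumptions, not taken from the original project):

# dianping/settings.py (sketch, assumed; only user_agent is strictly required by ping.py)
BOT_NAME = 'dianping'
SPIDER_MODULES = ['dianping.spiders']
NEWSPIDER_MODULE = 'dianping.spiders'

# ping.py calls random.choice(user_agent), so this must be a non-empty list of UA strings
user_agent = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
]

# Assumed anti-ban settings; dianping blocks aggressive crawlers quickly
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2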


2、items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DianpingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class shop(scrapy.Item):
    table = "shop"
    # Shop name
    shopName = scrapy.Field()
    # Shop photo
    shopThumb = scrapy.Field()
    # Shop detail page link
    shopUrl = scrapy.Field()
    # Number of reviews
    shopComment = scrapy.Field()
    # Average price per person
    shopAverage = scrapy.Field()
    # Taste score
    shopTaste = scrapy.Field()
    # Environment score
    shopView = scrapy.Field()
    # Service score
    shopService = scrapy.Field()

3、pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class DianpingPipeline(object):
    def process_item(self, item, spider):
        return item


class shopPipelines(object):
    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='39.106.8.123',  # MySQL server address
            port=3306,  # MySQL server port
            user='root',  # username
            passwd='[email protected]',  # password
            db='dzdp',  # database name
            charset='utf8'  # connection charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item.__class__.__name__)
        # print(item.table)
        # print(item.values())
        tablename = item.table
        print(tablename)
        keys = [x for x in item.keys()]
        keystr = ",".join(keys)
        values = [str(x) for x in item.values()]
        # One "%s" placeholder per column, joined by commas
        tag = ("%s," * len(keys))[0:-1]
        print(tag)
        sql = "INSERT INTO %s(%s) VALUES(%s) " % (tablename, keystr, tag)
        print(sql)
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.conn.close()
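
The pipeline builds an INSERT with one %s placeholder per item field and writes into the table named by item.table ("shop"), so that table must already exist in the dzdp database. The article does not include the schema; a minimal sketch with assumed column types (all text, column names matching the item fields) could be:

# create_table.py -- one-off helper; the schema is assumed, not part of the original article
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                       passwd='your_password', db='dzdp', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS shop (
            id INT AUTO_INCREMENT PRIMARY KEY,
            shopName VARCHAR(255),
            shopThumb VARCHAR(512),
            shopUrl VARCHAR(512),
            shopComment VARCHAR(32),
            shopAverage VARCHAR(32),
            shopTaste VARCHAR(32),
            shopView VARCHAR(32),
            shopService VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()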