欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

Python 爬取大众点评店铺评论

程序员文章站 2022-05-02 22:14:41
...

这是我自己写的代码,有可以优化的地方请不要藏着,要告诉我哦!

import parsel
import pymysql
from lxml import etree
import re
import requests
def download_data(url, cookie, timeout=10):
    """Download the obfuscated review page and the CSS/SVG files it refers to.

    Files written to the current working directory:

    * ``01 原始网页_加密.html`` -- the raw page fetched from ``url``.
    * ``02 css样式.css`` -- the first ``s3plus.meituan`` stylesheet linked
      from that page.
    * ``03 svg对照表<name>.svg`` -- one file per ``[class^="<name>"]`` rule
      in the stylesheet that carries a ``background-image`` URL.

    :param url: address of the review page to fetch.
    :param cookie: value sent as the ``Cookie`` request header.
    :param timeout: seconds to wait on each HTTP request, so that a stalled
        connection cannot hang the script indefinitely.
    :raises ValueError: if the page contains no matching stylesheet link.
    :raises requests.RequestException: on a network error or timeout.
    :return: None
    """
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }

    # Fetch the raw page and save it before any parsing, so the file exists
    # even when a later step fails.
    page_html = requests.get(url=url, headers=headers, timeout=timeout).text
    with open('01 原始网页_加密.html', 'w', encoding='utf-8') as f:
        f.write(page_html)

    # Locate and fetch the stylesheet; the href is protocol-relative ("//...").
    css_links = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', page_html)
    if not css_links:
        raise ValueError('no s3plus.meituan stylesheet link found in the downloaded page')
    css_response = requests.get('https:' + css_links[0], timeout=timeout).text
    with open('02 css样式.css', 'w', encoding='utf-8') as f:
        f.write(css_response)

    # Fetch every SVG lookup table referenced by the stylesheet. Each match
    # is a (class-name prefix, protocol-relative SVG address) pair.
    svg_rules = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css_response)
    for name, svg_path in svg_rules:
        svg_response = requests.get('https:' + svg_path, timeout=timeout).text
        with open(F'03 svg对照表{name}.svg', 'w', encoding='utf-8') as f:
            f.write(svg_response)
def *****_data():
    '''
    Build the mapping from obfuscated CSS class names to the characters
    they stand for, using the saved SVG lookup table and the saved stylesheet.

    NOTE(review): the asterisks in this function's name look like they were
    substituted by the site this code was copied from; as written the
    identifier is not valid Python and must be restored before the script
    can run.

    :return: dict mapping each matched CSS class name to a single character
    '''
    # The file name depends on which SVG files were downloaded; adjust it to match.
    with open('03 svg对照表zpd.svg', 'r', encoding='utf-8') as f:
        svg_html = f.read()
    sel = parsel.Selector(svg_html)
    texts = sel.css('textPath')
    paths = sel.css('path')
    # Maps each <path> id to the second space-separated token of its "d"
    # attribute (presumably the row's y coordinate -- confirm against the SVG).
    path_dict = {}
    for path in paths:
        path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]
    # Assumes the <path> ids are "1", "2", ... in the same order as the
    # <textPath> elements -- TODO confirm.
    count = 1
    zpd_svg_dict = {}  # links each y coordinate to the string of characters on that row
    for text in texts:
        zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
        count += 1
    print(zpd_svg_dict)

    with open('02 css样式.css', 'r', encoding='utf-8') as f:
        css_html = f.read()

    # The pattern must be adapted to the class prefix used in the CSS file.
    # NOTE(review): no re.VERBOSE flag is passed, so the newlines and
    # indentation inside this pattern are matched literally; it only finds
    # rules laid out exactly this way in the stylesheet.
    css_paths = re.findall(r'''
    \.(zpd.*?) {
        background: -(\d+)\.0px -(\d+)\.0px;
    \}
    ''', css_html)
    print(css_paths)
    last_map = {}
    for css_path in css_paths:
        css_name, x, y = css_path
        # Column index within the row; 14 is presumably the glyph width in px -- confirm.
        index = int(int(x) / 14)
        # Walk the rows in insertion order and take the first one whose key
        # is not smaller than y; this relies on the rows having been
        # inserted in ascending order.
        for i in zpd_svg_dict:
            if int(y) > int(i):
                pass
            else:
                last_map[css_name] = zpd_svg_dict[i][index]
                break
    return last_map

def decryption(last_map):
    """Return the saved review page with obfuscated glyph tags turned into text.

    Reads ``01 原始网页_加密.html`` from the current working directory and
    replaces every ``<svgmtsi class="NAME"></svgmtsi>`` tag with
    ``last_map[NAME]`` in a single pass. Tags whose class is missing from
    ``last_map`` are left in place unchanged, so an incomplete (or empty)
    mapping degrades the output instead of raising ``KeyError``.

    :param last_map: mapping from CSS class name to its replacement text.
    :return: the page HTML as a string, with all known tags substituted.
    """
    with open('01 原始网页_加密.html', 'r', encoding='utf-8') as f:
        ret = f.read()

    def _substitute(match):
        # Fall back to the whole original tag when the class is unknown.
        return last_map.get(match.group(1), match.group(0))

    # A callable replacement is used so that backslashes in the mapped text
    # are inserted literally rather than interpreted as regex escapes.
    return re.sub('<svgmtsi class="(.*?)"></svgmtsi>', _substitute, ret)
def write_data(ret):
    """Extract the reviews from the decoded page HTML and insert them into MySQL.

    For every review ``<li>`` the reviewer name, star score, review time,
    shop name and comment text are printed and inserted as one row into the
    ``dianping`` table of the local ``review`` database. Each row is
    committed individually, so rows stored before a failure are kept.

    :param ret: decoded page HTML as a string.
    :raises IndexError: if a review item lacks one of the expected elements.
    :return: None
    """
    # Not every review carries this <div>; dropping it keeps the positional
    # XPath expressions below valid for all items.
    ret = ret.replace(' <div class="richtitle">消费后评价</div>', '')
    etre = etree.HTML(ret)
    li_list = etre.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li')

    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider moving them to configuration.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='review',
                         charset='utf8mb4')
    sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'
    try:
        with db.cursor() as cursor:
            for li in li_list:
                name = li.xpath('./div[@class="main-review"]/div[1]/a/text()')[0].strip()
                score = re.findall('sml-rank-stars sml-str(.*?) star', li.xpath('./div[1]/div[2]/span[1]/@class')[0])[0].strip()
                # ".//" keeps the lookup inside this <li>. A leading "//"
                # searches the whole document, which would require a running
                # index that drifts as soon as the counts differ.
                review_time = li.xpath('.//div[@class="misc-info clearfix"]/span[1]/text()')[0].strip()
                shop_name = li.xpath('.//div[@class="misc-info clearfix"]/span[2]/text()')[0].strip()
                comment = ','.join(text.replace('\n', '').strip() for text in li.xpath('./div/div[4]/text()'))
                print(name, score, review_time, shop_name, comment)
                cursor.execute(sql, (name, score, review_time, shop_name, comment))
                db.commit()
    finally:
        # Always release the connection, even when a row fails to parse or insert.
        db.close()


if __name__ == '__main__':
    # Script entry point: download the page and its assets, build the
    # class-to-character map, decode the page, then store the reviews.
    # The cookie expires from time to time and must be replaced with a fresh one.
    cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
    url = 'http://www.dianping.com/shop/130096343/review_all' # review page of a single shop; can be replaced with another shop
    # NOTE(review): a failed download is only reported; the steps below still
    # run and read whatever files are already on disk.
    try:
        download_data(url,cookie)
    except Exception:
        print('出现验证码验证')# too many requests trigger a captcha check; it is not handled yet
    # Fall back to an empty mapping when the class-to-character map cannot be built.
    map_dict = {}
    try:
        # NOTE(review): the asterisks in this call look like a substitution
        # made by the site the code was copied from; the identifier is not
        # valid Python and must be restored to match the function definition.
        map_dict = *****_data()
    except Exception:
        print('css类属性发生变化')
    ret = decryption(map_dict)
    write_data(ret)
相关标签: python svg xpath