大众点评美食评论——“乞讨”型爬虫
大家都知道的,大众点评用了css反爬,脑壳疼。评论文字使用SVG替换。然后还需要登录才能查看全部评论,也就是要带cookie了,此外时不时跳验证码,验证码还有几种,这帮人是真的过分了,搬砖的何苦为难搬砖的呢。
'''function:从数据库取店铺id,去解析店铺下的评论'''
# -*- coding:utf-8 -*-
# Author : peng
from getter import DP2_get #改成你自己的get方法,怎么处理验证码之类的建议放在这里面
from redis import StrictRedis,ConnectionPool
import threading, time
import re
from pyquery import PyQuery
import os
import json
import random
# Shared Redis connection pool; work() builds its StrictRedis client from it.
# NOTE(review): the `***` values look like redacted placeholders — fill in the
# real port, db index and password before running.
pool = ConnectionPool(host='localhost', port=***, db=***, password=***)
def word_repair(s, index_dict, svg_dict, glyph_width=14):
    """Translate one obfuscated glyph class name into its real character.

    Args:
        s: Glyph class name; any ``@`` marker characters are stripped first.
        index_dict: Maps class name -> ``[x, y]`` background offsets (numbers
            or numeric strings) taken from the CSS file.
        svg_dict: Maps a row's y coordinate (string of an int) -> the string
            of characters drawn on that row of the SVG sprite.
        glyph_width: Horizontal pixel width of one glyph in the sprite.

    Returns:
        The decoded character, or ``''`` when the class name is unknown or
        its offsets fall outside the sprite.
    """
    s = re.sub(r'@', '', s)  # drop leftover separator markers
    try:
        coordinate = index_dict[s]
    except KeyError:
        print('error_______________', s)
        return ''
    x = float(coordinate[0])
    y = float(coordinate[1])

    # Sort rows numerically so the lookup does not depend on dict order.
    rows = sorted((int(key), key) for key in svg_dict)
    if not rows:
        print('error_______________', s)
        return ''

    # The glyph sits on the first row strictly below the offset; offsets past
    # the last row are clamped to it. Using "strictly below" also covers an
    # offset that equals a row height exactly.
    y_key = next((key for height, key in rows if height > y), rows[-1][1])
    x_pos = int(x / glyph_width)
    try:
        return svg_dict[y_key][x_pos]
    except IndexError:
        print('error_______________', s)
        return ''
def _parse_svg_rows(svg_text):
    """Map each text row's y coordinate (string) to the characters on that row."""
    # First known layout: <path> elements carry the heights, <textPath> the text.
    heights = re.findall(r'<path id="\d*" d="\w* (\d*?) \w*"/>', svg_text)
    texts = re.findall(r'<textPath xlink:href="\S*" textLength="\d*">(\S*)</textPath>', svg_text)
    rows = dict(zip(heights, texts))
    if not rows:
        # Second known layout: each <text> element carries both y and the text.
        rows = dict(re.findall(r'<text x="\d*" y="(\d*)">(\S*)</text>', svg_text))
    return rows


def get_index_dict(html, svg_dir='C:/Users/pengyong/Desktop/DZDP/svg_files/'):
    """Build the glyph lookup tables for a shop's review page.

    Args:
        html: ``response.text`` of the shop's review page.
        svg_dir: Directory where parsed SVG glyph tables are cached, one file
            per sprite class prefix.

    Returns:
        ``(index_dict, svg_dict)`` where ``index_dict`` maps a glyph class
        name to its ``[x, y]`` offsets and ``svg_dict`` maps a row height to
        the characters on that row. ``(None, None)`` is returned when the CSS
        link, the 14x24 sprite, or a download is missing, or when the SVG
        could not be parsed.
    """
    import ast  # local import: only needed to read the cache file

    css_links = re.findall(r'href="(//s3plus.meituan.net/v1.*?\.css)', html)
    if not css_links:
        return None, None
    url = 'http:' + css_links[0]
    print(url)  # CSS file link
    res = DP2_get(url)
    if res is None:
        return None, None

    # Each rule like ".abc{background:-14.0px -30.0px;}" gives one glyph offset.
    index_info = re.findall(r'\.(\w+?){background:-(.*?)px\s-(.*?)px;}', res.text)
    index_dict = {name: [x, y] for name, x, y in index_info}

    # Keep only the sprite used for review text (14px x 24px glyphs); when
    # several rules match, the last one wins.
    svg = None
    for each in re.findall(r'\[class\^="(\S*?)"]{(.*?)(//.*?\.svg)', res.text):
        if 'width: 14px' in each[1] and 'height: 24px' in each[1]:
            svg = each
    if svg is None:
        return None, None
    svg_url = 'http:' + svg[2]
    svg_name = svg[0]
    file_path = os.path.join(svg_dir, svg_name)

    if os.path.exists(file_path):
        # The cache stores repr(dict); literal_eval reads that format without
        # executing arbitrary code the way eval() would.
        with open(file_path, 'r', encoding='utf-8') as f:
            svg_dict = ast.literal_eval(f.read())
        return index_dict, svg_dict

    res2 = DP2_get(svg_url)
    if res2 is None:
        return None, None
    print(res2.url)
    svg_dict = _parse_svg_rows(res2.text)
    if not svg_dict:
        # Unknown SVG layout: report failure rather than caching an empty table.
        return None, None
    os.makedirs(svg_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(str(svg_dict))
    return index_dict, svg_dict
def save(s):
    """Append one review, followed by a newline, to ``DP_comment.txt``."""
    with open('DP_comment.txt', mode='a', encoding='utf-8') as sink:
        sink.write(s + '\n')
def comment_parse(html, index_dict, svg_dict):
    """Decode every review on a review page, then print and save each one.

    Args:
        html: Text of a review-list page.
        index_dict: Glyph class name -> ``[x, y]`` offsets.
        svg_dict: Row height -> characters on that row of the SVG sprite.

    Each ``<svgmtsi class="...">`` placeholder is replaced by the character
    that ``word_repair`` resolves for its class name.
    """
    divs = re.findall(r'<div class="review-words Hide">(.*?)<div', html, re.S)
    for div in divs:
        # Split on the glyph tags instead of injecting '#'/'@' sentinels, so
        # reviews that contain a literal '#' or '@' are not corrupted.
        # Even indexes are plain markup/text, odd indexes are the captures.
        pieces = re.split(r'<svgmtsi class="(.*?)"></svgmtsi>', div)
        for i, piece in enumerate(pieces):
            if i % 2:
                # The class name is the first quoted value of the tag.
                class_name = piece.split('"', 1)[0]
                pieces[i] = word_repair(class_name, index_dict, svg_dict)
            else:
                # Drop remaining tags and all whitespace from the plain text.
                pieces[i] = re.sub(r'<[^<]+?>|\s', '', piece)
        comment = ''.join(pieces)

        # Turn the escaped newline/space/slash/CR entities into line breaks,
        # then drop the semicolons that terminated them.
        comment = re.sub(r'&#?x(?:0A|20|2F|0D)', '\n', comment)
        comment = re.sub(r';', '', comment)
        comment = re.sub(r'^(\s*)\n', '', comment)  # strip a leading blank line
        print(comment)
        save(comment)
def _record_failed_id(shop_id):
    """Append a shop id that could not be crawled to ``error_record.txt``."""
    with open('error_record.txt', 'a') as f:
        # One id per line; without the newline consecutive ids run together.
        f.write(shop_id + '\n')


def work():
    """Pop shop ids from the Redis set ``DPids`` and crawl all their reviews.

    For each shop the first review page is fetched, the glyph tables are
    built from it, and every review page is decoded with ``comment_parse``.
    The loop ends when the set is empty or the glyph tables cannot be built;
    shops whose pages cannot be fetched are written to ``error_record.txt``.
    """
    redis = StrictRedis(connection_pool=pool)
    while True:
        shop_id = redis.spop('DPids')  # removes and returns one member
        if shop_id is None:
            # Empty set: nothing left to crawl.
            break
        if isinstance(shop_id, bytes):
            shop_id = shop_id.decode()

        url = 'http://www.dianping.com/shop/' + shop_id + '/review_all/p1'
        res = DP2_get(url)
        if not res:
            _record_failed_id(shop_id)
            continue

        index_dict, svg_dict = get_index_dict(res.text)  # offsets + glyph table
        if index_dict is None or svg_dict is None:
            # The id was already popped, so record it before giving up.
            _record_failed_id(shop_id)
            break
        comment_parse(res.text, index_dict, svg_dict)

        page_numbers = re.findall(
            r'<a href="/shop/.*?" data-pg="(\d*?)" class="PageLink"', res.text)
        # The pattern can capture an empty string; skip those before int().
        pages = [int(p) for p in page_numbers if p]
        max_page = max(pages, default=1)  # no pager means a single page

        for i in range(2, max_page + 1):
            href = 'http://www.dianping.com/shop/' + shop_id + '/review_all/p' + str(i)
            print(href)
            res2 = DP2_get(href)
            if res2:
                comment_parse(res2.text, index_dict, svg_dict)
            n = 15 + 20 * random.random()  # sleep 25 seconds on average
            time.sleep(n)
if __name__ == '__main__':
    # Kept as the hook for starting multiple threads, but multi-threading is
    # impractical here: a single thread already sleeps about 25 seconds per
    # request, login is required, and the account risks being banned.
    work()
'''function:获取美食店铺id'''
# -*- coding:utf-8 -*-
# Author : peng
from pyquery import PyQuery
from getter import DP_get
import time
import re
from redis import StrictRedis,ConnectionPool
# Redis connection pool and client; the crawl loop below adds shop ids
# to the 'DPids' set through `redis`.
# NOTE(review): the `***` values look like redacted placeholders — fill in the
# real port, db index and password before running.
pool = ConnectionPool(host='localhost', port=***, db=***, password=***)
redis = StrictRedis(connection_pool=pool)
# City slugs whose food listings are crawled.
areas = ['beijing','shanghai','guangzhou','shenzhen','tianjin','hangzhou','nanjing','suzhou','chengdu','wuhan','chongqing','xian','tokyo','seoul','bangkok','paris']
# Shop links on a category listing page; the capture group is the shop id.
id_comp = re.compile(r'"http://www.dianping.com/shop/(\d{4,})" data-click-name')
for area in areas:
    url = 'http://www.dianping.com/{}/food'.format(area)
    res = DP_get(url)
    if not res:
        continue
    doc = PyQuery(res.text)
    # Cuisine category links of the city's food landing page.
    label_a = doc('#J_nc_cooking > div > ul > li > a')
    for a in label_a.items():
        half_url = a.attr('href')  # e.g. /beijing/ch10/g2714
        if not half_url:
            # An anchor without href would make the concatenation below fail.
            continue
        # Walk listing pages 1..50 of this category.
        for i in range(1, 51):
            href2 = 'http://www.dianping.com' + half_url + 'p' + str(i)
            GetIdPageRes = DP_get(href2)  # category listing page response
            if not GetIdPageRes:
                continue
            shop_ids = id_comp.findall(GetIdPageRes.text)
            if shop_ids:
                # One round-trip per page; the set drops duplicates itself.
                redis.sadd('DPids', *shop_ids)
            for each in shop_ids:
                print(each)
此外,关于大众点评的字体替换,更烦,还好我目前不用搞。
思路是用 fontTools 解析 woff 文件,找到每个编码对应的“笔画”(即字形的轮廓坐标数据,示例见下方的 TTGlyph)。编码是会变的,也就是所谓的字体库换了,但笔画不会变,所以可以根据笔画来替换编码。至于第一次怎么确定笔画对应哪个汉字,我也很脑壳疼。。。。
<TTGlyph name="unieb30" xMin="0" yMin="-101" xMax="935" yMax="827">
<contour>
<pt x="447" y="158" on="1"/>
<pt x="447" y="93" on="1"/>
<pt x="803" y="93" on="1"/>
<pt x="803" y="158" on="1"/>
</contour>
<contour>
<pt x="655" y="625" on="1"/>
<pt x="608" y="596" on="1"/>
<pt x="669" y="539" on="0"/>
<pt x="705" y="489" on="1"/>
<pt x="754" y="524" on="1"/>
<pt x="720" y="568" on="0"/>
........................
</contour>
<instructions/>
</TTGlyph>
上一篇: Python 爬取大众点评店铺评论
下一篇: 属性动画简单的使用