Scraping Dianping (大众点评)
Create the Scrapy project from the command line, as shown below.
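For a project named dianping with a spider called ping (the names the code below assumes), the usual commands are:

scrapy startproject dianping
cd dianping
scrapy genspider ping dianping.com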
1、ping.py
# -*- coding: utf-8 -*-
import os
import random
import re

import requests
import scrapy
# from fake_useragent import UserAgent
# ua = UserAgent()
from lxml import etree

from dianping.items import shop
from dianping.settings import user_agent
from fontTools.ttLib import TTFont


class PingSpider(scrapy.Spider):
    name = 'ping'
    # allowed_domains takes bare domains, not URLs
    allowed_domains = ['dianping.com']
    start_urls = ['https://www.dianping.com/beijing/ch10']
    custom_settings = {
        'ITEM_PIPELINES': {'dianping.pipelines.shopPipelines': 301},
    }
    headers = {
        'lgtoken': '0897479c0-088d-4343-a98f-82bff8f7bf23',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': '_lxsdk_cuid=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _lxsdk=170b78cc76fc8-0a5b2ec62d67f4-4313f6a-144000-170b78cc76fc8; _hc.v=32cef2f6-b6e1-4f3f-52cc-f7f971670e42.1583627422; s_ViewType=10; ua=dpuser_8153713552; ctu=307ee90b2640a3fdfb7d5160e160f22434ac9fe3475d11d27394f1c12f34fba0; cy=3592; cye=zhongningxian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=170c46808f8-d46-74c-84b%7C%7C65',
        'Host': 'www.dianping.com',
        'Referer': 'http://www.dianping.com/zhongningxian/ch10',
        'User-Agent': random.choice(user_agent),
        'X-Requested-With': 'XMLHttpRequest'
    }
    cookies = {
        's_ViewType': '10',
        '_lxsdk_cuid': '16e48a565e1c8-04659e09b895fe-e343166-144000-16e48a565e373',
        '_lxsdk': '170c3969300-358-fb-b16%7C%7C140',
        '_hc.v': '43db2bd6-f5d5-fee9-59df-981a1d70742c.1573176830'
    }
    fontDir = {}
    def get_font(self, filePath):
        # Handle the obfuscated digits
        font = TTFont(filePath)
        font_names = font.getGlyphOrder()
        # These are the glyph names you see after opening the font file in a
        # font editor such as FontEditor; the first two entries are placeholders
        texts = ['', '', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        font_name = {}
        # Map each glyph's HTML entity (e.g. &#xf5e2;) to the digit it renders as
        for index, value in enumerate(texts):
            a = font_names[index].replace('uni', '&#x').lower() + ";"
            font_name[a] = value
        # print("wenzi", font_name)
        self.fontDir = font_name
    def getCss(self, css_url):
        # Fetch the CSS file and pull out the woff font it references
        try:
            r = requests.get(css_url)
            # print(r.text)
            r.encoding = 'utf-8'
            # The @font-face rule for the digits used in review counts
            numwoff = re.findall(r'@font-face{font-family: "PingFangSC-Regular-shopNum(.*?).woff', r.text, re.S)
            print(numwoff)
            # Save the font file locally
            self.savewoff(numwoff, "num.woff")
        except Exception as e:
            print("error", e)
    # Save the downloaded woff file to disk
    def savewoff(self, woff, filename):
        headers = {
            'User-Agent': random.choice(user_agent),
            'Connection': "keep-alive",
            'Host': "s3plus.meituan.net",
            'referer': 'http://www.dianping.com/',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        }
        result = ''
        for tmp in woff:
            result = result + tmp
        woff2 = re.findall(r'url\("//(.*?);', result, re.S)
        resultb = ''
        for a in woff2:
            resultb = resultb + str(a).replace('eot', 'woff').replace(')', '').replace('"', '')
        # print(resultb)
        url = "https://" + resultb
        response_woff = requests.get(url, headers=headers).content
        path = "static/woff/"
        # Make sure the target directory exists, then write the font file
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + filename, 'wb') as f:
            f.write(response_woff)
        self.get_font(path + filename)
    def start_requests(self):
        '''
        Override start_requests so the first request carries our headers and cookies
        :return:
        '''
        start_urls = 'http://www.dianping.com/beijing/ch10'
        yield scrapy.Request(url=start_urls, headers=self.headers, cookies=self.cookies, callback=self.parse)
    def parse(self, response):
        svgtextcss = re.search(r'href="([^"]+svgtextcss[^"]+)"', response.text, re.M)
        print(svgtextcss)
        if svgtextcss:
            css_url = "http:" + svgtextcss.group(1)
            print("extracted CSS link", css_url)
        else:
            print("no CSS link found, using a known fallback")
            css_url = "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/eb22f63af49ed95fc4b2b41cd112b91f.css"
        self.getCss(css_url)
        print("font mapping", self.fontDir)
        data = response.text
        # print(data)
        for key in self.fontDir:
            if key in data:
                # The entity from the raw page matches a key in the font dict,
                # so swap it for the real digit
                data = data.replace(key, str(self.fontDir[key]))
        htmldata = etree.HTML(data)
        li = htmldata.xpath("/html/body/div[@class='section Fix J-shop-search']/div[@class='content-wrap']/div[@class='shop-wrap']/div[@class='content']/div[@id='shop-all-list']/ul/li")
        print(li)
        for item in li:
            shopList = shop()
            # Shop name; xpath('.//div[@class="tit"]/a/@title')[0] also works
            shopList["shopName"] = item.xpath(".//a//h4/text()")[0]
            print(shopList["shopName"])
            # Shop photo
            shopList["shopThumb"] = item.xpath(".//div[@class='pic']//a//img/@data-src")[0]
            print(shopList["shopThumb"])
            # Shop link
            shopList["shopUrl"] = item.xpath(".//a/@href")[0]
            print("shop link", shopList["shopUrl"])
            # Review count
            shopList["shopComment"] = "".join(
                item.xpath(".//div[@class='txt']//div[@class='comment']//a[@class='review-num']/b//text()"))
            print("review count", shopList["shopComment"])
            # Average spend per person
            shopList["shopAverage"] = "".join(
                item.xpath(".//div[@class='txt']//div[@class='comment']/a[last()]/b//text()"))
            print("average spend", shopList["shopAverage"])
            # Taste score
            shopList["shopTaste"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[1]//b//text()"))
            print("taste", shopList["shopTaste"])
            # Environment score
            shopList["shopView"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[2]//b//text()"))
            print("environment", shopList["shopView"])
            # Service score
            shopList["shopService"] = "".join(
                item.xpath(".//div[@class='txt']//span[@class='comment-list']//span[3]//b//text()"))
            print("service", shopList["shopService"])
            # Hand the item to the pipeline for storage
            yield shopList
        # Check whether there is a next page
        next_page = htmldata.xpath("//div[@class='page']//a[@class='next']")
        if next_page:
            url = response.urljoin(next_page[0].xpath(".//@href")[0])
            print("next page", url)
            yield scrapy.Request(url, self.parse, headers=self.headers, cookies=self.cookies, dont_filter=True)
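Note that ping.py imports user_agent from dianping.settings, which the original post does not show. It is presumably just a pool of User-Agent strings for random.choice to pick from; a minimal sketch (the actual strings here are assumptions):

# settings.py (excerpt; sketch of the user_agent pool the spider expects)
user_agent = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
]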
2、items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DianpingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class shop(scrapy.Item):
    # Target table name, read by the pipeline
    table = "shop"
    # Shop name
    shopName = scrapy.Field()
    # Shop photo
    shopThumb = scrapy.Field()
    # Shop link
    shopUrl = scrapy.Field()
    # Review count
    shopComment = scrapy.Field()
    # Average spend per person
    shopAverage = scrapy.Field()
    # Taste score
    shopTaste = scrapy.Field()
    # Environment score
    shopView = scrapy.Field()
    # Service score
    shopService = scrapy.Field()
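The pipeline below leans on scrapy.Item behaving like a dict: keys() and values() return only the fields that have been populated, in matching order, and table is an ordinary class attribute. A quick illustration (the field values are made up):

item = shop(shopName='Some Restaurant', shopComment='1024')
print(item.table)           # 'shop'
print(list(item.keys()))    # ['shopName', 'shopComment']
print(list(item.values()))  # ['Some Restaurant', '1024']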
3、pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class DianpingPipeline(object):
    def process_item(self, item, spider):
        return item


class shopPipelines(object):
    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='39.106.8.123',   # MySQL server address
            port=3306,             # MySQL server port
            user='root',           # username
            passwd='[email protected]',  # password
            db='dzdp',             # database name
            charset='utf8'         # connection encoding
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Build the INSERT statement dynamically from the item's fields
        tablename = item.table
        print(tablename)
        keys = list(item.keys())
        keystr = ",".join(keys)
        values = [str(x) for x in item.values()]
        # One "%s" placeholder per column
        tag = ("%s," * len(keys))[0:-1]
        print(tag)
        sql = "INSERT INTO %s(%s) VALUES(%s)" % (tablename, keystr, tag)
        print(sql)
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.conn.close()
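The pipeline assumes a shop table already exists in the dzdp database, with one column per item field. A hypothetical one-off script that would create a matching table (column names are dictated by the item; the types and lengths are assumptions):

import pymysql

conn = pymysql.Connect(host='39.106.8.123', port=3306, user='root',
                       passwd='[email protected]', db='dzdp', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS shop (
            id INT AUTO_INCREMENT PRIMARY KEY,
            shopName VARCHAR(255),
            shopThumb VARCHAR(512),
            shopUrl VARCHAR(512),
            shopComment VARCHAR(32),
            shopAverage VARCHAR(32),
            shopTaste VARCHAR(32),
            shopView VARCHAR(32),
            shopService VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()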