# python 爬虫 爬取大众点评11月之星
# (Python crawler: scraping the Dianping "Stars of November" member list)
# 程序员文章站
# 2022-05-02 22:02:53
# ...
import requests
from bs4 import BeautifulSoup
import re
def getHTMLText(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    The response encoding is overridden with ``apparent_encoding`` (the
    encoding requests detects from the body) before the text is decoded.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled server.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        error, timeout, invalid URL, or a non-2xx status code).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses (4xx/5xx) into exceptions so they are
        # reported to the caller as an empty string as well.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are treated as "no page"; unrelated
        # errors (and Ctrl-C) are no longer swallowed.
        return ""
def getStockList(lst, stockURL, city_lst):
    """Collect the member-list links found on the index page.

    Downloads *stockURL*, finds every element whose ``href`` matches
    ``/memberlist/star/1711/`` and appends the results to the two output
    lists in lockstep, so ``lst[i]`` and ``city_lst[i]`` describe the same
    link.

    Args:
        lst: Output list; receives the ``href`` value of each matching link.
        stockURL: URL of the index page to scan.
        city_lst: Output list; receives the text of each matching link
            (callers use it as the city name).

    Returns:
        None. Both lists are modified in place. If the page cannot be
        downloaded, nothing is appended.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Raw string so that "\d" is passed to the regex engine untouched instead
    # of being an invalid string escape. The trailing lazy "\d*?" may match
    # zero digits, so this is effectively a match on the path prefix.
    link_pattern = re.compile(r"/memberlist/star/1711/\d*?")
    for link in soup.find_all(href=link_pattern):
        try:
            href = link.attrs['href']
        except KeyError:
            continue
        city = link.text
        # Append only after both values are known, so the two lists can
        # never get out of step with each other.
        lst.append(href)
        city_lst.append(city)
def getStockInfo(lst, fpath, cst):
    """Download every member-list page and append its contents to *fpath*.

    For each page, every ``<h4>`` element is read as one entry: the text of
    its ``<a>`` child is the name and the text of its ``<span>`` child is a
    number. One line per page is appended to the output file in the form
    ``<city>:<dict items>``. A progress percentage is printed after each
    page, and the number of pages is printed at the end.

    Args:
        lst: Page paths (appended to ``http://www.dianping.com``).
        fpath: Path of the UTF-8 text file to append the results to.
        cst: City names, parallel to *lst* (``cst[i]`` belongs to ``lst[i]``).

    Returns:
        None.

    Notes:
        * Pages that cannot be downloaded are skipped, but still count
          towards the progress and keep their own city name; later pages
          are therefore never written under the wrong city.
        * An ``<h4>`` without the expected ``<a>``/``<span>`` children or
          with a non-numeric counter is skipped on its own instead of
          discarding the whole page.
        * A failure to write the output file is reported and the loop
          continues with the next page.
    """
    import ast  # local import: only needed to parse the scraped counters

    total = len(lst)
    # zip() ties each city name to its own page, independent of how many
    # earlier pages failed to download.
    for count, (stock, city) in enumerate(zip(lst, cst), start=1):
        url = "http://www.dianping.com" + stock
        html = getHTMLText(url)
        if html != "":
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            for n in soup.find_all('h4'):
                try:
                    name = n.find('a').text
                    # NOTE(review): the original ran eval() on this scraped
                    # text, which executes arbitrary code from the remote
                    # page. literal_eval accepts the same plain literals
                    # (e.g. "12" or "(12)") without executing anything.
                    number = int(ast.literal_eval(n.find('span').text.strip()))
                except (AttributeError, ValueError, SyntaxError, TypeError):
                    # Missing <a>/<span> child or a counter that is not a
                    # number: not a usable entry.
                    continue
                infoDict[name] = number
            try:
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(city) + ":" + str(infoDict.items()) + '\n')
            except OSError as err:
                print("\nfailed to write {}: {}".format(fpath, err))
        print("\r当前速度:{:.2f}%".format(count * 100 / total), end="")
    print(len(lst))
def main(stock_list_url='http://www.dianping.com/memberlist/star/1711/2',
         output_file='E:/Dazhongdianping.txt'):
    """Run the scrape: collect the page links, then dump every page.

    Args:
        stock_list_url: Index page that links to the individual member-list
            pages. Defaults to the URL the script was written for.
        output_file: File the results are appended to. The default is a
            Windows drive path; pass another path on other systems.

    Returns:
        None.
    """
    slist = []  # page paths found on the index page
    clist = []  # link texts (city names), parallel to slist
    getStockList(slist, stock_list_url, clist)
    getStockInfo(slist, output_file, clist)
# Start the scrape only when the file is executed as a script, so that
# importing it (e.g. to reuse getHTMLText) does not trigger network access.
if __name__ == "__main__":
    main()