欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

Python 爬虫:大众点评美食排名

程序员文章站 2022-05-02 22:15:05
...
import requests
from bs4 import BeautifulSoup
import re

def getHTMLText(url, timeout=10):
    """Fetch *url* with HTTP GET and return the response body as text.

    The body is decoded using the encoding detected from the content
    (``apparent_encoding``) instead of the one announced in the headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        The decoded body, or ``""`` when the request fails (connection
        error, timeout, invalid URL) or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat an empty string as "no page".
        # Only request failures are absorbed; programming errors and
        # KeyboardInterrupt still propagate.
        return ""

def getStockList(lst, stockURL, city_lst, infodict):
    """Scrape one listing page and record its names and addresses.

    The page at *stockURL* is downloaded with ``getHTMLText`` and parsed.
    Names are taken from the text of the ``<h4>`` elements, leaving out the
    first two and the last two. Addresses are taken from ``<a>`` elements
    whose ``href`` matches the region-link pattern: the first two
    whitespace-separated tokens of the link text, concatenated.

    Args:
        lst: List that receives the names found on this page (appended).
        stockURL: URL of the listing page to scrape.
        city_lst: List that receives the addresses found on this page
            (appended).
        infodict: Dict updated with ``name -> address`` pairs. The n-th
            matching link is paired with the n-th name added by this call.

    Returns:
        None. All results are delivered by mutating the arguments.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    headings = soup.find_all("h4")
    # Raw string so ``\d`` is a regex digit class, and escaped dots so they
    # match a literal "." rather than any character.
    link_pattern = re.compile(
        r"http://www\.dianping\.com/search/category/33/0/r\d{4}"
    )
    links = soup.find_all("a", href=link_pattern)
    print(headings, links)

    # ``lst`` may already hold names from earlier pages; remember where this
    # page's names start so addresses are paired with the right entries.
    first_new = len(lst)
    for heading in headings[2:len(headings) - 2]:
        name = heading.text
        lst.append(name)
        print(name)

    for offset, link in enumerate(links):
        parts = link.text.split()
        if len(parts) < 2:
            # Link text has fewer than two tokens: no address to build.
            continue
        address = parts[0] + parts[1]
        city_lst.append(address)

        name_index = first_new + offset
        if name_index >= len(lst):
            # More links than names on this page: nothing to pair with.
            continue
        infodict[lst[name_index]] = address
        print(address)



def main():
    """Scrape listing pages 1-50 and append one result line per page.

    Each page is processed with ``getStockList`` and its ``name -> address``
    items are written as a single line to the output file, which is opened
    in append mode with UTF-8 encoding. A percentage is printed to the
    console after every page.
    """
    first_page_url = 'http://www.dianping.com/search/category/33/10/r3300'
    output_file = 'E:/dzdpmspm.txt'
    last_page = 50

    page_urls = [first_page_url]
    page_urls += [
        "http://www.dianping.com/search/category/33/10/r3300p" + str(n)
        + "?aid=91959818%2C93071129"
        for n in range(2, last_page + 1)
    ]

    # Open the file once instead of once per page.
    with open(output_file, 'a', encoding='utf-8') as f:
        for n, url in enumerate(page_urls, start=1):
            # Fresh containers per page: getStockList pairs addresses with
            # names by position, so page-local lists keep that pairing
            # inside the page, and each output line holds only this page's
            # entries rather than a repeat of everything collected so far.
            names, addresses, page_info = [], [], {}
            getStockList(names, url, addresses, page_info)
            f.write(str(page_info.items()) + '\n')

            print("\r当前速度:{:.2f}%".format(n*100/last_page), end="")
# Script entry point: starts the scrape as soon as the module is loaded.
# NOTE(review): there is no ``__main__`` guard, so importing this file
# also runs the scrape.
main()