Python crawler: scrape the merchants in every administrative district on Dianping and save the results to Excel

The script below starts from a Dianping food-listing page, collects the listing URL of each administrative district, walks up to 50 listing pages per district, fetches every shop's detail page, and writes each shop's name, scores, average price, review count, and address to an .xls file with xlwt.

import xlwt
import requests
from bs4 import BeautifulSoup

# Note: a request may come back 403 Forbidden right away, which means
# access was denied -- Dianping rejects requests that do not look like
# they come from a real browser.
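# A minimal sketch of surfacing that failure instead of silently parsing
# an error page (the helper name is ours, not part of the original script;
# Response.status_code is the standard requests attribute):
#   def get_content_checked(url, headers=None):
#       response = requests.get(url, headers=headers, timeout=10)
#       if response.status_code == 403:
#           raise RuntimeError('403 Forbidden for ' + url)
#       return response.content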

# Entry page the crawl starts from (city id 344 is Changsha,
# matching the cy/cye values in the Cookie below)
start_url = 'https://www.dianping.com/search/category/344/10'

def get_content(url, headers=None):
    response = requests.get(url, headers=headers)  # one HTTP GET
    html = response.content
    return html

'''
Collect the listing URL of every administrative district
'''
def region_url(html):
    soup = BeautifulSoup(html, 'lxml')  # use the lxml parser
    # The district nav looks like:
    #   <div id="region-nav" class="nc-items ">
    #     <a href="/search/category/344/10/r299"><span>芙蓉区</span></a>
    # Turn each relative href into an absolute URL with a list comprehension
    base_url = 'https://www.dianping.com'
    region_url_list = [base_url + i['href'] for i in soup.find('div', id="region-nav").find_all('a')]
    return region_url_list
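# For the sample <a> above, the returned list starts with an entry like
# (the district id r299 is taken from the inline sample; the remaining
# districts follow the same pattern):
#   ['https://www.dianping.com/search/category/344/10/r299', ...]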

# Collect the detail-page URL of every shop on a listing page.
# find() returns the first match (a single element, or None when nothing
# matches); find_all() returns every match as a list ([] when nothing matches).
def get_shop_url(html):
    base_url = 'https://www.dianping.com'
    soup = BeautifulSoup(html, 'lxml')
    shop_url_list = [base_url + i.find('a')['href'] for i in soup.find_all('div', class_='tit')]
    return shop_url_list
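# A quick illustration of the find/find_all contract on a hypothetical
# one-shop snippet (not part of the crawl itself):
#   s = BeautifulSoup('<div class="tit"><a href="/shop/1">x</a></div>', 'lxml')
#   s.find('div', class_='tit').find('a')['href']   # -> '/shop/1'
#   s.find('span')                                  # -> None  (no match)
#   s.find_all('span')                              # -> []    (no match)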

# Parse a shop detail page and pull out the fields we keep
# (name, scores, average price, review count, address)
def get_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    # Shop name, e.g. <h1 class="shop-name">1911牛肉烤串</h1>
    title = soup.find('div', class_='breadcrumb').find('span').text
    # Average price, e.g. <span id="avgPriceTitle" class="item">人均:-</span>
    price = soup.find('span', id="avgPriceTitle").text
    # Scores, e.g. <span id="comment_score"><span class="item">口味:7.6</span>
    # <span class="item">环境:7.4</span><span class="item">服务:7.5</span></span>
    evaluation = soup.find('span', id="comment_score").find_all('span', class_="item")  # the three score spans
    # Review count, e.g. <span id="reviewCount" class="item">3条评论</span>
    comments = soup.find('span', id="reviewCount").text
    # Address, e.g.
    # <div class="expand-info address" itemprop="street-address">
    #     <span class="item" itemprop="street-address" title="麓松路南丰港安置小区12栋">
    #         麓松路南丰港安置小区12栋
    #     </span>
    # </div>
    address = soup.find('span', class_="item", itemprop="street-address").text.strip()

    # Debug output, kept commented out:
    # print('name:    ' + title)
    # for ev in evaluation:
    #     print(ev.text)
    # print('price:   ' + price)
    # print('reviews: ' + comments)
    # print('address: ' + address)
    return (title, evaluation[0].text, evaluation[1].text, evaluation[2].text, price, comments, address)
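# For the sample snippets above, get_detail returns a 7-tuple along the
# lines of (values taken from the inline HTML examples; the order matches
# the header row written at the bottom of the script):
#   ('1911牛肉烤串', '口味:7.6', '环境:7.4', '服务:7.5', '人均:-', '3条评论', '麓松路南丰港安置小区12栋')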
  



# Runs only when the file is executed as a script, not when it is
# imported from another module.
if __name__ == '__main__':
    items = []
    # browser-like headers; the Cookie is a captured session -- replace
    # it with your own if requests start failing
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie':'_hc.v=dd67ff67-20d0-6e83-7f61-ce93e4d46539.1503387665; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_cuid=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; _lxsdk=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; __utma=205923334.211352043.1503391484.1503391484.1503391484.1; __utmz=205923334.1503391484.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=29bc50ef1530ab64cbaa69b29cad64f39a_51868%3A1; s_ViewType=10; JSESSIONID=A49EED22A236962EA3506BA888799402; aburl=1; cy=344; cye=changsha; PHOENIX_ID=0a010918-15e0a223263-d4c1a92; __mta=146625163.1503391361571.1503401588676.1503408592089.10; _lxsdk_s=15e0a219034-38-9d5-acb%7C%7C37'
        }
    html = get_content(start_url, headers)
    region_url_list = region_url(html)
    # Walk every shop in every district
    for url in region_url_list:  # each administrative district
        # crude error handling: skip the whole district on any error
        try:
            for n in range(1, 51):  # listing pages 1..50
                html = get_content(url + 'p' + str(n), headers)
                # detail-page URLs of every shop on this listing page
                shop_url_list = get_shop_url(html)
                for shop_url in shop_url_list:
                    # print(shop_url)
                    # fetch the detail page and extract its fields
                    detail_html = get_content(shop_url, headers)
                    '''
                    403 Forbidden (access denied) shows up in two ways:
                      (1) immediately, on the very first request;
                      (2) after crawling for a while -- that ban can be
                          worked around with proxy IPs.
                    Request headers that matter here: Referer (hotlink
                    protection), Host, and Cookie.
                    '''
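                    # A minimal sketch of the proxy workaround (the address
                    # is hypothetical; substitute a real proxy pool).
                    # requests supports this via its proxies parameter:
                    #   proxies = {'http': 'http://10.0.0.1:8080',
                    #              'https': 'http://10.0.0.1:8080'}
                    #   requests.get(shop_url, headers=headers,
                    #                proxies=proxies, timeout=10)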
                    items.append(get_detail(detail_html))
        except Exception:
            continue
    new_table = r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('test1')
    headData = ['shop name', 'taste score', 'environment score',
                'service score', 'avg price', 'review count', 'address']
    # bold header row
    for colnum in range(0, 7):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font:bold on'))
    # data rows start at row 1, directly below the header
    for j, item in enumerate(items):
        for i in range(0, 7):
            ws.write(j + 1, i, item[i])

    wb.save(new_table)
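
To sanity-check the output, the sheet can be read back with xlrd, the
companion reader to xlwt (a quick sketch, assuming xlrd is installed;
the path is the one the script saves to):

import xlrd

wb = xlrd.open_workbook(r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls')
ws = wb.sheet_by_index(0)
print(ws.nrows, 'rows written')   # header row + one data row per shop
print(ws.row_values(0))           # the header labels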