python爬虫——按城市及店铺名爬取大众点评分类
程序员文章站
2022-05-02 22:02:47
...
题外话:因为最近遇到很多标签需要进行分类,其中很多是店铺名,所以就想爬取大众点评的分类信息。因为不是专门做爬虫的,所以下面这段代码仅仅是可以实现要求,至于如何避免网站的反爬机制,这一点就无能为力了。另外大众点评根据店铺名返回的分类结果也不一定完全正确,这里没有处理这种情况。
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse
import re
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
import selenium
def getCityURL(city):
    """Look up the Dianping landing-page URL for a city.

    Downloads the Dianping city list page and scans every anchor with the
    classes ``link onecity`` for one whose text contains ``city``.

    Args:
        city: City name (or a fragment of it) to look for in the link text.

    Returns:
        The absolute URL of the first matching city link, or ``None`` when
        no link matches.
    """
    baseURL = 'http://www.dianping.com/citylist'
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    request = urllib.request.Request(baseURL, headers=headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')

    soup = BeautifulSoup(content, 'lxml')
    for item in soup.find_all('a', {'class': 'link onecity'}):
        href = item.get('href')
        if not href:
            continue
        # get_text() never returns None, unlike .string, which is None for
        # anchors that contain nested markup.
        if city in item.get_text():
            # The original code stripped a leading '//' and prepended
            # 'http://'; urljoin resolves such protocol-relative links the
            # same way and also copes with absolute or relative hrefs.
            return urllib.parse.urljoin(baseURL, href)
    return None
def searchShop(cityURL, shop_name, wait_seconds=20):
    """Search a shop on a Dianping city page and return its category path.

    Opens ``cityURL`` in Firefox, types ``shop_name`` into the search box and
    clicks the search button. Every window other than the initial one is then
    inspected after waiting ``wait_seconds`` seconds.

    Args:
        cityURL: URL of the city page to start from.
        shop_name: Shop name to search for.
        wait_seconds: Seconds to sleep after switching to a result window
            before reading it (defaults to the original fixed 20 seconds).

    Returns:
        ``"not found"`` when the page shows a ``not-found-right`` element or
        the first ``h4`` of the shop list does not contain ``shop_name``;
        otherwise the texts of the ``span`` elements under ``nc-items``
        joined with ``'_'``. An empty string is returned when no window
        other than the initial one exists. If several extra windows exist,
        the result of the last one inspected wins.
    """
    # Imported locally so this block does not need a new file-level import.
    # find_element(By.X, ...) works on both Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    browser = webdriver.Firefox()
    try:
        browser.get(cityURL)
        normal_window = browser.current_window_handle
        search_input = browser.find_element(By.ID, 'J-search-input')
        search_input.send_keys(shop_name)
        search_click = browser.find_element(By.ID, 'J-all-btn')
        search_click.click()

        res = ''
        for pay_window in browser.window_handles:
            if pay_window == normal_window:
                continue
            browser.switch_to.window(pay_window)
            sleep(wait_seconds)

            if browser.find_elements(By.CLASS_NAME, 'not-found-right'):
                res = "not found"
                continue

            shop_list = browser.find_element(By.ID, 'shop-all-list')
            tmp_name = shop_list.find_element(By.XPATH, './/h4').text
            if shop_name not in tmp_name:
                res = "not found"
                continue

            tmp = browser.find_element(By.CLASS_NAME, 'J_filter_channel')
            tmp_cls = tmp.find_element(By.CLASS_NAME, 'nc-items')
            res = '_'.join(
                item.text for item in tmp_cls.find_elements(By.XPATH, './/span')
            )
        return res
    finally:
        # Always shut the browser down, even when an element lookup raises,
        # so no Firefox/geckodriver process is left behind.
        browser.quit()
if __name__ == '__main__':
    # Demo run: resolve the city page for Shanghai, then look up one shop.
    cityURL = getCityURL('上海')
    if cityURL is None:
        # getCityURL returns None when no city link matches; stop here
        # instead of launching a browser with an invalid URL.
        raise SystemExit('city not found in the Dianping city list')
    cls = searchShop(cityURL, '天真蓝')
    print(cls)