爬取百度口碑企业标签分类
程序员文章站
2022-05-02 17:37:22
...
爬虫测试代码,主要通过selenium,bs4等模块完成爬虫。
import datetime
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Crawl Baidu Koubei (koubei.baidu.com) company rankings: walk the paginated
# ranking list in a Selenium-driven Chrome, collect each company's detail-page
# URL, then fetch every detail page with requests and print the company name,
# industry ("trade") and business address parsed out with BeautifulSoup.
#
# NOTE(review): the detail pages are fetched with plain requests.get, so this
# assumes those fields are rendered server-side — confirm, since the ranking
# list itself clearly needs a browser to render.

RANK_URL = "https://koubei.baidu.com/rank?tid=1702"
COMPANIES_PER_PAGE = 10   # each ranking page shows 10 companies (div[1..10])
TOTAL_PAGES = 470         # hard-coded page count of this ranking category
PAGE_LOAD_WAIT = 2        # seconds to let the SPA re-render after a click


def _scrape_company_detail(url):
    """Fetch one company detail page and print its name, trade and address.

    Raises requests.RequestException on network failure; the caller decides
    whether that is fatal.
    """
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    for span in soup.find_all("span", class_=re.compile("compname-txt")):
        print(span.text)
    for p in soup.find_all("p", class_=re.compile("right trade")):
        print(p.text)
    for p in soup.find_all("p", class_=re.compile("right businessaddr")):
        print(p.text)


def _collect_visible_companies(browser, links):
    """Read the company links on the currently displayed ranking page,
    append each href to *links*, and scrape its detail page (best-effort)."""
    for slot in range(1, COMPANIES_PER_PAGE + 1):
        xpath = '//*[@id="app"]/div/div[2]/div[3]/div[' + str(slot) + ']/a'
        try:
            href = browser.find_element(By.XPATH, xpath).get_attribute('href')
        except NoSuchElementException:
            continue  # short page: fewer than 10 entries visible
        links.append(href)
        try:
            _scrape_company_detail(href)
        except requests.RequestException as exc:
            # Best-effort crawl: log and keep going instead of aborting
            # (the original silently swallowed every error here).
            print('failed to fetch', href, exc)


def _next_button_index(page_no):
    """Position of the "next page" <li> in the pagination bar.

    The bar gains entries as more page numbers become visible, so the
    button's index shifts: li[9] while on pages 1-3, li[10] on page 4,
    li[11] from page 5 onward (mirrors the original hard-coded logic).
    """
    if page_no > 4:
        return 11
    if page_no > 3:
        return 10
    return 9


def _crawl():
    """Drive the browser over all ranking pages; return the collected hrefs."""
    links = []
    browser = webdriver.Chrome()
    try:
        browser.get(RANK_URL)
        browser.maximize_window()
        time.sleep(PAGE_LOAD_WAIT)
        _collect_visible_companies(browser, links)  # page 1
        for page_no in range(1, TOTAL_PAGES):
            time.sleep(PAGE_LOAD_WAIT)
            # Selenium 4 API: find_element_by_xpath was removed.
            browser.find_element(
                By.XPATH,
                '//*[@id="app"]/div/div[2]/div[3]/ul/li['
                + str(_next_button_index(page_no)) + ']',
            ).click()
            _collect_visible_companies(browser, links)
    finally:
        browser.quit()  # fix: the original leaked the Chrome session
    return links


# Keep the module-level name the original script exposed.
alist = _crawl()
推荐阅读