爬取百度口碑企业标签分类
程序员文章站
2022-05-02 17:37:22
...
爬虫测试代码,主要通过selenium,bs4等模块完成爬虫。
import datetime
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Crawl Baidu Koubei (koubei.baidu.com) company rankings: walk the paginated
# ranking list in a Selenium-driven Chrome, collect each company's detail-page
# URL, then fetch every detail page with requests and print the company name,
# industry ("trade") and business address parsed out with BeautifulSoup.
#
# NOTE(review): the detail pages are fetched with plain requests.get, so this
# assumes those fields are rendered server-side — confirm, since the ranking
# list itself clearly needs a browser to render.

RANK_URL = "https://koubei.baidu.com/rank?tid=1702"
COMPANIES_PER_PAGE = 10   # each ranking page shows 10 companies (div[1..10])
TOTAL_PAGES = 470         # hard-coded page count of this ranking category
PAGE_LOAD_WAIT = 2        # seconds to let the SPA re-render after a click


def _scrape_company_detail(url):
    """Fetch one company detail page and print its name, trade and address.

    Raises requests.RequestException on network failure; the caller decides
    whether that is fatal.
    """
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    for span in soup.find_all("span", class_=re.compile("compname-txt")):
        print(span.text)
    for p in soup.find_all("p", class_=re.compile("right trade")):
        print(p.text)
    for p in soup.find_all("p", class_=re.compile("right businessaddr")):
        print(p.text)


def _collect_visible_companies(browser, links):
    """Read the company links on the currently displayed ranking page,
    append each href to *links*, and scrape its detail page (best-effort)."""
    for slot in range(1, COMPANIES_PER_PAGE + 1):
        xpath = '//*[@id="app"]/div/div[2]/div[3]/div[' + str(slot) + ']/a'
        try:
            href = browser.find_element(By.XPATH, xpath).get_attribute('href')
        except NoSuchElementException:
            continue  # short page: fewer than 10 entries visible
        links.append(href)
        try:
            _scrape_company_detail(href)
        except requests.RequestException as exc:
            # Best-effort crawl: log and keep going instead of aborting
            # (the original silently swallowed every error here).
            print('failed to fetch', href, exc)


def _next_button_index(page_no):
    """Position of the "next page" <li> in the pagination bar.

    The bar gains entries as more page numbers become visible, so the
    button's index shifts: li[9] while on pages 1-3, li[10] on page 4,
    li[11] from page 5 onward (mirrors the original hard-coded logic).
    """
    if page_no > 4:
        return 11
    if page_no > 3:
        return 10
    return 9


def _crawl():
    """Drive the browser over all ranking pages; return the collected hrefs."""
    links = []
    browser = webdriver.Chrome()
    try:
        browser.get(RANK_URL)
        browser.maximize_window()
        time.sleep(PAGE_LOAD_WAIT)
        _collect_visible_companies(browser, links)  # page 1
        for page_no in range(1, TOTAL_PAGES):
            time.sleep(PAGE_LOAD_WAIT)
            # Selenium 4 API: find_element_by_xpath was removed.
            browser.find_element(
                By.XPATH,
                '//*[@id="app"]/div/div[2]/div[3]/ul/li['
                + str(_next_button_index(page_no)) + ']',
            ).click()
            _collect_visible_companies(browser, links)
    finally:
        browser.quit()  # fix: the original leaked the Chrome session
    return links


# Keep the module-level name the original script exposed.
alist = _crawl()
推荐阅读