欢迎您访问程序员文章站!本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

爬取百度口碑企业标签分类

程序员文章站 2022-05-02 17:37:22
...

爬虫测试代码,主要通过 selenium、bs4 等模块完成爬取。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime


# Collected company detail-page URLs (appended to by the pagination loop below).
alist = []

# Launch Chrome and open the Baidu Koubei ranking page (category tid=1702).
browser = webdriver.Chrome()
browser.get("https://koubei.baidu.com/rank?tid=1702")
browser.maximize_window()
time.sleep(2)  # crude render wait; WebDriverWait is imported and would be more robust

# Scrape the 10 company entries on the first ranking page.
for i in range(1, 11):
    # find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...).
    a = browser.find_element(
        By.XPATH, '//*[@id="app"]/div/div[2]/div[3]/div[' + str(i) + ']/a'
    ).get_attribute('href')
    alist.append(a)
    # Fetch the company detail page and parse it with BeautifulSoup.
    start_html = requests.get(a).content
    soup = BeautifulSoup(start_html, "lxml")
    # Print the company name, industry and business address found on the page.
    for comname in soup.find_all("span", class_=re.compile("compname-txt")):
        print(comname.text)
    for hy in soup.find_all("p", class_=re.compile("right trade")):
        print(hy.text)
    for address in soup.find_all("p", class_=re.compile("right businessaddr")):
        print(address.text)
# Page through the remaining ranking pages and scrape each one.
for page_no in range(1, 470):
    time.sleep(2)  # let the new page render before clicking the pager
    # The "next page" button shifts position as more page numbers appear
    # in the pager: li[9] on the first pages, then li[10], then li[11].
    next_btn = 9
    if page_no > 3:
        next_btn = 10
    if page_no > 4:
        next_btn = 11
    # find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...).
    browser.find_element(
        By.XPATH, '//*[@id="app"]/div/div[2]/div[3]/ul/li[' + str(next_btn) + ']'
    ).click()
    # Scrape the 10 entries on the current page. A distinct loop variable is
    # used so the outer pagination counter is not shadowed.
    for row in range(1, 11):
        try:
            a = browser.find_element(
                By.XPATH,
                '//*[@id="app"]/div/div[2]/div[3]/div[' + str(row) + ']/a'
            ).get_attribute('href')
            alist.append(a)
            start_html = requests.get(a).content
            soup = BeautifulSoup(start_html, "lxml")
            for comname in soup.find_all("span", class_=re.compile("compname-txt")):
                print(comname.text)
            for hy in soup.find_all("p", class_=re.compile("right trade")):
                print(hy.text)
            for address in soup.find_all("p", class_=re.compile("right businessaddr")):
                print(address.text)
        except Exception:
            # Best-effort: a missing row (e.g. a short last page) or a failed
            # request shouldn't abort the whole crawl. The bare except of the
            # original would also have swallowed KeyboardInterrupt.
            continue

 

相关标签: selenium bs4