Crawling Baidu Wenku with Python and Selenium
References
- Selenium official tutorial

Notes
- The desktop page source does not contain the document content; this method works with the mobile version of Baidu Wenku.
- When a document has multiple pages, the "continue reading" and "load more" buttons need to be clicked. When locating these two elements, I first tried the method from one blogger's post, which failed because Wenku's page source had changed. Inspired by another blogger's post, I then got the "continue reading" click working, but the same approach hit an element-overlap problem on "load more"; see the code and the sketch below for details.
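The overlap problem usually shows up as an ElementClickInterceptedException when element.click() is called directly. Here is a minimal sketch of the JavaScript-click fallback that juge() below relies on (safe_click is my own helper name, not part of the original script):

from selenium.common.exceptions import ElementClickInterceptedException

def safe_click(driver, element):
    # Try a normal click first; if another element covers the target
    # and intercepts the click, fall back to a JavaScript click.
    try:
        element.click()
    except ElementClickInterceptedException:
        driver.execute_script("arguments[0].click();", element)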
# -*- coding:UTF-8 -*-
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Open the page with a mobile user agent so the document text appears in the source
def connect():
    options = webdriver.ChromeOptions()
    options.add_argument(
        'user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, '
        'like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"')
    # Raw string so the backslashes in the Windows path are not treated as escapes
    driver = webdriver.Chrome(executable_path=r'D:\chromedriver_win32\chromedriver.exe', chrome_options=options)
    driver.get('https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html')
    return driver
# Locate and click "continue reading"
def locate(driver):
    wait = WebDriverWait(driver, 10)
    target = wait.until(EC.presence_of_element_located((
        By.XPATH, '//*[@class="foldpagewg-root"]'
    )))  # area containing the "continue reading" button
    driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll the button into view
    butt = wait.until(EC.presence_of_element_located((
        By.XPATH, '//*[@class="foldpagewg-root"]/div/div/div'
    )))
    butt.click()  # click "continue reading"
    time.sleep(1)  # give the unfolded pages a moment to render
    htmlplus = driver.page_source  # grab the page source
    return htmlplus
# Check whether there is more content; the trigger is the "load more" button
def juge(driver):
    '''
    # Tried the same approach as in locate(driver); it failed because the
    # target element is covered by another element.
    wait = WebDriverWait(driver, 10)
    target = wait.until(EC.presence_of_element_located((
        By.XPATH, '//*[@class="pagerwg-root"]'
    )))  # area containing the "load more" button
    driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll the button into view
    Butt = wait.until(EC.presence_of_element_located((
        By.XPATH, '//*[@class="pagerwg-root"]'
    )))
    Butt.click()  # click "load more"
    time.sleep(1)
    '''
    element = driver.find_element_by_xpath("//div[@class='pagerwg-button']")
    # element = driver.find_element_by_css_selector('div[class*="pagerwg-button"]')
    # A JavaScript click works even though the button is covered by another element
    driver.execute_script("arguments[0].click();", element)
    time.sleep(1)  # wait for the extra pages to load
    htmlplus2 = driver.page_source  # grab the updated page source
    return htmlplus2
# Parse the final html document
def analysis(html):
    # encoding="utf-8" prevents a write error when \xa0 is encountered
    f = open('baidu.txt', 'w', encoding="utf-8")
    bf1 = BeautifulSoup(html, 'lxml')
    result = bf1.find_all(class_='rtcspage')  # one element per document page
    for each_result in result:
        bf2 = BeautifulSoup(str(each_result), 'lxml')
        texts = bf2.find_all('p')
        for each_text in texts:
            main_body = BeautifulSoup(str(each_text), 'lxml')
            for each in main_body.find_all(True):
                if each.name == 'br' or each.string is None:
                    print("")
                else:
                    f.write(each.string)
                    print(each.string)
        print('\n')  # blank line between pages
    f.close()
if __name__ == '__main__':
    driver = connect()
    html = locate(driver)      # click "continue reading"
    htmlplus = juge(driver)    # click "load more" and get the full source
    analysis(htmlplus)
The code still has room for improvement, such as exception handling and cleaning up the format of the downloaded document; I will keep working on it. Comments and corrections are welcome!
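As one possible direction for the exception handling mentioned above, here is a minimal, untested sketch that keeps clicking "load more" until the button disappears, rather than calling juge() just once (loop_load_more and its max_pages limit are my own additions, not part of the script above):

from selenium.common.exceptions import NoSuchElementException

def loop_load_more(driver, max_pages=50):
    # Click "load more" until the button is gone or the page limit is reached
    for _ in range(max_pages):
        try:
            element = driver.find_element_by_xpath("//div[@class='pagerwg-button']")
        except NoSuchElementException:
            break  # button gone: every page has been loaded
        driver.execute_script("arguments[0].click();", element)
        time.sleep(1)  # give the new pages time to render
    return driver.page_source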