Crawling Zhihu article content and images with Selenium
The code uses Selenium + Chrome to log in to Zhihu automatically, search for a question, and save every answer (author, upvote count, text, and images) to a MongoDB database.
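A note before the script: it relies on fixed time.sleep() delays and hard-coded XPaths, and the WebDriverWait / expected_conditions imports it pulls in are never actually used. As a hedged illustration only (not part of the original program, and assuming chromedriver is on PATH), an explicit wait on the search box could replace a blind sleep like this:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.zhihu.com/")
# Wait up to 10 seconds for the search box to appear instead of sleeping blindly;
# "Popover1-toggle" is the search-box id the script below also uses.
search_box = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "Popover1-toggle"))
)
search_box.send_keys("如何系统地自学 Python?")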
from selenium import webdriver
import time,re
import requests
from bs4 import BeautifulSoup
import pymongo
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from gridfs import GridFS
# Log in to Zhihu through the QQ OAuth pop-up, then search for the question
def selenium_test(contents):
    global b
    try:
        chrome_crawler = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        b = webdriver.Chrome(executable_path=chrome_crawler)
        b.get("https://www.zhihu.com/signin?next=%2F")
        time.sleep(1)
        # Choose third-party (QQ) login, which opens a new window
        b.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]').click()
        time.sleep(3)
        windows = b.window_handles
        b.switch_to.window(windows[1])
        time.sleep(4)
        b.find_element_by_id('ptlogin_iframe').click()
        time.sleep(2)
        b.switch_to.frame('ptlogin_iframe')
        id0 = '###'        # QQ account
        password = "###"   # QQ password
        time.sleep(3)
        # Switch to account/password login and submit the credentials
        b.find_element_by_id("switcher_plogin").click()
        time.sleep(3)
        b.find_element_by_xpath('//*[@id="u"]').send_keys(id0)
        b.find_element_by_xpath('//*[@id="p"]').send_keys(password)
        time.sleep(1)
        b.find_element_by_xpath('//*[@id="login_button"]').click()
        time.sleep(5)
        # Switch back to the main Zhihu window
        b.switch_to.window(b.window_handles[0])
        time.sleep(6)
        b.switch_to.window(b.window_handles[0])
        time.sleep(3)
        # Type the question into the search box and submit
        b.find_element_by_id("Popover1-toggle").send_keys(contents)  # id may be Popover1-toggle or Popover2-toggle
        time.sleep(1)
        b.find_element_by_xpath('//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button').click()
        time.sleep(6)
        # b.switch_to.window(b.window_handles[1])  # no need to switch windows here
        time.sleep(5)
        get_contens(b)
    except NoSuchElementException as e:
        print(e)
def consulent():
    b = input("Topic to search for: ")  # e.g. 如何系统地自学 Python?
    selenium_test(b)
    # b.close()
# Open the first search result, load every answer, and save them
def get_contens(browser):
    # Follow the link of the first question in the search results
    next_page = browser.find_element_by_xpath('//*[@id="SearchMain"]/div/div/div/div/div[1]/div/div/h2/div/a').get_attribute('href')
    browser.get(next_page)
    time.sleep(10)
    # Click "View all answers"
    browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div[2]/div[1]/div/div[4]/a').click()
    try:
        t = True
        while t:  # Scroll until the page height stops growing, so every answer is loaded
            check_height = browser.execute_script("return document.body.scrollHeight;")
            time.sleep(2)
            browser.execute_script("window.scrollBy(0,document.body.scrollHeight)")
            check_height1 = browser.execute_script("return document.body.scrollHeight;")
            if check_height == check_height1:
                t = False
    except Exception as e:
        print(e)
    time.sleep(3)
    html = browser.page_source
    bs = BeautifulSoup(html, "html.parser")  # Parse the fully loaded page
    body = bs.body
    title = get_title(body)
    # Each answer lives in a div with class "List-item"
    contents = body.find_all('div', {'class': 'List-item'})
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client.zhihu
    for body in contents:
        try:
            author = get_author(body)
            agree = get_answer_agree(body)
            answer = body.find('span', {'class': 'RichText ztext CopyrightRichText-richText'})
            # Download every .jpg image in the answer and store it in GridFS
            links = body.find_all('img', src=re.compile(r'.jpg$'))
            for link in links:
                href = link.attrs['src']
                a = requests.get(href).content
                imgput = GridFS(db)
                imgput.put(a, content_type="jpg", filename="%s" % author)
            # Collect the answer text
            out = ''
            if answer.string is None:
                for datastring in answer.strings:
                    out = out + '\n' + datastring
            else:
                out = answer.string
            result = {"title": title,
                      "author": author,
                      "agree": agree,
                      "contents": '\n' + out}
            # Save the answer document
            save_contents(db, result, title)
        except Exception as e:
            print(e)
    browser.quit()
# Extract the question title
def get_title(html_text):
    data1 = html_text.find('h1', {'class': 'QuestionHeader-title'})
    return str(data1.string)
# Extract the upvote count from the vote button markup
def get_answer_agree(body):
    agree = body.find('button', {'class': 'Button VoteButton VoteButton--up'})
    pattern = re.compile(r'</span>(.*?)</button>', flags=0)
    a = pattern.findall(str(agree))
    return a[0]
# Extract the author name from the second UserLink-link anchor
# (the first one wraps the avatar)
def get_author(body):
    agree = body.find_all('a', {'class': 'UserLink-link'})
    pattern = re.compile(r'target="_blank">(.*?)</a>', flags=0)
    a = agree[1]
    a = pattern.findall(str(a))
    return a[0]
# Save one answer document to MongoDB, one collection per question title
def save_contents(db, result, title):
    try:
        p = db[title]
        p.insert_one(result)
        print('Saved to MongoDB')
    except Exception:
        # Fall back to a default collection if the title is not a valid collection name
        p = db["pythonlearning"]
        p.insert_one(result)
        print('Saved to MongoDB')
if __name__ == "__main__":
    # test()
    consulent()  # Prompt for the topic and start crawling
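For completeness, a minimal sketch (not part of the original script; names follow the conventions used above, and "some_author" is a placeholder) of reading the saved answers and images back out of MongoDB and GridFS:

import pymongo
from gridfs import GridFS

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client.zhihu

# Answers are stored one collection per question title
# (or in "pythonlearning" when the title is not a usable collection name).
for doc in db["pythonlearning"].find():
    print(doc["author"], doc["agree"])

# Images are stored in GridFS with the answer author as the filename.
fs = GridFS(db)
grid_out = fs.find_one({"filename": "some_author"})
if grid_out is not None:
    with open("some_author.jpg", "wb") as f:
        f.write(grid_out.read())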
Suggestions and corrections are welcome; the program will be improved gradually.