欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

selenium爬取知乎文章内容和图片

程序员文章站 2022-05-02 21:38:40
...

代码主要实现的功能:使用 selenium + chrome 自动登录知乎,查找问题,并把所有的文章(作者、赞同数、内容、图片)都保存在数据库(MongoDB)中。

from selenium import webdriver
import time,re
import requests
from bs4 import BeautifulSoup
import pymongo
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from gridfs import *

#Log in to Zhihu (via QQ OAuth) and search for `contents`.
def selenium_test(contents):
    """Drive Chrome through Zhihu's QQ login, then search for `contents`.

    contents: the question/topic text to type into Zhihu's search box.

    Flow: open Zhihu's sign-in page, choose QQ login (opens a popup
    window), enter the QQ login iframe, submit account/password, switch
    back to the Zhihu window, type `contents` into the search box, click
    search, and hand the driver off to get_contens().

    Stores the webdriver in the module-level global `b`.
    NOTE(review): uses Selenium 3's find_element_by_* API (removed in
    Selenium 4), hard-coded sleeps and brittle absolute XPaths — confirm
    the installed selenium/chromedriver versions before relying on this.
    """
    global b
    print("dssdsa")  # debug marker
    try:
        chrome_crawler="C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        b=webdriver.Chrome(executable_path=chrome_crawler)
        b.get("https://www.zhihu.com/signin?next=%2F")
        time.sleep(1)
        # Click the "log in with QQ" button on the sign-in page.
        b.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]').click()
        time.sleep(3)
        # QQ login opens a second window; switch into it.
        windows=b.window_handles
        b.switch_to.window(windows[1])
        time.sleep(4)
        b.find_element_by_id('ptlogin_iframe').click()
        time.sleep(2)
        # The QQ login form lives inside this iframe.
        b.switch_to.frame('ptlogin_iframe')
        id0='###'  # QQ account placeholder — fill in before use
        password="###"  # QQ password placeholder — fill in before use
        time.sleep(3)
        # Switch from QR-code login to account/password login.
        b.find_element_by_id("switcher_plogin").click()
        time.sleep(3)
        b.find_element_by_xpath('//*[@id="u"]').send_keys(id0)
        b.find_element_by_xpath('//*[@id="p"]').send_keys(password)
        time.sleep(1)
        b.find_element_by_xpath('//*[@id="login_button"]').click()
        time.sleep(5)
        # Back to the original Zhihu window after logging in.
        b.switch_to.window(b.window_handles[0])
        time.sleep(6)
       # wo=b.current_window_handle

        b.switch_to.window(b.window_handles[0])
        time.sleep(3)
        # Type the query into the search box.
        b.find_element_by_id("Popover1-toggle").send_keys(contents)#Popover1-toggle#Popover2-toggle
        time.sleep(1)
        # Click the search button.
        b.find_element_by_xpath('//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button').click()
      #//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button
        time.sleep(6)
        #b.switch_to.window(b.window_handles[1])  # no window switch is needed here
        #print(b.page_source)
        time.sleep(5)
        get_contens(b)
    except NoSuchElementException as e:
        print(e)



def consulent():
    """Prompt the user for a topic and start the crawl for it."""
    query = input("查询的内容")  # e.g. 如何系统地自学 Python?
    selenium_test(query)

def get_contens(bowser):
    """Scrape every answer of the first search result and store it in MongoDB.

    bowser: a logged-in selenium webdriver currently showing Zhihu search
        results.  (Misspelled parameter name kept for backward
        compatibility — it is the browser.)

    Opens the first question in the results, clicks "view all answers",
    scrolls until the page stops growing so every answer lazy-loads, then
    for each answer saves author, upvote count, text and any .jpg images
    into the local MongoDB database `zhihu` (images via GridFS).  Quits
    the browser when done.
    """
    # BUG FIX: the original body referred to an undefined name `browser`
    # on most lines while the parameter was spelled `bowser`.
    browser = bowser
    print("sss")  # debug marker
    # Open the first question returned by the search.
    next_page = browser.find_element_by_xpath('//*[@id="SearchMain"]/div/div/div/div/div[1]/div/div/h2/div/a').get_attribute('href')
    browser.get(next_page)
    time.sleep(10)
    # Click "view all answers".
    browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div[2]/div[1]/div/div[4]/a').click()
    try:
        # Scroll until document height stops growing (all answers loaded).
        while True:
            height_before = browser.execute_script("return document.body.scrollHeight;")
            time.sleep(2)
            browser.execute_script("window.scrollBy(0,document.body.scrollHeight)")
            height_after = browser.execute_script("return document.body.scrollHeight;")
            if height_before == height_after:
                break
    except Exception as e:
        print(e)
    time.sleep(3)
    html = browser.page_source
    bs = BeautifulSoup(html, "html.parser")
    body = bs.body
    title = get_title(body)
    # One List-item per answer card.
    contents = body.find_all('div', {'class': 'List-item'})
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client.zhihu
    # BUG FIX: loop variable used to shadow the outer `body`.
    for item in contents:
        try:
            author = get_author(item)
            agree = get_answer_agree(item)
            answer = item.find('span', {'class': 'RichText ztext CopyrightRichText-richText'})
            if answer is None:
                # Not an answer card (e.g. an ad block) — skip it.
                continue
            print(answer)
            # Save each .jpg image of the answer into GridFS.
            links = item.find_all('img', src=re.compile(r'.jpg$'))
            for link in links:
                href = link.attrs['src']
                img_bytes = requests.get(href).content
                imgput = GridFS(db)
                imgput.put(img_bytes, content_type="jpg", filename="%s" % author)
            # Collect the answer text.  BUG FIX: `out` used to be unbound
            # (NameError) whenever answer.string was not None.
            out = ''
            if answer.string is None:
                for datastring in answer.strings:
                    out = out + '\n' + str(datastring)
            else:
                out = str(answer.string)
                print(answer.string.encode('utf-8'))
            # "conntens" key spelling kept so existing documents match.
            record = {"title": title,
                      "author": author,
                      "agree": agree,
                      "conntens": '\n' + out}
            save_contents(db, record, title)
        except Exception as e:
            print(e)
    browser.quit()

#Extract the question title from the parsed page body.
def get_title(html_text):
    """Return the question title: text of the QuestionHeader-title <h1>."""
    header = html_text.find('h1', {'class': 'QuestionHeader-title'})
    raw = header.string.encode('utf-8')
    return str(raw, encoding='utf-8')
#Extract the upvote count from one answer card.
def get_answer_agree(body):
    """Return the upvote-count text scraped from the answer's VoteButton."""
    agree = body.find('button', {'class': 'Button VoteButton VoteButton--up'})
    print(agree)
    # Pull the text between the icon </span> and the closing </button>.
    matches = re.findall(r'</span>(.*?)</button>', str(agree))
    return matches[0]
#Extract the author name from one answer card.
def get_author(body):
    """Return the answer author's display name (second UserLink-link anchor)."""
    user_links = body.find_all('a', {'class': 'UserLink-link'})
    print(body)
    print(user_links)
    # The second UserLink-link holds the author's visible name.
    second_link = user_links[1]
    names = re.findall(r'target="_blank">(.*?)</a>', str(second_link))
    return names[0]

#MongoDB persistence helper.
def save_contents(db, result, title):
    """Insert one answer document into MongoDB.

    db:     an open pymongo Database (the `zhihu` database).
    result: the document dict (title/author/agree/conntens).
    title:  question title, used as the collection name; if insertion
            fails (e.g. the title is not a valid collection name), fall
            back to the "pythonlearning" collection.

    FIX: Collection.insert() has been deprecated since PyMongo 3.0 and
    was removed in PyMongo 4; insert_one() is the supported equivalent.
    """
    try:
        db[title].insert_one(result)
        print('存储到MongoDB成功')
    except Exception:
        db["pythonlearning"].insert_one(result)
        print('存储到MongoDB成功')


if __name__ == "__main__":
    # Entry point: ask the user for a topic and crawl it.
    consulent()

欢迎各位指出不足,程序会慢慢完善