欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

selenium爬取知乎文章内容和图片

程序员文章站 2022-05-02 21:38:40
...

代码主要实现的功能:使用 selenium + chrome 自动登录知乎,查找问题,并把所有的文章(作者、赞同数、内容、图片)都保存在数据库(MongoDB)中。

from selenium import webdriver
import time,re
import requests
from bs4 import BeautifulSoup
import pymongo
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from gridfs import *

#Log in to Zhihu (via QQ OAuth) and search for `contents`.
def selenium_test(contents):
    """Drive Chrome through Zhihu's QQ login, then search for `contents`.

    contents: the question/topic text to type into Zhihu's search box.

    Flow: open Zhihu's sign-in page, choose QQ login (opens a popup
    window), enter the QQ login iframe, submit account/password, switch
    back to the Zhihu window, type `contents` into the search box, click
    search, and hand the driver off to get_contens().

    Stores the webdriver in the module-level global `b`.
    NOTE(review): uses Selenium 3's find_element_by_* API (removed in
    Selenium 4), hard-coded sleeps and brittle absolute XPaths — confirm
    the installed selenium/chromedriver versions before relying on this.
    """
    global b
    print("dssdsa")  # debug marker
    try:
        chrome_crawler="C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        b=webdriver.Chrome(executable_path=chrome_crawler)
        b.get("https://www.zhihu.com/signin?next=%2F")
        time.sleep(1)
        # Click the "log in with QQ" button on the sign-in page.
        b.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]').click()
        time.sleep(3)
        # QQ login opens a second window; switch into it.
        windows=b.window_handles
        b.switch_to.window(windows[1])
        time.sleep(4)
        b.find_element_by_id('ptlogin_iframe').click()
        time.sleep(2)
        # The QQ login form lives inside this iframe.
        b.switch_to.frame('ptlogin_iframe')
        id0='###'  # QQ account placeholder — fill in before use
        password="###"  # QQ password placeholder — fill in before use
        time.sleep(3)
        # Switch from QR-code login to account/password login.
        b.find_element_by_id("switcher_plogin").click()
        time.sleep(3)
        b.find_element_by_xpath('//*[@id="u"]').send_keys(id0)
        b.find_element_by_xpath('//*[@id="p"]').send_keys(password)
        time.sleep(1)
        b.find_element_by_xpath('//*[@id="login_button"]').click()
        time.sleep(5)
        # Back to the original Zhihu window after logging in.
        b.switch_to.window(b.window_handles[0])
        time.sleep(6)
       # wo=b.current_window_handle

        b.switch_to.window(b.window_handles[0])
        time.sleep(3)
        # Type the query into the search box.
        b.find_element_by_id("Popover1-toggle").send_keys(contents)#Popover1-toggle#Popover2-toggle
        time.sleep(1)
        # Click the search button.
        b.find_element_by_xpath('//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button').click()
      #//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button
        time.sleep(6)
        #b.switch_to.window(b.window_handles[1])  # no window switch is needed here
        #print(b.page_source)
        time.sleep(5)
        get_contens(b)
    except NoSuchElementException as e:
        print(e)



def consulent():
    """Prompt the user for a topic and start the crawl for it."""
    query = input("查询的内容")  # e.g. 如何系统地自学 Python?
    selenium_test(query)

def get_contens(bowser):
    """Scrape every answer of the first search result and store it in MongoDB.

    bowser: a logged-in selenium webdriver currently showing Zhihu search
        results.  (Misspelled parameter name kept for backward
        compatibility — it is the browser.)

    Opens the first question in the results, clicks "view all answers",
    scrolls until the page stops growing so every answer lazy-loads, then
    for each answer saves author, upvote count, text and any .jpg images
    into the local MongoDB database `zhihu` (images via GridFS).  Quits
    the browser when done.
    """
    # BUG FIX: the original body referred to an undefined name `browser`
    # on most lines while the parameter was spelled `bowser`.
    browser = bowser
    print("sss")  # debug marker
    # Open the first question returned by the search.
    next_page = browser.find_element_by_xpath('//*[@id="SearchMain"]/div/div/div/div/div[1]/div/div/h2/div/a').get_attribute('href')
    browser.get(next_page)
    time.sleep(10)
    # Click "view all answers".
    browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div[2]/div[1]/div/div[4]/a').click()
    try:
        # Scroll until document height stops growing (all answers loaded).
        while True:
            height_before = browser.execute_script("return document.body.scrollHeight;")
            time.sleep(2)
            browser.execute_script("window.scrollBy(0,document.body.scrollHeight)")
            height_after = browser.execute_script("return document.body.scrollHeight;")
            if height_before == height_after:
                break
    except Exception as e:
        print(e)
    time.sleep(3)
    html = browser.page_source
    bs = BeautifulSoup(html, "html.parser")
    body = bs.body
    title = get_title(body)
    # One List-item per answer card.
    contents = body.find_all('div', {'class': 'List-item'})
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client.zhihu
    # BUG FIX: loop variable used to shadow the outer `body`.
    for item in contents:
        try:
            author = get_author(item)
            agree = get_answer_agree(item)
            answer = item.find('span', {'class': 'RichText ztext CopyrightRichText-richText'})
            if answer is None:
                # Not an answer card (e.g. an ad block) — skip it.
                continue
            print(answer)
            # Save each .jpg image of the answer into GridFS.
            links = item.find_all('img', src=re.compile(r'.jpg$'))
            for link in links:
                href = link.attrs['src']
                img_bytes = requests.get(href).content
                imgput = GridFS(db)
                imgput.put(img_bytes, content_type="jpg", filename="%s" % author)
            # Collect the answer text.  BUG FIX: `out` used to be unbound
            # (NameError) whenever answer.string was not None.
            out = ''
            if answer.string is None:
                for datastring in answer.strings:
                    out = out + '\n' + str(datastring)
            else:
                out = str(answer.string)
                print(answer.string.encode('utf-8'))
            # "conntens" key spelling kept so existing documents match.
            record = {"title": title,
                      "author": author,
                      "agree": agree,
                      "conntens": '\n' + out}
            save_contents(db, record, title)
        except Exception as e:
            print(e)
    browser.quit()

#Extract the question title from the parsed page body.
def get_title(html_text):
    """Return the question title: text of the QuestionHeader-title <h1>."""
    header = html_text.find('h1', {'class': 'QuestionHeader-title'})
    raw = header.string.encode('utf-8')
    return str(raw, encoding='utf-8')
#Extract the upvote count from one answer card.
def get_answer_agree(body):
    """Return the upvote-count text scraped from the answer's VoteButton."""
    agree = body.find('button', {'class': 'Button VoteButton VoteButton--up'})
    print(agree)
    # Pull the text between the icon </span> and the closing </button>.
    matches = re.findall(r'</span>(.*?)</button>', str(agree))
    return matches[0]
#Extract the author name from one answer card.
def get_author(body):
    """Return the answer author's display name (second UserLink-link anchor)."""
    user_links = body.find_all('a', {'class': 'UserLink-link'})
    print(body)
    print(user_links)
    # The second UserLink-link holds the author's visible name.
    second_link = user_links[1]
    names = re.findall(r'target="_blank">(.*?)</a>', str(second_link))
    return names[0]

#MongoDB persistence helper.
def save_contents(db, result, title):
    """Insert one answer document into MongoDB.

    db:     an open pymongo Database (the `zhihu` database).
    result: the document dict (title/author/agree/conntens).
    title:  question title, used as the collection name; if insertion
            fails (e.g. the title is not a valid collection name), fall
            back to the "pythonlearning" collection.

    FIX: Collection.insert() has been deprecated since PyMongo 3.0 and
    was removed in PyMongo 4; insert_one() is the supported equivalent.
    """
    try:
        db[title].insert_one(result)
        print('存储到MongoDB成功')
    except Exception:
        db["pythonlearning"].insert_one(result)
        print('存储到MongoDB成功')


if __name__ == "__main__":
    # Entry point: ask the user for a topic and crawl it.
    consulent()

欢迎各位指出不足,程序会慢慢完善