欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

微博评论爬取优化

程序员文章站 2022-05-02 20:48:45
...
#抓取MiuMiu微博所有评论
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd



def switch_to_window():
    nowhandle=driver.current_window_handle

    allhandles=driver.window_handles

    for handle in allhandles:
        if handle != nowhandle:
            driver.switch_to_window(handle)

def use_mouse_add_hide_content():
    for i in range(5):
        driver.execute_script('window.scrollTo(0,1000000)')
        time.sleep(5)

def get_comment_time():
    for i in range(50):
        try:
            element = driver.find_element_by_class_name("more_txt")
            driver.find_element_by_class_name("more_txt").click()
            time.sleep(8)
        except NoSuchElementException as e:
            print(e)
            break

def write_to_txt(commentdata_1,timedata):
    for i in range(len(commentdata_1)):
        with open('D:\\comment_time.txt',mode='a+',encoding='utf-8') as f:#数据保存地址
            f.write(commentdata_1.iloc[i,0])
            f.write('\t')
            f.write(timedata[i+1].text)
            f.write('\n') 

def get_one_page()
    for num in range(2,47):
        print("The {} comment is start!".format(num))
        #use_mouse_add_hide_content()
        if driver.find_element_by_xpath('//*[@id="Pl_Official_MyProfileFeed__23"]/div/div['+str(num)+']/div[2]/div/ul/li[3]/a/span/span/span/em[2]').text == '评论':
            continue
        driver.find_element_by_xpath('//*[@id="Pl_Official_MyProfileFeed__23"]/div/div['+str(num)+']/div[1]/div[3]/div[2]/a[1]').click()
        time.sleep(5)
        switch_to_window()

        use_mouse_add_hide_content()

        get_comment_time()

        commentdata = driver.find_elements_by_class_name("WB_text")
        timedata = driver.find_elements_by_css_selector("[class='WB_from S_txt2']")
        commentdata_1 = pd.DataFrame([commentdata[1].text],columns=["text"])
        for i in range(2,len(commentdata)):
            commentdata_2 = pd.DataFrame([commentdata[i].text],columns=["text"])
            if ':' in commentdata_2["text"][0]:
                commentdata_1 = pd.concat((commentdata_1,commentdata_2))

        write_to_txt(commentdata_1,timedata)

        driver.close()
        allhandles=driver.window_handles
        for handle in allhandles:
            driver.switch_to_window(handle)
        time.sleep(5)
    

if __name__ == '__main__':

    driver = webdriver.Chrome()

    driver.get('https://weibo.com')

    driver.find_element_by_id('loginname').send_keys('******')

    driver.find_element_by_name('password').send_keys('******')

    driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()
    time.sleep(10)

    driver.get('https://weibo.com/miumiuofficial?profile_ftype=1&is_all=1#_0')
	
    for j in range(29):
        get_one_page()
        driver.find_element_by_link_text('下一页').click()
        time.sleep(10)