微博抓取尝试

程序员文章站 2022-07-02 13:45:38

微信是比较封闭的，微博的好友信息比较开放，都可以抓到； 1）找人，通过关注列表；2）提取出微博的数据，放到数据库；微博昵称，头像；关注，粉丝及微博数量；根据一些基本的原则来决定是否将该用户的微博入待爬的队列；指标：关注人数；粉丝人数；但是有可能会很多人，而且有很多僵尸粉；（不好:第一，低效；第二 ......

微信是比较封闭的，微博的好友信息比较开放，都可以抓到；

1）找人，通过关注列表；
2）提取出微博的数据，放到数据库；

微博昵称，头像；
关注，粉丝及微博数量；
根据一些基本的原则来决定是否将该用户的微博入待爬的队列；

指标：关注人数；
粉丝人数；但粉丝可能非常多，而且其中有很多僵尸粉；（逐页去翻粉丝列表并不好：第一，低效；
第二，平台也不会让你无限制地往下翻页，肯定会有限制）
微博数和粉丝数是重要的参考指标；

怎么判断抓取的人不值得关注？可以先做一个定向的分析，分析你所抓的领域的人的微博大致情况；
    1）如果发布的微博数量特别少，可以认为是僵尸用户，不用爬；微博数小于某个下限；
    2）如果发布的微博数量特别多，比如每天发100多条，可能是小广告商或者机器人；
    3）如果发布的微博大多是转发，其实和僵尸用户的微博差不多：你可能爬了大量的微博，却发现都是重复的信息；

# 部分代码如下：

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 1 10:18:42 2018

@author: Joe3223
"""
# -*- coding:utf-8 -*-
#!/usr/bin/env python3
import time
import os
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#import pymongo
#from pymongo import MongoClient
import hashlib
from collections import deque
from lxml import etree
import threading

# Database setup (MongoDB). Currently disabled, together with the pymongo
# imports at the top of the file.
#client = MongoClient('localhost',27017)
#db = client.test
#followers = db.followers

# NOTE: without a user-agent the site may not perform its redirect.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
##dcap = dict(DesiredCapabilities.PHANTOMJS)
##dcap["phantomjs.page.settings.userAgent"] = user_agent
# NOTE(review): dcap is built here but is not passed to webdriver.Firefox()
# below, so in the visible code this user-agent setting is never applied.
dcap = dict(DesiredCapabilities.FIREFOX)
dcap["firefox.page.settings.userAgent"] = user_agent

#browserPath = '/opt/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'
#browserPath = '/usr/bin/phantomjs'
# Basic crawl parameters.
parser = 'html5lib'
domain = "weibo.com"
url_home = "http://" + domain
# MD5 digests of every URL queued so far; enqueueUrl() uses it for
# de-duplication. The original author chose a deque with multi-threaded
# crawling in mind.
download_bf = deque()
# URLs waiting to be crawled: appended on the right, popped from the left.
cur_queue = deque()
# Crawl thresholds. NOTE(review): neither value is read anywhere in the
# visible code.
min_mblogs_allowed = 10
max_follow_fans_ratio_allowed = 3

# The original design mentions two crawlers (one for posts, one for users);
# only the post crawler's browser is created here. Importing this module
# therefore launches a Firefox window.
# NOTE(review): the functions in this file call Selenium's find_element_by_*
# helpers, which were removed in newer Selenium 4 releases - confirm the
# installed Selenium version.
weibo_driver = webdriver.Firefox() # browser used to crawl posts
weibo_driver.set_window_size(1920, 1200) # optional

def enqueueUrl(url):
    """Add *url* to the crawl queue unless it has been queued before.

    URLs are de-duplicated through their MD5 hex digest: digests already
    seen are kept in the module-level ``download_bf`` and new URLs are
    appended to the right end of the module-level ``cur_queue``.

    Args:
        url: The URL to enqueue, as ``str``.
    """
    try:
        # hashlib only accepts bytes on Python 3; passing the str itself
        # raises TypeError, so encode it first.
        md5v = hashlib.md5(url.encode('utf-8')).hexdigest()
    except ValueError:
        # UnicodeEncodeError (a ValueError subclass) is raised for text that
        # cannot be encoded, e.g. lone surrogates. Skip such URLs.
        print('skip url that cannot be hashed: %r' % (url,))
        return
    if md5v not in download_bf:  # de-duplicate
        print(url + ' is added to queue')
        cur_queue.append(url)
        download_bf.append(md5v)

def dequeuUrl():
    """Remove and return the URL at the left end of ``cur_queue``.

    Because ``enqueueUrl`` appends on the right, this yields URLs in the
    order they were queued.

    Raises:
        IndexError: If the queue is empty (raised by ``deque.popleft``).
    """
    next_url = cur_queue.popleft()
    return next_url

def go_next_page(cur_driver):
    """Navigate *cur_driver* to the next page of results, if there is one.

    Looks up the pager link whose class contains "page next", loads its
    ``href`` and then waits 3 seconds.

    Args:
        cur_driver: The Selenium driver showing the current page.

    Returns:
        True when the next page was requested; False when any step raised
        (typically because the pager link is not present).
    """
    next_link_xpath = '//a[contains(@class, "page next")]'
    try:
        link = cur_driver.find_element_by_xpath(next_link_xpath)
        target = link.get_attribute('href')
        print('next page is ' + target)
        cur_driver.get(target)
        time.sleep(3)
    except Exception:
        print('next page is not found')
        return False
    return True

def get_element_by_xpath(cur_driver, path, max_tries=6, interval=1):
    """Return the elements matching *path* in the driver's current page.

    The page source is re-parsed with lxml on every attempt, so elements
    that appear shortly after the page loads are still found.

    Args:
        cur_driver: Selenium driver whose ``page_source`` is parsed.
        path: XPath expression selecting elements.
        max_tries: Maximum number of attempts (default 6).
        interval: Seconds to wait between attempts (default 1).

    Returns:
        The non-empty list of matching lxml elements, or an empty list when
        nothing matched within ``max_tries`` attempts.
    """
    # The original loop never advanced its counter and would spin forever
    # when the element was absent; a bounded for-loop cannot.
    for attempt in range(max_tries):
        tree = etree.HTML(cur_driver.page_source)
        elements = tree.xpath(path)
        if elements:
            return elements
        if attempt < max_tries - 1:  # no point in waiting after the last try
            time.sleep(interval)
    return []

def scroll_to_bottom():
    """Scroll the Weibo page down until the "next page" link shows up.

    Scrolls ``weibo_driver`` to the bottom of the document up to 50 times,
    one second apart. When the page shows the reload prompt, the prompt is
    clicked before the next attempt.

    Returns:
        The ``href`` of the "next page" link once it is present, or None
        when it never appeared within 50 attempts.
    """
    reload_label = '点击重新载入'
    print('scroll down')
    for _ in range(50):
        weibo_driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        page_html = weibo_driver.page_source
        next_links = etree.HTML(page_html).xpath('//a[contains(@class,"page next")]')
        if next_links:
            return next_links[0].get('href')
        if reload_label in page_html:
            # Lazy loading failed; ask the page to load the rest again.
            print('scrolling failed, reload it')
            weibo_driver.find_element_by_link_text(reload_label).click()
        time.sleep(1)
    return None

def _parse_feed(element):
    """Build one feed dict (time, content, image_names) from a WB_detail element."""
    feed = {}
    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
    feed['content'] = element.find_element_by_class_name('WB_text').text
    feed['image_names'] = []
    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
        # An <img> without src would make re.findall raise and cost the
        # whole feed; treat it as "no file name" instead.
        src = image.get_attribute('src') or ''
        # findall returns a list; extend keeps image_names a flat list of
        # file names instead of a list of one-element lists.
        feed['image_names'].extend(re.findall('/([^/]+)$', src))
    return feed


def extract_feed(feeds):
    """Collect the posts of the profile currently open in ``weibo_driver``.

    Walks at most 20 result pages. Each page is scrolled to the bottom
    first (posts are loaded lazily), then every ``WB_detail`` element is
    parsed into a dict with the keys ``time``, ``content`` and
    ``image_names`` and appended to *feeds*. Parsing an element is
    attempted up to 3 times, one second apart; an element that still fails
    is skipped.

    Args:
        feeds: List that receives the parsed feed dicts (mutated in place).

    Returns:
        The same *feeds* list, both when the last page was reached and when
        the 20-page limit was hit.
    """
    for _ in range(20):
        # Scrolling is only needed when crawling posts.
        scroll_to_bottom()
        for element in weibo_driver.find_elements_by_class_name('WB_detail'):
            for _attempt in range(3):
                try:
                    feed = _parse_feed(element)
                except Exception:
                    # Best effort: the element may not be fully rendered yet
                    # or may have gone stale; wait and retry.
                    time.sleep(1)
                    continue
                feeds.append(feed)
                print('--------------------')
                print(feed['time'])
                print(feed['content'])
                break
        # Move on to the next page of posts, if any.
        if go_next_page(weibo_driver) is False:
            break
    return feeds

def getFollows(pageInfo):
    """Print the followed accounts and pager links found in a follow-list page.

    Args:
        pageInfo: HTML source of a Weibo "follows" page, as ``str``.

    The function first prints the list of account names (the ``title`` of
    links with class ``S_txt1`` that carry a ``usercard`` attribute) and
    each name on its own line, then prints every pager URL prefixed with
    ``http://weibo.com/``. It returns nothing.
    """
    # Raw strings keep regex escapes such as \s intact without relying on
    # Python passing unknown string escapes through.
    name_pattern = re.compile(r'class="S_txt1" title="(.*?)".*?usercard')
    names = name_pattern.findall(pageInfo)
    print(names)
    for name in names:
        print(name)
        # Persisting each name to MongoDB (followers.insert_one) is
        # currently disabled.

    page_url_pattern = re.compile(
        r'<a bpfilter="page" class="page S_txt1"[\s\S]*?href="'
        r'([\s\S]*?pids=Pl_Official_RelationMyfollow__92&cfs='
        r'&Pl_Official_RelationMyfollow__92_page=[\s\S]*?)"'
    )
    for page_url in page_url_pattern.findall(pageInfo):
        # Queueing these pager URLs for crawling is not implemented yet.
        print("http://weibo.com/" + page_url)

def login(current_driver,username, password):
    """Log in to Weibo through the home-page login form.

    Opens ``url_home``, waits 10 seconds for the page to render, fills in
    the account name and password, clicks the login button, waits another
    8 seconds and saves a screenshot to ``weiboLogin.png``.

    Args:
        current_driver: Selenium driver to log in with.
        username: Weibo account name.
        password: Password of that account.
    """
    # The password box is addressed by an absolute path into the page layout.
    password_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div/div/div/div[3]/div[2]/div/input'
    submit_xpath = '//div[contains(@class,"login_btn")][1]/a'

    current_driver.get(url_home)
    time.sleep(10)

    # Fill in the credentials.
    name_box = current_driver.find_element_by_id('loginname')
    name_box.send_keys(username)
    password_box = current_driver.find_element_by_xpath(password_xpath)
    password_box.send_keys(password)

    # Submit the form and keep a screenshot of the result.
    submit_link = current_driver.find_element_by_xpath(submit_xpath)
    submit_link.click()
    time.sleep(8)
    current_driver.save_screenshot("weiboLogin.png")
    # NOTE: no verification-code step is performed here; an earlier draft
    # prompted for the code and submitted the form a second time.


def main(username, password):
    """Log in, open the seed profile, print its basics and crawl its posts.

    Args:
        username: Weibo account name used to log in.
        password: Password of that account.
    """
    # Log in with the post-crawling browser, then leave time for manual steps.
    login(weibo_driver,username, password)
    time.sleep(30)

    # Entry point of the crawl: the profile page of a well-known account.
    user_link = "https://weibo.com/u/3738542230?topnav=1&wvr=6&topsug=1&is_hot=1"
    print('downloading ' + user_link)
    weibo_driver.get(user_link)
    time.sleep(5)

    # Account name and avatar file name.
    name_nodes = get_element_by_xpath(weibo_driver, '//h1')
    account_name = name_nodes[0].text
    photo_nodes = get_element_by_xpath(weibo_driver, '//p[@class="photo_wrap"]/img')
    photo_src = photo_nodes[0].get('src')
    account_photo = re.findall('/([^/]+)$', photo_src)
    # Link to the page listing the accounts this user follows.
    follow_nodes = get_element_by_xpath(weibo_driver, '//a[@class="t_link S_txt1"]')
    follows_link = follow_nodes[0].get('href')
    print('account: ' + account_name)
    print('account_photo: '+account_photo[0])
    print('follows link is ' + follows_link)

    # Crawl the posts in a worker thread and wait for it to finish.
    feeds = []
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    t_feeds.start()
    t_feeds.join()


if __name__ == '__main__':
    # Replace the two placeholders with a real Weibo account before running.
    main("你的用户","你的密码")
    # NOTE(review): a commented-out call that embedded what looked like real
    # account credentials was removed here; never keep passwords in source.

上一篇： linux下重启oracle服务：监听器和实例

下一篇： MyBatis -- generator 逆向工程

微博抓取尝试

更换手机号码新浪微博的手机绑定如何解除

微博运营实战干货分享 3年微博运营经验总结

微博账号登陆时提示帐号异常需要发送短信验证？

盘点网络上盛传的"微博界八大杀器" 你造吗？

2015最新腾讯微博会员开通方法分享复活卡永久无需手机绿钻

2015腾讯微博会员怎么卡永久方法带成长值和微博红名等功能亲测

PHP配合fiddler抓包抓取微信指数小程序数据的实现方法分析

360浏览器微博提醒如何设置 360浏览器微博提醒设置教程

新浪微博网页加载卡死或者错误的解决方法

新浪微博扫一扫在哪？新浪微博扫描二维码使用方法

微博抓取尝试

更换手机号码新浪微博的手机绑定如何解除

微博运营实战干货分享 3年微博运营经验总结

微博账号登陆时提示帐号异常需要发送短信验证？

盘点网络上盛传的"微博界八大杀器" 你造吗？

2015最新腾讯微博会员开通方法分享 复活卡永久 无需手机绿钻

2015腾讯微博会员怎么卡永久方法 带成长值和微博红名等功能 亲测

PHP配合fiddler抓包抓取微信指数小程序数据的实现方法分析

360浏览器微博提醒如何设置 360浏览器微博提醒设置教程

新浪微博网页加载卡死或者错误的解决方法

新浪微博扫一扫在哪？新浪微博扫描二维码使用方法

2015最新腾讯微博会员开通方法分享复活卡永久无需手机绿钻

2015腾讯微博会员怎么卡永久方法带成长值和微博红名等功能亲测