欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

微博用户评论爬取

程序员文章站 2022-05-02 20:42:54
...
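# Part 1: collect the mid of every MiuMiu Weibo post. The mid is the unique identifier of a post,
#  which makes it easy to fetch that post's comments directly later on.
#  This pass saves two fields per post: the mid and the comment count. Afterwards, mids that have
#  no comments should be removed from the file to save crawling time.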
#-------------------------------------------------------------------------------------#

#使用selenium模块进行模拟浏览器爬取,在python下直接pip install selenium安装即可;
#另外需要安装chrome或者Firefox浏览器,推荐使用安装chrome浏览器,并且要根据chrome浏览器的相应版本安装chromedriver,
#chromedriver的版本必须要与安装的chrome版本相对应,要不然selenium调浏览器会出错。

from selenium import webdriver
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# File that receives one "<mid>\t<comment count>" row per post.
MID_OUTPUT_PATH = 'D:\\newmid_MiuMiu.txt'
# XPath prefix of the i-th feed item on the profile page; the container id is
# specific to the crawled account/page layout.
FEED_ITEM_XPATH = '//*[@id="Pl_Official_MyProfileFeed__23"]/div/div[{}]'

driver = webdriver.Chrome()
try:
    # Log in through the weibo.com front page.
    driver.get('https://weibo.com')
    time.sleep(5)

    driver.find_element(By.ID, 'loginName').send_keys('******')
    driver.find_element(By.ID, 'loginPassword').send_keys('******')
    driver.find_element(By.ID, 'loginAction').click()
    time.sleep(5)

    # Profile page of the account to crawl; copy the URL in by hand.
    driver.get('https://weibo.com/miumiuofficial?profile_ftype=1&is_all=1#_0')

    for j in range(29):
        print(j)
        # Scroll to the bottom three times so the lazily loaded posts of this
        # page are all rendered before they are read.
        for _ in range(3):
            driver.execute_script('window.scrollTo(0,1000000)')
            time.sleep(10)

        # Open the output once per page; rows written before a failure are
        # still flushed when the ``with`` block exits.
        with open(MID_OUTPUT_PATH, mode='a', encoding='utf-8') as f:
            for i in range(2, 47):
                item_xpath = FEED_ITEM_XPATH.format(i)
                test = driver.find_element(
                    By.XPATH, item_xpath + '/div[1]/div[3]/div[2]/a[1]')
                comment = driver.find_element(
                    By.XPATH,
                    item_xpath + '/div[2]/div/ul/li[3]/a/span/span/span/em[2]').text
                url_mid = test.get_attribute('name')
                print(i)
                f.write('\t'.join((url_mid, str(comment))))
                f.write('\n')

        try:
            driver.find_element(By.LINK_TEXT, '下一页').click()
        except NoSuchElementException:
            # No "next page" link: nothing left to crawl.
            print('no next-page link after page', j)
            break
        time.sleep(10)
finally:
    # Always release the browser process, even when the crawl fails midway.
    driver.quit()


#第二部分进行评论数据爬取,爬取的格式为:评论 时间两个维度进行保存。#
#-------------------------------------------------------------#
# -*- coding: utf-8 -*-
import time
import base64
import rsa
import binascii
import requests
import re
from PIL import Image
import random
from urllib.parse import quote_plus
import http.cookiejar as cookielib
import json
import pandas as pd
import pandas

# User-Agent string sent with the requests made by the login class below.
agent = (
    'mozilla/5.0 (windowS NT 10.0; win64; x64) '
    'appLewEbkit/537.36 (KHTML, likE gecko) '
    'chrome/71.0.3578.98 safari/537.36'
)
# Default request headers shared by the login requests.
headers = dict([('User-Agent', agent)])

class WeiboLogin:
    """Log in to weibo.com through the Sina SSO flow, then hop to m.weibo.cn.

    Cookies collected along the way live in an ``LWPCookieJar`` bound to
    ``cookie_path`` and are written to that file at the end of :meth:`login`.
    """

    def __init__(self, user, password, cookie_path):
        """Create the HTTP session and request the login page once.

        Args:
            user: Account name (e-mail address or phone number).
            password: Plain-text account password.
            cookie_path: File the cookie jar is saved to.
        """
        super().__init__()
        self.user = user
        self.password = password
        self.session = requests.Session()
        self.cookie_path = cookie_path
        # LWPCookieJar can persist cookies to a file and read them back.
        self.session.cookies = cookielib.LWPCookieJar(filename=self.cookie_path)
        self.index_url = "http://weibo.com/login.php"
        self.session.get(self.index_url, headers=headers, timeout=2)
        self.postdata = dict()

    def get_su(self):
        """Return the encoded user name expected in the ``su`` field.

        The name is URL-quoted (the JavaScript client uses
        ``encodeURIComponent``; ``quote_plus`` is used here) and the result
        is base64-encoded.
        """
        username_quote = quote_plus(self.user)
        username_base64 = base64.b64encode(username_quote.encode("utf-8"))
        return username_base64.decode("utf-8")

    @staticmethod
    def _parse_jsonp(text):
        """Return the JSON object wrapped in a ``callback({...})`` response.

        Raises:
            ValueError: If ``text`` has no parenthesised payload or the
                payload is not valid JSON.
        """
        start = text.find("(")
        end = text.rfind(")")
        if start == -1 or end < start:
            raise ValueError("not a JSONP response: %r" % text[:80])
        return json.loads(text[start + 1:end])

    def get_server_data(self, su):
        """Run the pre-login request and return its decoded payload.

        The caller reads ``servertime``, ``nonce``, ``pubkey``, ``rsakv``
        and (for the captcha path) ``pcid`` from the returned dict.

        Args:
            su: Encoded user name as produced by :meth:`get_su`.
        """
        pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
        pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_="
        pre_url = pre_url + str(int(time.time() * 1000))
        pre_data_res = self.session.get(pre_url, headers=headers)
        # The body is remote data: parse it as JSON instead of eval()-ing it,
        # which would execute arbitrary code and fails on true/false/null.
        return self._parse_jsonp(pre_data_res.content.decode("utf-8"))

    def get_password(self, servertime, nonce, pubkey):
        """RSA-encrypt the password and return it as hex-encoded bytes.

        Args:
            servertime: ``servertime`` value from the pre-login payload.
            nonce: ``nonce`` value from the pre-login payload.
            pubkey: Hexadecimal RSA modulus from the pre-login payload.
        """
        rsaPublickey = int(pubkey, 16)
        key = rsa.PublicKey(rsaPublickey, 65537)  # public exponent 65537
        # Plain-text layout taken from the site's JavaScript login code.
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(self.password)
        message = message.encode("utf-8")
        passwd = rsa.encrypt(message, key)
        return binascii.b2a_hex(passwd)

    def get_cha(self, pcid):
        """Download the captcha image to ``cha.jpg`` and try to display it.

        If the image cannot be opened or shown, a hint is printed so the
        user can open the file manually. This method could be replaced by a
        call to a captcha-solving service.

        Args:
            pcid: ``pcid`` value from the pre-login payload.
        """
        cha_url = "https://login.sina.com.cn/cgi/pin.php?r="
        cha_url = cha_url + str(int(random.random() * 100000000)) + "&s=0&p="
        cha_url = cha_url + pcid
        cha_page = self.session.get(cha_url, headers=headers)
        with open("cha.jpg", 'wb') as f:
            f.write(cha_page.content)
        try:
            im = Image.open("cha.jpg")
            im.show()
            im.close()
        except Exception:
            # Showing the image is best-effort only; the file is on disk.
            print(u"请到当前目录下,找到验证码后输入")

    def pre_login(self):
        """Fetch the pre-login data and fill ``self.postdata`` for the login POST.

        Returns:
            The pre-login payload dict from :meth:`get_server_data`.
        """
        su = self.get_su()  # encoded user name
        sever_data = self.get_server_data(su)
        servertime = sever_data["servertime"]
        nonce = sever_data['nonce']
        rsakv = sever_data["rsakv"]
        pubkey = sever_data["pubkey"]
        password_secret = self.get_password(servertime, nonce, pubkey)

        self.postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'useticket': '1',
            'pagerefer': "https://passport.weibo.com",
            'vsnf': '1',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'rsakv': rsakv,
            'sp': password_secret,
            'sr': '1366*768',
            'encoding': 'UTF-8',
            'prelt': '115',
            "cdult": "38",
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'TEXT',  # TEXT or META
        }
        return sever_data

    def _request_ticket(self):
        """POST ``self.postdata`` to the SSO login URL and return the ticket."""
        login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)&_='
        login_url = login_url + str(int(time.time() * 1000))
        login_page = self.session.post(login_url, data=self.postdata, headers=headers)
        return login_page.json()["ticket"]

    def login(self):
        """Run the full login flow and save the session cookies.

        A first attempt is made without a captcha. If it fails for any
        reason, the flow is retried once with a captcha typed in by the user.

        Returns:
            True if the m.weibo.cn front page reports a logged-in session,
            otherwise False.
        """
        import ast  # local import: only needed to decode the redirect literal

        try:
            self.pre_login()
            ticket = self._request_ticket()
        except Exception:
            # Any failure of the captcha-less attempt is treated as
            # "captcha required"; fresh pre-login data is needed for the retry.
            sever_data = self.pre_login()
            self.get_cha(sever_data["pcid"])
            self.postdata['door'] = input(u"请输入验证码")
            ticket = self._request_ticket()

        # Follow the login redirect with the ticket.
        save_pa = r'==-(\d+)-'
        ssosavestate = int(re.findall(save_pa, ticket)[0]) + 3600 * 7
        jump_ticket_params = {
            "callback": "sinaSSOController.callbackLoginStatus",
            "ticket": ticket,
            "ssosavestate": str(ssosavestate),
            "client": "ssologin.js(v1.4.19)",
            "_": str(int(time.time() * 1000)),
        }
        jump_url = "https://passport.weibo.com/wbsso/login"
        jump_headers = {
            "Host": "passport.weibo.com",
            "Referer": "https://weibo.com/",
            "User-Agent": headers["User-Agent"],
        }
        jump_login = self.session.get(jump_url, params=jump_ticket_params, headers=jump_headers)

        uuid_pa = r'"uniqueid":"(.*?)"'
        uuid_res = re.findall(uuid_pa, jump_login.text, re.S)[0]
        web_weibo_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % uuid_res
        self.session.get(web_weibo_url, headers=headers)

        Mheaders = {
            "Host": "login.sina.com.cn",
            "User-Agent": agent,
        }

        # Build the m.weibo.cn login request.
        _rand = str(time.time())
        mParams = {
            "url": "https://m.weibo.cn/",
            "_rand": _rand,
            "gateway": "1",
            "service": "sinawap",
            "entry": "sinawap",
            "useticket": "1",
            "returntype": "META",
            "sudaref": "",
            "_client_version": "0.6.26",
        }
        murl = "https://login.sina.com.cn/sso/login.php"
        mhtml = self.session.get(murl, params=mParams, headers=Mheaders)
        mhtml.encoding = mhtml.apparent_encoding
        mpa = r'replace\((.*?)\);'
        mres = re.findall(mpa, mhtml.text)

        # The captured argument is a quoted string literal coming from the
        # server; decode it without executing it.
        redirect_url = ast.literal_eval(mres[0])

        # Key redirect step. The URL is requested twice, as in the original
        # flow. NOTE(review): confirm whether the second request is required.
        Mheaders["Host"] = "passport.weibo.cn"
        self.session.get(redirect_url, headers=Mheaders)
        self.session.get(redirect_url, headers=Mheaders)

        # After the redirects, check whether m.weibo.cn sees a logged-in user.
        Mheaders["Host"] = "m.weibo.cn"
        Set_url = "https://m.weibo.cn"
        pro = self.session.get(Set_url, headers=Mheaders)
        pa_login = r'isLogin":true,'
        login_res = re.findall(pa_login, pro.text)

        # Keep session (discard-flagged) and expired cookies too, matching
        # how the cookie file is loaded again with ignore_discard=True.
        self.session.cookies.save(ignore_discard=True, ignore_expires=True)
        return bool(login_res)



def weibo_comment(id_mid, cookie_path="Cookie.txt", out_path='D:\\MiuMiu.csv'):
    """Append every comment of one post to a CSV file.

    Pages of ``https://m.weibo.cn/comments/hotflow`` are requested one after
    another, following the ``max_id`` cursor of each response, until the
    response reports ``ok == 0`` or a cursor value repeats. For each comment
    a ``text,date`` row (raw ``text`` and ``created_at`` fields) is appended
    to ``out_path``.

    Args:
        id_mid: mid of the post; used for both the ``id`` and ``mid`` query
            parameters.
        cookie_path: LWP cookie file to load the login cookies from.
        out_path: CSV file the rows are appended to (no header, UTF-8 with
            BOM).
    """
    request_headers = {"user-agent": "mozilla/5.0 (windowS NT 10.0; win64; x64) appLewEbkit/537.36 (KHTML, likE gecko) chrome/71.0.3578.98 safari/537.36"}

    # Load the saved login cookies and hand them to requests as a dict.
    cookies = cookielib.LWPCookieJar(cookie_path)
    cookies.load(ignore_discard=True, ignore_expires=True)
    cookie_dict = requests.utils.dict_from_cookiejar(cookies)

    max_id = ""
    seen_max_ids = set()

    while True:
        # The first request carries no cursor; later ones pass the max_id
        # returned by the previous page.
        params = {"id": str(id_mid), "mid": str(id_mid)}
        if max_id != "":
            params["max_id"] = str(max_id)
        params["max_id_type"] = "0"
        response = requests.get(
            "https://m.weibo.cn/comments/hotflow",
            params=params, headers=request_headers, cookies=cookie_dict)
        comment = response.json()

        if comment['ok'] == 0:
            break
        max_id = comment["data"]["max_id"]
        # A repeated cursor means the pages have started over: stop.
        if max_id in seen_max_ids:
            break
        seen_max_ids.add(max_id)

        # One DataFrame and one append per page instead of one per comment.
        rows = [[item["text"], item["created_at"]]
                for item in comment["data"]["data"]]
        if rows:
            pandas.DataFrame(rows, columns=["text", "date"]).to_csv(
                out_path, header=False, index=False, mode='a',
                encoding="utf_8_sig")
        time.sleep(5)

if __name__ == '__main__':
    # Account credentials and the file the login cookies are stored in.
    username = "******"
    password = "******"
    cookie_path = "Cookie.txt"

    # Log in first so that the cookie file exists for the comment crawler.
    WeiboLogin(username, password, cookie_path).login()

    # First column of the comma-separated mid file holds the post mids.
    mid_column = pd.read_table("D:\\MiuMiu.txt", header=None, sep=',')[0]

    # Crawl from row 88 up to the end of the list.
    first_row = 88
    row_no = first_row
    while row_no < len(mid_column):
        print(row_no)
        weibo_comment(str(mid_column[row_no]))
        row_no += 1