暂存2
程序员文章站
2022-07-09 23:28:46
import time import requests import json import re import xlwt from bs4 import BeautifulSoup def main(): list = [] num = 1 while len(list) <= 1000 and num <=50: url = 'https://m.weibo.cn/api/comments/show?id=yadWY3t7v&page=' +...
import time
import requests
import json
import re
import xlwt
from bs4 import BeautifulSoup
def main():
    """Collect comments page by page, then export them to a spreadsheet.

    Requests pages 1 through 50 of the comment API at most, stopping early
    once more than 1000 rows have been gathered. Each page is fetched with
    ``getHTMLTest``, parsed into rows by ``getList`` and, after the loop,
    everything collected is handed to ``getExcel``.
    """
    comments = []
    for page in range(1, 51):
        # Same stop condition as "keep going while there are <= 1000 rows".
        if len(comments) > 1000:
            break
        url = 'https://m.weibo.cn/api/comments/show?id=yadWY3t7v&page=' + str(page)
        html = getHTMLTest(url)
        # Pause between page requests before parsing the response.
        time.sleep(3)
        getList(comments, html)
        # Progress output: the page just processed and the running row count.
        print(url)
        print(len(comments))
    getExcel(comments)
def getHTMLTest(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or a non-2xx status code).
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
        # Paste the cookie of your own logged-in Weibo session here (copy it
        # from the browser dev tools, F12); it is intentionally left blank.
        'Cookie': "",
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Let requests guess the charset from the body instead of trusting
        # the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing to parse". Only
        # request failures are swallowed, so Ctrl-C and programming errors
        # still propagate.
        return ""
def getList(list, html):
    """Parse one page of comment JSON and append a row per comment to ``list``.

    For every comment found under ``data.data`` in the JSON document, the
    commenter's profile page (``https://weibo.cn/<id>/info``) is fetched via
    ``getHTMLTest`` to read gender and region. Comments whose text is
    non-empty after cleaning are appended as
    ``[id, user_name, sex, addr, text, time, emoticons]``.

    Args:
        list: Output list, mutated in place.
        html: Raw JSON text of one comment page.

    Returns:
        None. A body that is not valid JSON is reported with ``print`` and
        produces no rows; a document without a ``data.data`` payload produces
        no rows. A comment that cannot be processed (missing user, profile
        page without the expected fields, ...) is skipped.
    """
    try:
        resjson = json.loads(html)
    except (TypeError, ValueError) as e:
        # Empty or non-JSON body, e.g. after a failed request.
        print(e)
        return

    payload = resjson.get('data') if isinstance(resjson, dict) else None
    data = payload.get('data') if isinstance(payload, dict) else None
    if not data:
        return

    for comment in data:
        try:
            ttime = comment.get('created_at')
            user = comment.get('user')
            user_id = user.get('id')
            user_name = user.get('screen_name')

            infourl = "https://weibo.cn/" + str(user_id) + "/info"
            infohtml = getHTMLTest(infourl)
            soup = BeautifulSoup(infohtml, "xml")
            # NOTE(review): assumes the third <div class="c"> of the info
            # page holds the basic profile fields - confirm against the
            # live page layout.
            c = str(soup.find_all("div", attrs={"class": "c"})[2])
            # Match the whole label followed by a colon. A character class
            # such as [性别] matches a single character and could latch onto
            # an earlier occurrence (for example inside the nickname).
            sex = re.findall(r"性别[::](.*?)<", c)[0]
            # Keep only the first space-separated part of the region.
            addr = re.findall(r"地区[::](.*?)<", c)[0].split(" ")[0]

            text = comment.get('text')
            text = text.replace('<span class="url-icon">', '').replace("</span>", '').replace("<img alt=", "")
            # After the replacements above each emoticon reads
            # "[name] src=... />"; drop the attribute tail of every emoticon
            # (non-greedy, so several emoticons in one comment all survive).
            text = re.sub(r"\].*?/>", "]", text)
            # Strip remaining markup; kept greedy on purpose so the
            # "@user" link inside a reply prefix disappears with its tags.
            text = re.sub("<.*>", "", text).replace("回复:", '')
            # One entry per "[...]" emoticon rather than a single fused match.
            biaoqing = re.findall(r"\[(.*?)\]", text)
            print(text)
            if text:
                list.append([user_id, user_name, sex, addr, text, ttime, biaoqing])
        except Exception:
            # Best effort: skip a comment that cannot be processed, but do
            # not swallow KeyboardInterrupt / SystemExit.
            continue
def getExcel(list, filename='.xls'):
    """Write the collected comment rows to an ``.xls`` workbook.

    Args:
        list: Rows shaped ``[id, user_name, sex, addr, text, time,
            emoticons]`` where ``emoticons`` is a sequence of strings.
        filename: Path of the workbook to save. Defaults to ``'.xls'``, the
            name that was previously hard-coded; pass a real file name
            instead of editing the source.

    The first six fields fill columns 0-5. Emoticons are spread over
    column 6 and the columns to its right, one per cell.
    """
    excel = xlwt.Workbook(encoding="utf-8")
    sheet = excel.add_sheet("sheet1")

    # Header row.
    titles = ("id", "用户名", "性别", "地区", "评论", "时间", "表情")
    for col, title in enumerate(titles):
        sheet.write(0, col, title)

    # Data rows start below the header.
    for row_idx, row in enumerate(list, start=1):
        for col in range(6):
            sheet.write(row_idx, col, row[col])
        for offset, emoticon in enumerate(row[6]):
            sheet.write(row_idx, 6 + offset, emoticon)

    excel.save(filename)
本文地址:https://blog.csdn.net/weixin_51705193/article/details/109958406
上一篇: UGNX8.5怎么用动画尺寸命令做简单的运动仿真?
下一篇: 网站吸引蜘蛛抓取的方法