欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

暂存2

程序员文章站 2022-07-09 23:28:46
import timeimport requestsimport jsonimport reimport xlwtfrom bs4 import BeautifulSoupdef main(): list = [] num = 1 while len(list) <= 1000 and num <=50: url = 'https://m.weibo.cn/api/comments/show?id=yadWY3t7v&page=' +...
import time
import requests
import json
import re
import xlwt
from bs4 import BeautifulSoup
def main(weibo_id='yadWY3t7v', max_comments=1000, max_pages=50, delay=3):
    """Crawl the comments of one Weibo post and export them to an Excel file.

    Pages of the mobile comment API are fetched one after another, each page
    is parsed with ``getList`` and the accumulated rows are finally handed to
    ``getExcel``.

    Args:
        weibo_id: Identifier of the post, inserted into the API URL.
        max_comments: Crawling continues while the number of collected rows
            is at most this value, so the final count may exceed it by up to
            one page of comments.
        max_pages: Highest page number that will be requested.
        delay: Seconds to sleep after every page request.
    """
    comments = []  # avoid shadowing the builtin ``list``
    for page in range(1, max_pages + 1):
        if len(comments) > max_comments:
            break
        url = ('https://m.weibo.cn/api/comments/show?id=' + str(weibo_id)
               + '&page=' + str(page))
        html = getHTMLTest(url)
        # Pause between page requests.
        time.sleep(delay)
        getList(comments, html)
        # Progress output: the page just processed and the running row count.
        print(url)
        print(len(comments))
    getExcel(comments)
def getHTMLTest(url, timeout=10):        # fetch the page source
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
        # Paste the cookie of your own logged-in Weibo session here (copy it
        # from the browser developer tools, F12); it is intentionally not
        # shared in this file.
        'Cookie': "",
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat an empty string as "nothing fetched".
        return ""
    # Use the encoding detected from the content rather than the header value.
    r.encoding = r.apparent_encoding
    return r.text
def _profile_field(block_html, label):
    """Return the value following ``label`` and a colon in ``block_html``, or None."""
    # Match the whole label; a character class such as ``[性别]`` would match
    # any single one of its characters instead.
    match = re.search(re.escape(label) + r"[::]\s*([^<]*)<", block_html)
    return match.group(1) if match else None


def _clean_comment_text(raw):
    """Reduce the HTML of a comment to plain text, keeping ``[emoticon]`` markers."""
    # Assumes emoticons arrive as <img alt=[name] .../> (optionally quoted),
    # as the original string replacements imply; keep only the [name] part.
    text = re.sub(r'<img\s+alt="?(\[[^\]]*\])"?[^>]*>', r"\1", raw)
    # Drop links together with their text so that a reply prefix such as
    # "回复<a ...>@user</a>:" collapses to "回复:" and can be removed below.
    text = re.sub(r"<a\b[^>]*>.*?</a>", "", text, flags=re.S)
    # Strip every remaining tag individually (non-greedy) so that the text
    # between two tags is preserved.
    text = re.sub(r"<[^>]*>", "", text)
    return text.replace("回复:", '')


def _build_comment_row(item):
    """Build one output row for a comment dict, or return None to skip it."""
    user = item.get('user')
    user_id = user.get('id')
    # Gender and region are not read from the comment payload; they are taken
    # from the commenter's profile page.
    infohtml = getHTMLTest("https://weibo.cn/" + str(user_id) + "/info")
    soup = BeautifulSoup(infohtml, "xml")
    blocks = soup.find_all("div", attrs={"class": "c"})
    if len(blocks) < 3:
        return None
    # The third <div class="c"> is used as the source of the profile fields.
    info = str(blocks[2])
    sex = _profile_field(info, "性别")
    addr = _profile_field(info, "地区")
    if sex is None or addr is None:
        return None
    # Keep only the part of the region before the first space.
    addr = addr.split(" ")[0]
    text = _clean_comment_text(item.get('text'))
    print(text)
    if not text:
        return None
    # One entry per emoticon; the non-greedy group keeps them separate.
    biaoqing = re.findall(r"\[(.*?)\]", text)
    return [user_id, user.get('screen_name'), sex, addr, text,
            item.get('created_at'), biaoqing]


def getList(list,html):
    """Parse one page of the comment API response and append rows to ``list``.

    Each appended row is
    ``[user id, screen name, gender, region, text, created_at, emoticons]``.
    Comments whose profile page or text cannot be processed are skipped.

    Args:
        list: Mutable sequence that receives the rows (modified in place).
        html: Response body of the comment API, expected to be JSON with the
            comments under ``data.data``.
    """
    try:
        payload = json.loads(html)
    except (TypeError, ValueError) as e:
        # Empty or non-JSON body, e.g. the "" returned for a failed request.
        print(e)
        return
    page = payload.get('data') if isinstance(payload, dict) else None
    comments = page.get('data') if isinstance(page, dict) else None
    if not comments:
        print("no comment data in response")
        return
    for item in comments:
        try:
            row = _build_comment_row(item)
        except Exception:
            # Best effort: one malformed comment must not abort the page, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
        if row is not None:
            list.append(row)

def getExcel(list):
    """Write the collected comment rows to an ``.xls`` workbook.

    Row 0 holds the column titles. Every record fills columns 0-5 with its
    first six values; the items of its seventh element are spread over
    column 6 and the columns after it, one item per cell.

    Args:
        list: Sequence of records as produced by ``getList``.
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("sheet1")

    titles = ("id", "用户名", "性别", "地区", "评论", "时间", "表情")
    for column, title in enumerate(titles):
        worksheet.write(0, column, title)

    for row_number, record in enumerate(list, start=1):
        for column in range(6):
            worksheet.write(row_number, column, record[column])
        # Emoticons start in column 6 and continue to the right.
        for column, emoticon in enumerate(record[6], start=6):
            worksheet.write(row_number, column, emoticon)

    workbook.save('.xls')  # change this to the name you want for the Excel file

本文地址:https://blog.csdn.net/weixin_51705193/article/details/109958406

推荐阅读