暂存2

程序员文章站 2022-04-07 14:25:38

import timeimport requestsimport jsonimport reimport xlwtfrom bs4 import BeautifulSoupdef main(): list = [] num = 1 while len(list) <= 1000 and num <=50: url = 'https://m.weibo.cn/api/comments/show?id=yadWY3t7v&page=' +...

import time
import requests
import json
import re
import xlwt
from bs4 import BeautifulSoup
def main(weibo_id='yadWY3t7v', max_comments=1000, max_pages=50, delay=3):
    """Scrape the comments of one Weibo post and export them to Excel.

    Comment pages are requested one after another from the m.weibo.cn
    comments API, each page is handed to ``getList`` for parsing, and the
    accumulated rows are finally written out by ``getExcel``.

    Args:
        weibo_id: Identifier of the post, used as the ``id`` query parameter.
        max_comments: Paging stops once MORE than this many rows have been
            collected (the check is ``len(rows) <= max_comments``).
        max_pages: Highest page number that will be requested.
        delay: Seconds to sleep after every page request.
    """
    comments = []
    for page in range(1, max_pages + 1):
        if len(comments) > max_comments:
            break
        url = ('https://m.weibo.cn/api/comments/show?id=' + str(weibo_id)
               + '&page=' + str(page))
        html = getHTMLTest(url)
        # Pause between page requests so the API is not hit back-to-back.
        time.sleep(delay)
        getList(comments, html)
        # Progress output: the page just processed and the running row count.
        print(url)
        print(len(comments))
    getExcel(comments)

def getHTMLTest(url, timeout=10):        # fetch the page source
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        The decoded response text, or ``""`` when the request fails
        (connection error, timeout, or a non-2xx status).
    """
    # Request headers, including the login cookie.
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
        # Paste the cookie of your own logged-in Weibo session here (copy it
        # from the browser dev tools, F12); the author's cookie is not shared.
        'Cookie': ""

        }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Decode using the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat an empty string as "no data".
        return ""

def _profile_field(profile_html, label):
    """Return the value following ``label:`` in a profile HTML fragment.

    The value runs from the colon up to the next ``<``. Raises ValueError
    when the label is not present.
    """
    # Match the label as a literal string. A character class such as
    # "[性别]" would match a single 性 or 别 anywhere earlier in the fragment
    # (e.g. inside the nickname) and return the wrong field.
    match = re.search(re.escape(label) + r"[:：](.*?)<", profile_html)
    if match is None:
        raise ValueError("profile field not found: " + label)
    return match.group(1)


def _clean_comment_text(text):
    """Strip the markup from a comment's ``text`` field, keeping ``[name]`` emoticons."""
    text = text.replace('<span class="url-icon">', '').replace("</span>", '').replace("<img alt=", "")
    # After the replacements above an emoticon looks like `[name] src=... />`;
    # drop the attribute tail of EACH emoticon. The match is non-greedy so the
    # text between two emoticons is not swallowed.
    text = re.sub(r"\].*?/>", "]", text)
    # NOTE(review): this removal is greedy (first "<" through last ">"), which
    # also drops the content of a reply link so that "回复:" can be stripped;
    # presumably intended - confirm before changing.
    text = re.sub("<.*>", "", text).replace("回复:", '')
    return text


def _build_comment_row(comment):
    """Build one output row from a comment dict, or return None for empty text.

    Any failure (missing keys, profile page not parseable, ...) is raised to
    the caller, which skips the comment.
    """
    created_at = comment.get('created_at')
    user = comment.get('user')
    user_id = user.get('id')
    # Sex and region are not read from the comment itself; they are taken
    # from the commenter's profile page.
    info_html = getHTMLTest("https://weibo.cn/" + str(user_id) + "/info")
    soup = BeautifulSoup(info_html, "xml")
    # The third <div class="c"> is the one the fields are read from;
    # IndexError here means the page did not have the expected layout.
    profile = str(soup.find_all("div", attrs={"class": "c"})[2])
    sex = _profile_field(profile, "性别")
    # Keep only the part of the region before the first space.
    addr = _profile_field(profile, "地区").split(" ")[0]
    user_name = user.get('screen_name')
    text = _clean_comment_text(comment.get('text'))
    # One entry per "[name]" emoticon (non-greedy, so several emoticons in a
    # comment are not merged into a single string).
    emoticons = re.findall(r"\[(.*?)\]", text)
    print(text)
    if not text:
        return None
    return [user_id, user_name, sex, addr, text, created_at, emoticons]


def getList(list, html):
    """Parse one page of the comments API and append its rows to *list*.

    Args:
        list: Output list, extended in place. Each appended row is
            ``[user id, user name, sex, region, text, time, emoticons]``,
            where ``emoticons`` is itself a list of strings.
        html: Response body of one comments API page, expected to be JSON
            with the comment dicts under ``data -> data``.

    A comment that cannot be processed is skipped. If the page itself cannot
    be parsed, the error is printed and *list* is left unchanged.
    """
    try:
        payload = json.loads(html)
        comments = payload.get('data').get('data')
        for comment in comments:
            try:
                row = _build_comment_row(comment)
            except Exception:
                # Best effort: one bad comment must not abort the page.
                # (KeyboardInterrupt is deliberately NOT caught here.)
                continue
            if row is not None:
                list.append(row)
    except Exception as e:
        print(e)

def getExcel(list, filename='.xls'):
    """Write the collected comment rows to an .xls workbook.

    Args:
        list: Rows to export. Each row is indexed as
            ``[id, user name, sex, region, text, time, emoticons]``; the
            first six values go to columns 0-5 and every entry of the
            ``emoticons`` sequence gets its own column starting at column 6.
        filename: Path of the workbook to save. Defaults to ``'.xls'``;
            pass the name you actually want for the output file.
    """
    excel = xlwt.Workbook(encoding="utf-8")
    sheet = excel.add_sheet("sheet1")

    # Header row.
    titles = ("id", "用户名", "性别", "地区", "评论", "时间", "表情")
    for col, title in enumerate(titles):
        sheet.write(0, col, title)

    # Data rows start at sheet row 1, directly below the header.
    for row_idx, row in enumerate(list, start=1):
        for col, value in enumerate(row[:6]):
            sheet.write(row_idx, col, value)
        # Emoticons spread to the right, one per cell, from column 6 on.
        for offset, emoticon in enumerate(row[6]):
            sheet.write(row_idx, 6 + offset, emoticon)

    excel.save(filename)

本文地址：https://blog.csdn.net/weixin_51705193/article/details/109958406

上一篇：通站长服务器选购指南:做什么类型的网站，选择什么类型的服务器

下一篇：当心robots.txt向黑客泄露了网站的后台和隐私