欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员与计算机编程知识!
您现在的位置是: 首页

python爬取微博转发以及转发后的点赞数、转发人信息

程序员文章站 2022-05-02 20:42:24
...

延续上一篇博客,这一篇爬取微博转发人的一些相关数据。数据的解析没什么太大的难度,找到翻页规律就行。不多说,直接贴代码:

# -*- coding:utf-8 -*-
__author__ = 'TengYu'
import requests
import xlwt
import  re
import json
import time

# HTTP headers for m.weibo.cn requests; replace the placeholder values with a
# real browser User-Agent string and a logged-in session Cookie before running.
headers = {'User-Agent': 'Your-agent',  # fixed key typo: was 'User-agert', so the UA header was never sent
           'Cookie': 'Your-Cookie'}


# Utility class that scrubs scraped weibo text: strips image tags, reply
# prefixes, @-mention chains, anchor links and any leftover HTML markup.
class Tool:
    deleteImg = re.compile(r'<img.*?>')
    newLine = re.compile(r'<tr>|<div>|</tr>|</div>')
    deleteAite = re.compile(r'//.*?:')
    deleteAddr = re.compile(r"<a.*?>.*?</a>|<a href='https:")
    deleteTag = re.compile(r'<.*?>')
    deleteWord = re.compile(r'回复@|回覆@|回覆|回复')

    @classmethod
    def replace(cls, x):
        """Return *x* with markup and boilerplate removed and whitespace trimmed.

        Patterns are applied in a fixed order (reply prefixes first, generic
        tags last) so that specific constructs are removed before the
        catch-all ``<.*?>`` pattern runs.
        """
        for pattern in (cls.deleteWord, cls.deleteImg, cls.deleteAite,
                        cls.deleteAddr, cls.newLine, cls.deleteTag):
            x = pattern.sub('', x)
        return x.strip()



class zhuanfa(object):
    """Crawl the repost timeline of one weibo post (id hard-coded in the URL)
    and save each reposter's id, name, time, cleaned text and like count to
    both a plain-text file and an Excel sheet."""

    def get_zhuanfa(self):
        File = open('filename.txt', "w")
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        sheet.write(0, 0, 'id')
        sheet.write(0, 1, 'name')
        sheet.write(0, 2, 'time')
        sheet.write(0, 3, 'text')
        sheet.write(0, 4, 'likes')
        count = 0  # rows written so far; doubles as the next xls row index
        i = 0      # current page number of the repost-timeline API
        base_url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=4303334066243259&page='
        while count < 1000:
            i = i + 1
            url = base_url + str(i)
            print(url)
            try:
                response = requests.get(url, headers=headers)
                resjson = json.loads(response.text)
                dataset = resjson.get('data')
                # Stop when the API has no more pages (or returns an error
                # payload without 'data'); the original code would loop
                # forever requesting ever-higher page numbers here.
                if not dataset or not dataset.get('data'):
                    break
                data = dataset.get('data')
                for temp in data:
                    user = temp.get('user')
                    text = Tool.replace(temp.get('text'))
                    userid = user.get('id')
                    screen_name = user.get('screen_name')
                    created_at = temp.get('created_at')
                    attitudes_count = temp.get('attitudes_count')
                    count += 1
                    File.write(text.encode('utf-8') + '\n')
                    sheet.write(count, 0, userid)
                    sheet.write(count, 1, screen_name.encode('utf-8'))
                    sheet.write(count, 2, created_at.encode('utf-8'))
                    sheet.write(count, 3, text.encode('utf-8'))
                    sheet.write(count, 4, attitudes_count)
                print ("已经获取" + str(count) + "条数据")
                time.sleep(3)  # throttle requests to avoid being blocked
            except Exception as e:
                # Report and stop on any persistent failure (network error,
                # ban, schema change) instead of silently retrying forever.
                print(e)
                break
        File.close()
        excel.save('filename.xls')

# Script entry point: build the crawler and run it.
if __name__ == '__main__':
    crawler = zhuanfa()
    crawler.get_zhuanfa()

尊重原作,转载请注明,转载自:https://blog.csdn.net/kr2563

相关标签: python 爬虫