多协程爬取中大微博内容(以及转发数,点赞数,评论数)
程序员文章站
2022-05-02 20:50:21
...
本文是之前文章《微博爬取(Python)——中大微博前100条微博内容以及评论转发点赞数目爬取》的并发(多协程)版本。
代码
import requests
from gevent import monkey
import gevent
monkey.patch_all(select=False)
from pyquery import PyQuery as pq
# HTTP request headers passed to requests.get() for the m.weibo.cn API.
# NOTE(review): values look like they were copied from a desktop Chrome
# DevTools session -- confirm they are still accepted by the server.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': (
        'https://m.weibo.cn/u/1892723783'
        '?uid=1892723783&luicode=10000011'
        '&lfid=1076031892723783&featurecode=20000320'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.87 Safari/537.36'
    ),
    # Marks the request as an XMLHttpRequest (AJAX) call.
    'X-Requested-With': 'XMLHttpRequest',
    'X-DevTools-Emulate-Network-Conditions-Client-Id': 'A20EA5B172E6DC82709D213A40AD0E8F',
}
def get_page(page):
    """Fetch one page of the Weibo timeline and return it as parsed JSON.

    Args:
        page: Integer page number substituted into the ``page`` query
            parameter of the API URL.

    Returns:
        The decoded JSON body when the server answers with status 200.
        ``None`` when the request fails, the status is not 200, or the
        body is not valid JSON.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?uid=1892723783&luicode=10000011&lfid=1076031892723783&featurecode=20000320&type=uid&value=1892723783&containerid=1076031892723783&page=%d' % page
    try:
        # Without a timeout a stalled connection would block this greenlet
        # forever, and the caller's joinall() would never return.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException covers ConnectionError as well as timeouts and
        # other transport-level failures.
        print("Error", e.args)
        return None
    if res.status_code != 200:
        return None
    try:
        return res.json()
    except ValueError as e:
        # A 200 response may still carry a non-JSON body (e.g. an HTML
        # error or login page); treat it like a failed fetch.
        print("Error", e.args)
        return None
def parse_page(json):
    """Yield one summary dict per post contained in an API response.

    Args:
        json: Decoded API response (a dict), or a falsy value when the
            fetch failed. Posts are read from ``json['data']['cards']``,
            each card carrying its post under the ``'mblog'`` key.

    Yields:
        dict with the keys ``'text'`` (post HTML reduced to plain text via
        pyquery), ``'attitudes'``, ``'comments'`` and ``'reposts'`` (taken
        from ``attitudes_count``, ``comments_count`` and ``reposts_count``).
        Nothing is yielded for a falsy response, a response without
        ``data``/``cards``, or cards that have no ``mblog`` payload.
    """
    if not json:
        return
    # `data` or `cards` can be missing or null (e.g. an error payload);
    # treat that as an empty page instead of raising.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Card without a post payload: nothing to extract.
            continue
        yield {
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
def oper(page):
    """Fetch, parse and format one page, storing results in ``data``.

    Each post becomes a string of its text, a newline, and a bracketed
    line with the comment / repost / like counts. The string is stored in
    the module-level ``data`` dict under the key ``page * 20 + index``,
    where ``index`` is the post's position within the page.

    Args:
        page: Page number handed to ``get_page``.
    """
    global data
    posts = parse_page(get_page(page))
    base_key = page * 20
    for index, post in enumerate(posts):
        stats_line = ''.join([
            '【评论数: ', str(post['comments']),
            ' 转发数: ', str(post['reposts']),
            ' 点赞数: ', str(post['attitudes']),
            '】\n\n',
        ])
        data[base_key + index] = post['text'] + '\n' + stats_line
if __name__ == '__main__':
    # Shared result store filled by oper(); keys are page * 20 + index.
    data = {}
    # One greenlet per page, pages 1 through 10, all awaited before writing.
    gevent.joinall([gevent.spawn(oper, page) for page in range(1, 11)])
    with open('weibo.txt', 'w', encoding='utf-8') as f:
        # Greenlets finish in arbitrary order, so dict insertion order is
        # not page order. Write entries in ascending key order so the file
        # follows the page/position sequence encoded in the keys.
        f.write(''.join(data[key] for key in sorted(data)))
上一篇: 2019新零售,能给你带来什么?