Collecting WeChat Official Account Articles with Python
This article shares working example code for collecting WeChat Official Account articles with Python, for your reference. The details are as follows.
Put two files in the same Python subdirectory: 采集公众号文章.py (the collector script) and config.py. The code is as follows:
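Before running, note the dependencies visible from the imports: requests, pymongo, pyquery, and lxml, plus a MongoDB server running locally. A typical setup, assuming pip:

pip install requests pymongo pyquery lxml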
1. 采集公众号文章.py:
from urllib.parse import urlencode
import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
from config import *

# Configure MongoDB
client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

# Request headers. The Cookie below comes from a logged-in Sogou session and has long
# since expired; capture a fresh one from your own browser before running.
headers = {
    'Cookie': 'usid=s-pkm6vw_ac4ktr1; suv=00a75e9078efd9f75a6573ecad0ec883; wuid=aagcxershqaaaaqrgn4soagaaaa=; iploc=cn4414; suid=767beab73220910a000000005aa9e2aa; pgv_pvi=159197184; pgv_si=s8252565504; abtest=0|1521083055|v1; weixinindexvisited=1; sct=1; jsessionid=aaalxqkrp6jjs8ac4hwhw; ppinf=5|1521083238|1522292838|dhj1c3q6mtoxfgnsawvudglkojq6mjaxn3x1bmlxbmftzto2oiuzqsuyoxxjcnq6mta6mtuymta4mzizohxyzwzuawnrojy6jtnbjti5fhvzzxjpzdo0ndpvoxqybhvoaexncs1vlw1zbjmxmmnmskp4ogpzqhdlaxhpbi5zb2h1lmnvbxw; pprdig=tbvf7qlzddmjpcn4jtf3dg8c8nerx-ygdi8kucezn0rtewuhkgu4xmnaxzbakvquswboigl_rd-34abu6vy9jkv7me3bypigydniv2ljuchgco7gk58m9qhrm3aa7nhlhjfvyoaqkqgbsykpatxmnpe3tm57zdlzdpg_8mbmbnq; sgid=23-30671195-avqp42zctqiacybbdvvfwno4; phpsessid=4jjk2a9rv6kq7m50f42r92u3r3; suir=d2df4e12a5a1c3ce1a8ad7f2a5fe18fe; ppmdig=1521087492000000855f9824f94abe82b25d2839135ad3a8; snuid=fef36d3f8882efec4fcf61e68801da49; seccoderight=success; successcount=1|thu, 15 mar 2018 04:23:23 gmt',
    'Host': 'weixin.sogou.com',
    'Referer': 'http://weixin.sogou.com/antispider/?from=%2fweixin%3fquery%3d%e9%a3%8e%e6%99%af%26type%3d2%26page%3d95%26ie%3dutf8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# The proxy starts out as None, i.e. requests go out from the local IP
proxy = None

# Fetch one proxy address from the proxy pool
def get_proxy():
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

# Fetch a page, switching to a proxy once Sogou starts redirecting us
def get_html(url, count=1):
    print('Crawling', url)
    print('Trying count', count)
    global proxy
    if count >= MAX_COUNT:
        print('Tried too many counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 to the antispider page means we are blocked: switch to a proxy and retry
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using proxy', proxy)
                return get_html(url, count + 1)
            else:
                print('Get proxy failed')
                return None
    except ConnectionError as e:
        print('Error occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)

# Fetch one page of search results for the keyword
def get_index(keyword, page):
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html

# Parse the index page and yield the detail-page URLs
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')

# Fetch an article detail page
def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

# Parse the detail page: article title, content, date, account nickname and WeChat ID
def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#post-date').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None

# Save to MongoDB, upserting on the title so reruns do not create duplicates
def save_to_mongo(data):
    result = db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)
    if result.acknowledged:
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo failed', data['title'])

# Main loop: walk the first 100 result pages, parse every article and store it
def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)

if __name__ == '__main__':
    main()
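Note that get_proxy() expects a proxy-pool HTTP service at PROXY_POOL_URL that returns a single host:port string as plain text. If you do not already have such a pool, below is a minimal stand-in sketch; the Flask app and the PROXIES list are illustrative placeholders, not part of the original project, so fill in working proxies of your own.

# proxy_pool_stub.py: hypothetical stand-in for the service at http://127.0.0.1:5000/get
import random
from flask import Flask

app = Flask(__name__)

# Placeholder entries; replace them with real, working HTTP proxies
PROXIES = ['111.111.111.111:8888', '222.222.222.222:8888']

@app.route('/get')
def get():
    # Return one proxy as plain text, which is what get_proxy() reads via response.text
    return random.choice(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)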
2. config.py:
# Settings for crawling Official Account articles
PROXY_POOL_URL = 'http://127.0.0.1:5000/get'  # proxy pool API; returns one proxy as plain text
KEYWORD = '计算机等级二级'  # search keyword; change as needed
MONGO_URI = 'localhost'
MONGO_DB = 'data'
MAX_COUNT = 5  # maximum retries per URL
KEYWORD in config.py is the search term and can be changed as needed. In my tests, running 采集公众号文章.py succeeded; if a run fails because Sogou's anti-spider rate limiting kicks in, try running it a few more times.
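After a successful run you can spot-check what was stored. A quick sketch, assuming the default MONGO_URI and MONGO_DB values from config.py above:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['data']
# Print the title and date of the first few saved articles
for doc in db['articles'].find({}, {'title': 1, 'date': 1}).limit(5):
    print(doc.get('title'), doc.get('date'))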
That's all for this article. I hope it helps with your learning, and thanks for your continued support.