Python 抓取搜狗微信公众号文章
程序员文章站
2022-06-14 11:36:30
初学 Python,抓取搜狗微信公众号文章并存入 MySQL
MySQL 表:
代码:
import requests
import json
impor...
初学 Python,抓取搜狗微信公众号文章并存入 MySQL
MySQL 表:
代码:
"""Crawl WeChat official-account articles through Sogou search into MySQL.

For every row of the ``hd_gzh`` table the script searches Sogou's WeChat
index for the account, follows the first hit to the account's profile page,
reads the ``msglist`` JSON embedded in that page and stores every article
that is not yet present in ``gzh_article`` (matched by title).
"""
import html
import json
import re
import socket
import time
from urllib.parse import quote

import pymysql
import requests
from bs4 import BeautifulSoup

# Placeholder connection settings -- replace with real values before running.
DB_HOST = '你的数据库地址'
DB_PORT = 3306  # the original left a bare placeholder here; 3306 is MySQL's default
DB_USER = '用户名'
DB_PASSWORD = '密码'
DB_NAME = '数据库名称'

SOGOU_BASE = 'https://weixin.sogou.com'
WEIXIN_BASE = 'https://mp.weixin.qq.com'
REQUEST_TIMEOUT = 60  # seconds, applied to every HTTP request
REQUEST_DELAY = 10    # seconds to pause between requests (crude rate limiting)

# Compiled once; both patterns are applied to the text of a <script> element.
PROFILE_URL_RE = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
MSG_LIST_RE = re.compile(r"var msglist = (.*?);$", re.MULTILINE | re.DOTALL)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) '
                  'Gecko/20100101 Firefox/65.0',
}

# NOTE: the commented-out proxy-tunnel configuration (which embedded
# credentials) was removed. If a proxy is needed, pass ``proxies=`` to
# ``requests.get`` in ``_fetch_soup`` and load the credentials from outside
# the source file.

socket.setdefaulttimeout(REQUEST_TIMEOUT)

# Open the connection and a cursor shared by the helpers below.
conn = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER,
                       password=DB_PASSWORD, database=DB_NAME, charset='utf8')
cursor = conn.cursor()


def checkdata(name):
    """Return True when no article titled *name* is stored yet.

    The title is passed as a query parameter, so quotes inside it can
    neither break nor inject into the statement.
    """
    found = cursor.execute("select 1 from gzh_article where title = %s limit 1",
                           (name,))
    # End the implicit read transaction, as the original code did.
    conn.commit()
    return found == 0


def insertdata(title, picture, author, content):
    """Insert one article into ``gzh_article`` and commit.

    All values must be passed raw (unescaped); the driver escapes them.
    """
    cursor.execute(
        "insert into gzh_article (title,picture,author,content) "
        "values (%s, %s, %s, %s)",
        (title, picture, author, content),
    )
    conn.commit()
    print("插入一条数据")


def _fetch_soup(url):
    """Download *url* and return it parsed with the built-in HTML parser."""
    # requests never times out unless told to, so pass the limit explicitly.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _script_capture(soup, pattern):
    """Return group 1 of *pattern* from the first matching <script>, or None."""
    script = soup.find("script", string=pattern)
    if script is None:
        return None
    return pattern.search(script.string).group(1)


def _store_article(item):
    """Fetch and store the article described by one msglist entry.

    Returns True when a page was actually downloaded, so the caller knows
    whether a rate-limiting pause is due.
    """
    title = item["title"]
    content_url = item.get("content_url")
    # Check for duplicates *before* downloading to avoid a wasted request.
    if not content_url or not checkdata(title):
        return False
    # The embedded JSON carries HTML-escaped URLs (``&amp;`` for ``&``).
    page = _fetch_soup(WEIXIN_BASE + html.unescape(content_url))
    insertdata(title, item["cover"], item["author"], str(page))
    return True


def crawl_account(name):
    """Store every new article listed on the profile page of account *name*."""
    search_url = (SOGOU_BASE + '/weixin?type=1&s_from=input&query='
                  + quote(str(name)) + '&ie=utf8&_sug_=n&_sug_type_=')
    hits = _fetch_soup(search_url).select('.tit a')
    if not hits:
        print("未找到公众号,跳过: %s" % name)
        return

    # The first hit leads to a page whose script assembles the profile URL.
    # NOTE(review): only the first "url += '...'" fragment is used, as in
    # the original; confirm the page does not split the URL into several.
    profile_url = _script_capture(_fetch_soup(SOGOU_BASE + hits[0]['href']),
                                  PROFILE_URL_RE)
    if profile_url is None:
        print("未能解析公众号主页地址,跳过: %s" % name)
        return

    raw_list = _script_capture(_fetch_soup(profile_url), MSG_LIST_RE)
    if raw_list is None:
        print("未能解析文章列表,跳过: %s" % name)
        return
    messages = json.loads(raw_list)["list"]

    time.sleep(REQUEST_DELAY)
    for message in messages:
        info = message.get("app_msg_ext_info")
        if not info:
            # Entry without article metadata: nothing to store.
            continue
        # One push holds a main article plus optional secondary articles;
        # both carry title / cover / author / content_url.
        for item in [info] + (info.get("multi_app_msg_item_list") or []):
            if _store_article(item):
                time.sleep(REQUEST_DELAY)


try:
    cursor.execute("select * from hd_gzh")
    effect_row = cursor.fetchall()
    for row in effect_row:
        # Column 1 is used as the search term -- presumably the account
        # name; the hd_gzh schema is not shown here.
        crawl_account(row[1])
finally:
    # Always release the database resources, even if a request fails.
    cursor.close()
    conn.close()
print("操作完成")
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。
上一篇: vue 组件高级用法实例详解
下一篇: ASP.NET中常用的优化性能的方法