Scraping Zhihu homepage article data with Python
程序员文章站
2022-05-25 09:05:37
1. Summary of key points, procedure, and open issues
2. Source code
import urllib.request
import gzip
import io
import random
import threading
import time
import pandas
import json
import sqlite3
"""设置代理user_agent"""
user_agent_set = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36']
"""使用IP代理池,此处手动拼接成的ip代理,付费可以购买隧道"""
"""设置header"""
header = {
'Host': 'www.zhihu.com',
'Connection': 'keep-alive',
'x-ab-param': 'qap_question_author=0;ls_video_commercial=0;li_sp_mqbk=0;li_vip_verti_search=0;li_panswer_topic=0;qap_question_visitor= 0;tp_contents=1;zr_expslotpaid=1;zr_intervene=0;li_edu_page=old;pf_profile2_tab=0;li_paid_answer_exp=0;tp_zrec=1;pf_adjust=1;se_ffzx_jushen1=0;top_test_4_liguangyi=1;zr_slotpaidexp=2;tp_dingyue_video=0;tp_topic_style=0;pf_noti_entry_num=2;li_video_section=1',
'x-ab-pb': 'Ck49DPMLJgwPC+QKWAvXC1IM4AsnDEsLrAsgDEwLuQvPC0sMtAo+DJYL7Ao3DAAMmwvhC5oLhgsHDAELUgu1CyIMIQxgCzQM9AtWDA8M3AsSJwAAAAAAAAAAAAEBAQAAAQsAAAAAAQEAAgEAAQEBAQMAAQABAAAAAA==',
'x-api-version': '3.0.53',
'User-Agent':random.choice(user_agent_set),
'x-zse-86': '1.0_a_x0Hh9y6TxpNg28G0YBeAr8r_YpS8YyzBYq67U8cLSp',
'x-requested-with': 'fetch',
'x-zse-83': '3_2.0',
'Accept': '*/*',
'Referer': 'https://www.zhihu.com/',
'Accept-Encoding':' gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '**************************************************'
}
"""全局变量设置"""
result = [] #结果存储变量
thread_set = [] #线程存储变量
threadLock = threading.Lock() #设置线程锁
url_set = [] #请求参数集合
# df_result = pandas.DataFrame() #结果存储表格
result_information = [] #解析结果存储文件
class Tread_zhihu(threading.Thread):
    """Worker thread that fetches one slice of the Zhihu feed"""
    def __init__(self, threadID, name, start_index, end_index):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        """Fetch the data for this thread's URL range"""
        print(str(self.threadID) + str(self.name) + time.ctime())
        # The lock is held around the whole fetch, so the threads effectively run one
        # after another; it also keeps appends to the shared result list safe.
        threadLock.acquire()
        get_data(self.start_index, self.end_index)
        threadLock.release()
        # time.sleep(0.2)
def get_data(start_index, end_index):
    """Request and collect the responses for url_set[start_index:end_index]"""
    for i in range(start_index, end_index):
        data_temp = request_data(url_set[i])
        data_temp1 = json.loads(data_temp)  # convert the JSON text into a Python dict
        print(data_temp1)
        result.append(data_temp1)
    return result
def get_url():
    """Build the list of feed API URLs (16 pages of 6 items each)"""
    for i in range(0, 16):
        page_num_value = i + 2
        after_id_value = 5 + (i * 6)
        url_temp_url = 'https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=c1b01d7c7522284a68ecc9cbbbf3748e&desktop=true&page_number=' + str(page_num_value) + '&limit=6&action=down&after_id=' + str(after_id_value) + '&ad_interval=-1'
        url_set.append(url_temp_url)
    return url_set
def request_data(url):
    """Send the request and return the decompressed response body"""
    req = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(req)
    data = response.read()
    buff = io.BytesIO(data)
    f = gzip.GzipFile(fileobj=buff)  # the body is gzip-compressed (see Accept-Encoding)
    res = f.read().decode('utf-8')
    return res
def data_parsing(result):
    """Parse the JSON responses and extract the fields of interest"""
    for datax in result:  # loop over the responses
        for i in range(len(datax['data'])):  # loop over the items in each response
            try:
                data_title = datax['data'][i]['target']['title']  # article title
            except BaseException:
                data_title = '无题'
            try:
                data_author = datax['data'][i]['target']['author']['name']  # author name
            except BaseException:
                data_author = 'default'
            try:
                data_voteuoCount = datax['data'][i]['target']['voteup_count']  # upvote count
            except BaseException:
                data_voteuoCount = 'default'
            try:
                data_contentIntroduce = datax['data'][i]['target']['excerpt_new']  # excerpt of the item
            except BaseException:
                data_contentIntroduce = 'default'
            information = {'标题': data_title, '作者': data_author, '点赞数': data_voteuoCount, '内容简介': data_contentIntroduce}
            result_information.append(information)
    return result_information
def data_save(result_information):
    """Save the parsed records to an Excel file"""
    title_temp = []
    author_temp = []
    countVote_temp = []
    contentIntroduce_temp = []
    for info in result_information:
        title_temp.append(info['标题'])
        author_temp.append(info['作者'])
        countVote_temp.append(info['点赞数'])
        contentIntroduce_temp.append(info['内容简介'])
    da = {"标题": title_temp, "作者": author_temp, "点赞数": countVote_temp, "内容简介": contentIntroduce_temp}
    df = pandas.DataFrame(da)
    # note: writing .xls needs an xlwt-capable pandas; newer pandas versions only write .xlsx
    df.to_excel('./out.xls', index=False)
def creat_sqlite():
    """Create the SQLite database and result table"""
    con = sqlite3.connect('Zhihudata.db')
    cur = con.cursor()
    sql = 'CREATE TABLE table_one(Serial_number INTEGER PRIMARY KEY AUTOINCREMENT,Title varchar(30) NOT NULL,Author varchar(30) NOT NULL,' \
          'Voteup_count varchar(30) NOT NULL,Content_introduce varchar(30) NOT NULL)'
    try:
        cur.execute(sql)
    except Exception as e:
        print(e)
        print('failed to create the table')
    finally:
        cur.close()
        con.close()
def insert_sqlite(result_information):
    """Insert the parsed records into the SQLite table"""
    con = sqlite3.connect('*************')  # database path masked in the original post
    cur = con.cursor()
    sql = 'insert into table_one(Title,Author,Voteup_count,Content_introduce) ' \
          'values(:标题,:作者,:点赞数,:内容简介)'
    for info in result_information:
        cur.execute(sql, info)
    con.commit()
    cur.close()
    con.close()
if __name__ == '__main__':
    get_url()
    creat_sqlite()  # make sure the table exists before inserting
    concurrent_num = 4  # number of concurrent threads
    thread_circle_num = int(16 / concurrent_num)  # number of URLs handled by each thread
    for i in range(1, concurrent_num + 1):
        # each thread gets an equal, contiguous slice of url_set
        thread = Tread_zhihu(i, "Thread-" + str(i), (i - 1) * thread_circle_num, i * thread_circle_num)
        thread_set.append(thread)
    # print(thread_set)
    """Start the threads and wait for them to finish"""
    for i in range(len(thread_set)):
        thread_set[i].start()
    for i in range(len(thread_set)):
        thread_set[i].join()
    print('data request is over')
    data_parsing(result)  # parse the data
    data_save(result_information)  # save the data to Excel
    insert_sqlite(result_information)  # insert the data into the sqlite3 database
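
A quick way to confirm that both outputs were written is to read them back. The snippet below is a minimal check, assuming the ./out.xls file and the table_one table in Zhihudata.db created above, plus a pandas setup able to read .xls files:

import sqlite3
import pandas

df_check = pandas.read_excel('./out.xls')  # reload the Excel output
print(df_check.shape)  # (rows, columns) that were saved

con = sqlite3.connect('Zhihudata.db')
print(con.execute('SELECT COUNT(*) FROM table_one').fetchone())  # rows stored in SQLite
con.close()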
3. Results
4. Open issues
1. Special characters in the excerpt text are not cleaned up (see the sketch after this list).
2. Once 1,000 rows had been inserted into one SQLite table, no more could be inserted, so the scraped data could not all be stored in the database.
3. Feedback and corrections are welcome.
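
For issue 1, one possible fix is to strip control characters from the excerpt before it is saved. The sketch below is only an illustration; the clean_text helper and its regex are assumptions, not part of the original script:

import re

def clean_text(text):
    """Drop ASCII control characters (except tab and newline) that can break the Excel export."""
    if not isinstance(text, str):
        return text
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)

# e.g. in data_parsing:
# data_contentIntroduce = clean_text(datax['data'][i]['target']['excerpt_new'])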