Scraping Zhihu homepage article data with Python
程序员文章站
2022-05-25 09:05:37
1. Summary of key points, procedure, and open issues
2. Source code
import urllib.request
import gzip
import io
import random
import threading
import time
import pandas
import json
import sqlite3
"""设置代理user_agent"""
user_agent_set = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36']
"""使用IP代理池,此处手动拼接成的ip代理,付费可以购买隧道"""
"""设置header"""
header = {
'Host': 'www.zhihu.com',
'Connection': 'keep-alive',
'x-ab-param': 'qap_question_author=0;ls_video_commercial=0;li_sp_mqbk=0;li_vip_verti_search=0;li_panswer_topic=0;qap_question_visitor= 0;tp_contents=1;zr_expslotpaid=1;zr_intervene=0;li_edu_page=old;pf_profile2_tab=0;li_paid_answer_exp=0;tp_zrec=1;pf_adjust=1;se_ffzx_jushen1=0;top_test_4_liguangyi=1;zr_slotpaidexp=2;tp_dingyue_video=0;tp_topic_style=0;pf_noti_entry_num=2;li_video_section=1',
'x-ab-pb': 'Ck49DPMLJgwPC+QKWAvXC1IM4AsnDEsLrAsgDEwLuQvPC0sMtAo+DJYL7Ao3DAAMmwvhC5oLhgsHDAELUgu1CyIMIQxgCzQM9AtWDA8M3AsSJwAAAAAAAAAAAAEBAQAAAQsAAAAAAQEAAgEAAQEBAQMAAQABAAAAAA==',
'x-api-version': '3.0.53',
'User-Agent':random.choice(user_agent_set),
'x-zse-86': '1.0_a_x0Hh9y6TxpNg28G0YBeAr8r_YpS8YyzBYq67U8cLSp',
'x-requested-with': 'fetch',
'x-zse-83': '3_2.0',
'Accept': '*/*',
'Referer': 'https://www.zhihu.com/',
'Accept-Encoding':' gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '**************************************************'
}
"""全局变量设置"""
result = [] #结果存储变量
thread_set = [] #线程存储变量
threadLock = threading.Lock() #设置线程锁
url_set = [] #请求参数集合
# df_result = pandas.DataFrame() #结果存储表格
result_information = [] #解析结果存储文件
class Tread_zhihu(threading.Thread):
    """Worker thread that fetches one slice of the Zhihu feed"""
    def __init__(self, threadID, name, start_index, end_index):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        """Fetch the data for this thread's URL range"""
        print(str(self.threadID) + str(self.name) + time.ctime())
        # The lock is held around the whole fetch, so the threads effectively run one
        # after another; it also keeps appends to the shared result list safe.
        threadLock.acquire()
        get_data(self.start_index, self.end_index)
        threadLock.release()
        # time.sleep(0.2)
def get_data(start_index, end_index):
    """Request and collect the responses for url_set[start_index:end_index]"""
    for i in range(start_index, end_index):
        data_temp = request_data(url_set[i])
        data_temp1 = json.loads(data_temp)  # convert the JSON text into a Python dict
        print(data_temp1)
        result.append(data_temp1)
    return result
def get_url():
    """Build the list of feed API URLs (16 pages of 6 items each)"""
    for i in range(0, 16):
        page_num_value = i + 2
        after_id_value = 5 + (i * 6)
        url_temp_url = 'https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=c1b01d7c7522284a68ecc9cbbbf3748e&desktop=true&page_number=' + str(page_num_value) + '&limit=6&action=down&after_id=' + str(after_id_value) + '&ad_interval=-1'
        url_set.append(url_temp_url)
    return url_set
def request_data(url):
    """Send the request and return the decompressed response body"""
    req = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(req)
    data = response.read()
    buff = io.BytesIO(data)
    f = gzip.GzipFile(fileobj=buff)  # the body is gzip-compressed (see Accept-Encoding)
    res = f.read().decode('utf-8')
    return res
def data_parsing(result):
    """Parse the JSON responses and extract the fields of interest"""
    for datax in result:  # loop over the responses
        for i in range(len(datax['data'])):  # loop over the items in each response
            try:
                data_title = datax['data'][i]['target']['title']  # article title
            except BaseException:
                data_title = '无题'
            try:
                data_author = datax['data'][i]['target']['author']['name']  # author name
            except BaseException:
                data_author = 'default'
            try:
                data_voteuoCount = datax['data'][i]['target']['voteup_count']  # upvote count
            except BaseException:
                data_voteuoCount = 'default'
            try:
                data_contentIntroduce = datax['data'][i]['target']['excerpt_new']  # excerpt of the item
            except BaseException:
                data_contentIntroduce = 'default'
            information = {'标题': data_title, '作者': data_author, '点赞数': data_voteuoCount, '内容简介': data_contentIntroduce}
            result_information.append(information)
    return result_information
def data_save(result_information):
    """Save the parsed records to an Excel file"""
    title_temp = []
    author_temp = []
    countVote_temp = []
    contentIntroduce_temp = []
    for info in result_information:
        title_temp.append(info['标题'])
        author_temp.append(info['作者'])
        countVote_temp.append(info['点赞数'])
        contentIntroduce_temp.append(info['内容简介'])
    da = {"标题": title_temp, "作者": author_temp, "点赞数": countVote_temp, "内容简介": contentIntroduce_temp}
    df = pandas.DataFrame(da)
    # note: writing .xls needs an xlwt-capable pandas; newer pandas versions only write .xlsx
    df.to_excel('./out.xls', index=False)
def creat_sqlite():
    """Create the SQLite database and result table"""
    con = sqlite3.connect('Zhihudata.db')
    cur = con.cursor()
    sql = 'CREATE TABLE table_one(Serial_number INTEGER PRIMARY KEY AUTOINCREMENT,Title varchar(30) NOT NULL,Author varchar(30) NOT NULL,' \
          'Voteup_count varchar(30) NOT NULL,Content_introduce varchar(30) NOT NULL)'
    try:
        cur.execute(sql)
    except Exception as e:
        print(e)
        print('failed to create the table')
    finally:
        cur.close()
        con.close()
def insert_sqlite(result_information):
    """Insert the parsed records into the SQLite table"""
    con = sqlite3.connect('*************')  # database path masked in the original post
    cur = con.cursor()
    sql = 'insert into table_one(Title,Author,Voteup_count,Content_introduce) ' \
          'values(:标题,:作者,:点赞数,:内容简介)'
    for info in result_information:
        cur.execute(sql, info)
    con.commit()
    cur.close()
    con.close()
if __name__ == '__main__':
    get_url()
    creat_sqlite()  # make sure the table exists before inserting
    concurrent_num = 4  # number of concurrent threads
    thread_circle_num = int(16 / concurrent_num)  # number of URLs handled by each thread
    for i in range(1, concurrent_num + 1):
        # each thread gets an equal, contiguous slice of url_set
        thread = Tread_zhihu(i, "Thread-" + str(i), (i - 1) * thread_circle_num, i * thread_circle_num)
        thread_set.append(thread)
    # print(thread_set)
    """Start the threads and wait for them to finish"""
    for i in range(len(thread_set)):
        thread_set[i].start()
    for i in range(len(thread_set)):
        thread_set[i].join()
    print('data request is over')
    data_parsing(result)  # parse the data
    data_save(result_information)  # save the data to Excel
    insert_sqlite(result_information)  # insert the data into the sqlite3 database
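
A quick way to confirm that both outputs were written is to read them back. The snippet below is a minimal check, assuming the ./out.xls file and the table_one table in Zhihudata.db created above, plus a pandas setup able to read .xls files:

import sqlite3
import pandas

df_check = pandas.read_excel('./out.xls')  # reload the Excel output
print(df_check.shape)  # (rows, columns) that were saved

con = sqlite3.connect('Zhihudata.db')
print(con.execute('SELECT COUNT(*) FROM table_one').fetchone())  # rows stored in SQLite
con.close()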
3. Results
4. Open issues
1. Special characters in the excerpt text are not cleaned up (see the sketch after this list).
2. Once 1,000 rows had been inserted into one SQLite table, no more could be inserted, so the scraped data could not all be stored in the database.
3. Feedback and corrections are welcome.
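
For issue 1, one possible fix is to strip control characters from the excerpt before it is saved. The sketch below is only an illustration; the clean_text helper and its regex are assumptions, not part of the original script:

import re

def clean_text(text):
    """Drop ASCII control characters (except tab and newline) that can break the Excel export."""
    if not isinstance(text, str):
        return text
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)

# e.g. in data_parsing:
# data_contentIntroduce = clean_text(datax['data'][i]['target']['excerpt_new'])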