欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

python知乎首页文章数据爬取

程序员文章站 2022-05-25 09:05:37
...

一、知识点总结和操作步骤以及现存问题

python知乎首页文章数据爬取

二、源码展示

import urllib.request
import gzip
import io
import random
import threading
import time
import pandas
import json
import sqlite3

"""设置代理user_agent"""
user_agent_set = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                  'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
                  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
                  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36']

"""使用IP代理池,此处手动拼接成的ip代理,付费可以购买隧道"""

"""设置header"""
header = {
'Host': 'www.zhihu.com',
'Connection': 'keep-alive',
'x-ab-param': 'qap_question_author=0;ls_video_commercial=0;li_sp_mqbk=0;li_vip_verti_search=0;li_panswer_topic=0;qap_question_visitor= 0;tp_contents=1;zr_expslotpaid=1;zr_intervene=0;li_edu_page=old;pf_profile2_tab=0;li_paid_answer_exp=0;tp_zrec=1;pf_adjust=1;se_ffzx_jushen1=0;top_test_4_liguangyi=1;zr_slotpaidexp=2;tp_dingyue_video=0;tp_topic_style=0;pf_noti_entry_num=2;li_video_section=1',
'x-ab-pb': 'Ck49DPMLJgwPC+QKWAvXC1IM4AsnDEsLrAsgDEwLuQvPC0sMtAo+DJYL7Ao3DAAMmwvhC5oLhgsHDAELUgu1CyIMIQxgCzQM9AtWDA8M3AsSJwAAAAAAAAAAAAEBAQAAAQsAAAAAAQEAAgEAAQEBAQMAAQABAAAAAA==',
'x-api-version': '3.0.53',
'User-Agent':random.choice(user_agent_set),
'x-zse-86': '1.0_a_x0Hh9y6TxpNg28G0YBeAr8r_YpS8YyzBYq67U8cLSp',
'x-requested-with': 'fetch',
'x-zse-83': '3_2.0',
'Accept': '*/*',
'Referer': 'https://www.zhihu.com/',
'Accept-Encoding':' gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '**************************************************'
}

"""全局变量设置"""
result = [] #结果存储变量
thread_set = []  #线程存储变量
threadLock = threading.Lock()  #设置线程锁
url_set = []  #请求参数集合
# df_result = pandas.DataFrame()  #结果存储表格
result_information = []  #解析结果存储文件

class Tread_zhihu(threading.Thread):
    """Worker thread that fetches the url_set slice [start_index, end_index).

    Fix: the original run() held the global threadLock around the entire
    get_data() call, which serialized every worker — the threads provided no
    concurrency at all. The lock is dropped: in CPython, list.append on the
    shared result list is atomic, so concurrent workers are safe without it.
    """
    def __init__(self, threadID, name, start_index, end_index):
        threading.Thread.__init__(self)
        self.threadID = threadID          # numeric id, used only for logging
        self.name = name                  # thread name, e.g. "Thread-1"
        self.start_index = start_index    # first url_set index (inclusive)
        self.end_index = end_index        # last url_set index (exclusive)

    def run(self):
        """Fetch this worker's slice of URLs into the shared result list."""
        print(str(self.threadID)+str(self.name)+time.ctime())
        get_data(self.start_index, self.end_index)

def get_data(start_index, end_index):
    """Download and JSON-decode url_set[start_index:end_index].

    Each decoded page is printed and appended to the shared ``result`` list;
    the (global) list is also returned for convenience.
    """
    for url in url_set[start_index:end_index]:
        page = json.loads(request_data(url))  # json text -> python dict
        print(page)
        result.append(page)
    return result

def get_url():
   """获得url链接集合"""
   for i in range(0,16):
      page_num_value = i+2
      after_id_value = 5 + (i*6)
      url_temp_url = 'https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=c1b01d7c7522284a68ecc9cbbbf3748e&desktop=true&page_number='+ str(page_num_value) +'&limit=6&action=down&after_id='+ str(after_id_value) +'&ad_interval=-1'
      url_set.append(url_temp_url)
   return url_set

def request_data(url):
    """Fetch one API page and return the response body decoded as UTF-8 text.

    Fixes over the original:
    - the HTTP response is always closed (the original leaked it);
    - the body is gunzipped only when the server actually reports gzip
      Content-Encoding (the original gunzipped unconditionally and crashed
      on identity/deflate responses).
    """
    req = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(req)
    try:
        raw = response.read()
        content_encoding = response.headers.get('Content-Encoding', '')
    finally:
        response.close()
    if 'gzip' in content_encoding:
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')

def data_parsing(result):
    """Extract title/author/vote-count/excerpt from every fetched feed page.

    Appends one dict per feed item to the global ``result_information`` list
    and returns that list.

    Fixes over the original:
    - the inner loop iterated ``range(len(datax))`` — the number of KEYS in
      the page dict — instead of the number of entries in ``datax['data']``;
    - the except-arms for vote count and excerpt mistakenly assigned
      ``data_author``, leaving ``data_voteuoCount``/``data_contentIntroduce``
      undefined (NameError on the first failure) or stale afterwards.
    Missing fields now fall back via dict.get instead of broad exception
    swallowing; defaults match the original intent ('无题' for the title,
    'default' for everything else).
    """
    for page in result:                      # one decoded API page per entry
        for item in page.get('data', []):    # one feed item per entry
            target = item.get('target', {})
            title = target.get('title', '无题')
            author = (target.get('author') or {}).get('name', 'default')
            votes = target.get('voteup_count', 'default')
            excerpt = target.get('excerpt_new', 'default')
            information = {'标题': title, '作者': author, '点赞数': votes, '内容简介': excerpt}
            result_information.append(information)
    return result_information

def data_save(result_information):
    """Write the parsed records to ./out.xls (one row per article).

    The manual column-by-column transposition is replaced by constructing the
    DataFrame directly from the list of dicts; the explicit ``columns`` list
    keeps the same 4-column layout even when the record list is empty.

    NOTE(review): the legacy .xls format requires the xlwt engine, which
    modern pandas no longer bundles — TODO confirm the environment, or switch
    downstream consumers to .xlsx.
    """
    columns = ["标题", "作者", "点赞数", "内容简介"]
    df = pandas.DataFrame(result_information, columns=columns)
    df.to_excel('./out.xls', index=False)

def creat_sqlite():
    """创建数据库sqlite"""
    con = sqlite3.connect('Zhihudata.db')
    cur = con.cursor()
    sql = 'CREATE TABLE table_one(Serial_number INTEGER PRIMARY KEY AUTOINCREMENT,Title varchar(30) NOT NULL,Author varchar(30) NOT NULL,' \
          'Voteup_count varchar(30) NOT NULL,Content_introduce varchar(30) NOT NULL)'
    try:
        cur.execute(sql)
    except Exception as e:
        print(e)
        print('创表失败')
    finally:
        cur.close()
        con.close()

def insert_sqlite(result_information):
    """Insert the parsed records into table_one of Zhihudata.db.

    Fixes over the original:
    - connects to 'Zhihudata.db' — the same database creat_sqlite creates —
      instead of the redacted placeholder path;
    - commits once after all rows instead of per row;
    - always closes the connection (the original closed only the cursor).
    """
    sql = 'insert into table_one(Title,Author,Voteup_count,Content_introduce)' \
          'values(:标题,:作者,:点赞数,:内容简介)'
    con = sqlite3.connect('Zhihudata.db')
    try:
        # executemany binds each record dict to the named parameters above.
        con.executemany(sql, result_information)
        con.commit()
    finally:
        con.close()

if __name__ == '__main__':
    get_url()                                 # build the 16 request URLs
    concurrent_num = 4                        # number of worker threads
    thread_circle_num = int(16/concurrent_num)  # URLs per thread
    # Evenly partition indices 0..15 across the workers. The original
    # hand-written arithmetic produced [0,5), [5,9), [9,12), [13,16):
    # url_set[12] was never fetched and the slices were uneven.
    for i in range(1, concurrent_num+1):
        start_index = (i-1) * thread_circle_num
        end_index = i * thread_circle_num
        thread_set.append(Tread_zhihu(i, "Thread-"+str(i), start_index, end_index))
    """执行线程和等待线程结束"""
    for worker in thread_set:
        worker.start()
    for worker in thread_set:
        worker.join()
    print('data request is over')
    data_parsing(result)  #解析数据
    data_save(result_information) #存储数据
    insert_sqlite(result_information)  #插入数据至sqlite3数据库

三、成果展示

python知乎首页文章数据爬取

四、现存问题

1.简介内容中的特殊字符没有处理。

2.sqlite 单个表格插入 1000 条数据后便无法继续插入，爬取下来的数据不能完全存入数据库。

3.欢迎大佬批评指正。