# Source: 程序员文章站 (programmer article site), published 2022-04-11 17:13:12
# Title: 爬取梨视频 — crawl and download videos from pearvideo.com

import re
import redis
import requests,time

from setting import PAGE,CATEGORY_ID,START,MAIN_URL,DETAIL_URL

from concurrent.futures import ThreadPoolExecutor

from myredis import POOL
class CrawlVideo():
    """Crawler for pearvideo.com.

    Workflow: collect video ids from paginated index pages, fetch each
    video's detail page concurrently to extract the real media URL, then
    download new videos with a shared thread pool. Redis is used as a
    persistent "already downloaded" set keyed by the media URL.
    """

    # Shared pool for both detail-page requests and video downloads.
    pools = ThreadPoolExecutor(100)

    def __init__(self, page=PAGE):
        # page: number of ids per index page (pagination step for `start`).
        self.page = page
        # Producer/consumer buffer: filled by get_detail callbacks,
        # drained by download_video.
        self.video_info_dic_list = []
        self.conn = redis.Redis(connection_pool=POOL)

    def async_download(self, video_dic):
        """Download one video unless its link is already recorded in Redis.

        video_dic: {"title": str, "video_link": str} produced by get_detail.
        Best-effort: network failures are skipped, not raised (runs in pool).
        """
        video_link = video_dic["video_link"]
        if self.conn.get(video_link):
            return  # already downloaded in an earlier run
        # NOTE(review): only the first 3 chars of the title are used as the
        # filename (original behavior) — distinct videos may overwrite each
        # other if titles share a prefix.
        video_name = video_dic["title"][:3]
        try:
            response = requests.get(video_link, timeout=30)
        except requests.RequestException:
            return  # skip this video; don't kill the worker thread
        if response.status_code == 200:
            with open("%s.mp4" % video_name, "wb") as f:
                f.write(response.content)
            # Mark as done only after the file is fully written, so a
            # failed write is retried on the next run.
            self.conn.set(video_link, video_link)

    def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
        """Crawl `num` video ids starting at page offset `start`, then
        submit one download task per resolved detail page."""
        crawl_ids_list = self.crawl_videolist(category_id, start, num)
        print(len(crawl_ids_list))
        self.get_video_info(crawl_ids_list)
        submitted = 0
        idle_ticks = 0
        while submitted < len(crawl_ids_list):
            if self.video_info_dic_list:
                idle_ticks = 0
                self.pools.submit(self.async_download,
                                  self.video_info_dic_list.pop())
                submitted += 1
            else:
                # Detail pages still in flight — wait briefly. Bail out
                # after ~60s with no progress: a failed detail fetch never
                # appends to the buffer, and the original code hung forever.
                idle_ticks += 1
                if idle_ticks > 300:
                    break
                time.sleep(0.2)

    def get_video_ids(self, category_id, start):
        """Fetch one index page and return the video ids found on it.

        Returns [] on any request failure. (Bug fix: the original returned
        None here, which crashed crawl_videolist's extend().)
        """
        main_url = MAIN_URL.format(category_id, start)
        try:
            response = requests.get(main_url, timeout=30)
            return re.findall(r'<a href="(video_\d+)"', response.text)
        except requests.RequestException:
            return []

    # Collect the id list for individual videos; detail requests are made
    # from this list.
    def crawl_videolist(self, category_id, start, num):
        """Walk as many index pages as needed and return exactly `num` ids
        (fewer if the site returned fewer)."""
        crawl_ids_list = []
        for _ in range(self.get_page_num(num)):
            crawl_ids_list.extend(self.get_video_ids(category_id, start))
            start += self.page
        # Trim last-page overshoot in one slice (was an O(n) pop loop).
        return crawl_ids_list[:num]

    def get_detail(self, obj):
        """Future callback: parse title and media URL out of a finished
        detail-page response and queue it for download."""
        response = obj.result()
        title_match = re.search('<title>(.*?)</title>', response.text)
        link_match = re.search('srcUrl="(.*?)"', response.text)
        if title_match is None or link_match is None:
            # Page layout changed or request was redirected — skip instead
            # of raising AttributeError inside the callback (original bug:
            # .group(1) on None, silently swallowed by the Future).
            return
        self.video_info_dic_list.append({
            "title": title_match.group(1),
            "video_link": link_match.group(1),
        })

    def async_request(self, url, video_addr):
        """Fetch one detail page; runs inside the thread pool."""
        return requests.get(url.format(video_addr), timeout=30)

    def get_video_info(self, video_id_list):
        """Submit one detail-page request per video id; results flow into
        self.video_info_dic_list via the get_detail callback."""
        url = DETAIL_URL
        try:
            for video_addr in video_id_list:
                obj = self.pools.submit(self.async_request, url, video_addr)
                obj.add_done_callback(self.get_detail)
        except Exception as e:
            print(e)

    def get_page_num(self, num):
        """Return how many index pages are needed to cover `num` ids
        (ceiling division by self.page; preserves 0 pages for num == 0)."""
        pages, remainder = divmod(num, self.page)
        return pages + 1 if remainder else pages



# Run the crawl only when executed as a script — importing this module
# must not trigger network downloads as a side effect.
if __name__ == "__main__":
    crawl = CrawlVideo()
    crawl.download_video(start=1, num=2)