欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

抖音小视频爬取(配合mitmproxy和夜神模拟器)

程序员文章站 2022-04-11 18:53:01
...
# -*- coding: utf-8 -*-
import json
import ssl
from urllib.request import Request
import urllib
import sys
import time
import hashlib
import os
from threading import Thread
import logging
from queue import Queue
# SECURITY NOTE(review): this disables TLS certificate verification for every
# default HTTPS context created in this process. It is kept because the script
# sets it deliberately, but it should not be copied into other code.
ssl._create_default_https_context = ssl._create_unverified_context

# Launch command: mitmdump -p 8081 -s douyin1.py

# Search keyword; also used as the name of the per-keyword download folder.
keyword = 'shshs'
# Root folder for downloads and the log file. The backslash is escaped
# explicitly ('\d' is an invalid escape sequence and triggers a warning).
BASE_PATH = 'E:\\douyin'

# Folder where the downloaded videos are saved.
STORE_PATH = '{}\\{}'.format(BASE_PATH, keyword)
# makedirs also creates BASE_PATH when it is missing and does not fail if the
# folder already exists (os.mkdir would raise in both situations).
os.makedirs(STORE_PATH, exist_ok=True)
# Text file that receives one line per queue entry taken by a download worker.
TXT_FILENAME = STORE_PATH + '.txt'

# HTTP headers sent with every video download request.
header = {
    "User-Agent": "com.ss.android.ugc.aweme/251 (Linux; U; Android 4.4.2; zh_CN; MI 6 ; Build/NMF26X; Cronet/58.0.2991.0)",
}

logging.basicConfig(
    filename='{}\\douyin.log'.format(BASE_PATH),
    level=logging.INFO,
    format="%(asctime)s-%(name)s-%(levelno)s-%(lineno)d-%(message)s",
)
logger = logging.getLogger(__name__)
# Bounded queue of "<video url>+<name hash>" strings shared by the response
# hook (producer) and the download workers (consumers).
q = Queue(maxsize=2000)
def response(flow):
    """mitmproxy ``response`` hook: enqueue video links from search responses.

    When the request URL starts with one of the known search endpoints, the
    response body is parsed as JSON and, for every entry of its
    ``aweme_list``, the string ``"<video url>+<md5 of description>"`` is put
    on the module-level queue ``q``. Responses for any other URL are ignored.

    Args:
        flow: the intercepted flow. Only ``flow.request.url`` and
            ``flow.response.text`` are read.
    """
    # Endpoints whose responses are parsed. str.startswith accepts a tuple,
    # so one call replaces the chain of ``or`` tests.
    search_prefixes = (
        'https://aweme.snssdk.com/aweme/v1/search/',
        'https://api.amemv.com/aweme/v1/search/',
        # This prefix previously carried a leading space and therefore could
        # never match any request URL.
        'https://aweme.snssdk.com/aweme/v1/general/search/',
        'https://aweme-hl.snssdk.com/aweme/v1/search/',
    )
    if not flow.request.url.startswith(search_prefixes):
        return

    logger.info(flow.request.url)
    try:
        data = json.loads(flow.response.text)
    except (TypeError, ValueError):
        # Empty or non-JSON body: nothing to extract from this response.
        logger.warning("response body is not valid JSON: %s", flow.request.url)
        return

    # NOTE(review): the general-search endpoint may use a different payload
    # layout; a missing 'aweme_list' is treated as "no videos" instead of
    # raising KeyError - confirm against a captured response.
    aweme_list = data.get('aweme_list') if isinstance(data, dict) else None
    for aweme in aweme_list or []:
        try:
            video_url = aweme['video']['play_addr']['url_list'][0]
            # Every occurrence of "play" in the address becomes "playwm".
            real_url = video_url.replace("play", "playwm")
            video_name = aweme['desc']
            # Hash of the description; used downstream to derive the file
            # name, which is what prevents downloading the same video twice.
            name = hashlib.md5(video_name.encode()).hexdigest()
        except (KeyError, IndexError, TypeError, AttributeError):
            # One malformed entry must not discard the rest of the list.
            logger.warning("skipping malformed aweme entry")
            continue
        q.put(real_url + "+" + name)
    print("The len of queue is **********************************************:",q.qsize())

def _fetch_video(url, filename):
    """Download *url* into *filename* without ever exposing a partial file."""
    import threading

    # Per-thread temporary name, so two workers handling the same video do
    # not write into the same file.
    part_name = '{}.{}.part'.format(filename, threading.get_ident())
    req = Request(url, headers=header)
    try:
        # The timeout keeps a stalled connection from blocking a worker forever.
        with urllib.request.urlopen(req, timeout=30) as resp, \
                open(part_name, 'wb') as out:
            while True:
                chunk = resp.read(64 * 1024)
                if not chunk:
                    break
                out.write(chunk)
        # Publish the finished file in one step.
        os.replace(part_name, filename)
    except Exception:
        try:
            os.remove(part_name)
        except OSError:
            pass
        raise


def download(q):
    """Worker loop: take ``"<url>+<name>"`` entries from *q* and download them.

    Each entry is appended to ``TXT_FILENAME``, then the video is saved as
    ``<STORE_PATH>\\<md5 of name>.mp4`` unless that file already exists.
    A failed download is put back on the queue for another attempt.
    The loop never returns.

    Args:
        q: the queue of entries. ``task_done()`` is called exactly once for
            every ``get()``, so ``q.join()`` can complete.
    """
    while True:
        entry = q.get()
        try:
            try:
                # Record the captured link.
                with open(TXT_FILENAME, 'a') as f:
                    f.write(entry)
                    f.write('\n')
                # Split on the LAST '+': the URL itself may contain '+',
                # while the trailing name is a hex digest and never does.
                url, sep, video_name = entry.rpartition("+")
                if not sep:
                    raise ValueError("malformed queue entry: %r" % (entry,))
            except (OSError, TypeError, AttributeError, ValueError) as e:
                # Entry is skipped, but the reason is reported instead of
                # being swallowed silently.
                print("Error:", e)
                continue
            print("---------------queue has %s------------"%q.qsize())
            filename = '{}\\{}.mp4'.format(STORE_PATH, hashlib.md5(video_name.encode()).hexdigest())
            if os.path.exists(filename):
                # Already downloaded.
                continue
            try:
                _fetch_video(url, filename)
            except Exception as e:
                # NOTE(review): retries are unbounded; a permanently failing
                # URL is re-queued forever.
                q.put(entry)
                print("Error:", e)
                time.sleep(0.2)
            else:
                print("download ok:", video_name)
                time.sleep(0.1)
        finally:
            q.task_done()

# Start the pool of 15 download workers.
#
# q.join() and sys.exit(1) used to sit inside this loop. The queue is empty at
# load time, so join() returned immediately and SystemExit was raised after
# the first thread had been started: only one worker ever ran and the exit
# fired while the script was still being loaded. The workers loop forever, so
# there is nothing meaningful to join on here.
#
# daemon=True lets the host process shut down even though the workers never
# return.
for _ in range(15):
    t = Thread(target=download, args=(q,), daemon=True)
    print("threading start")
    t.start()