# Douyin short-video scraper (used together with mitmproxy and the Nox Android emulator)
# Source: 程序员文章站 (article published 2022-04-11 18:53:01)
# -*- coding: utf-8 -*-
import json
import ssl
from urllib.request import Request
import urllib
import sys
import time
import hashlib
import os
from threading import Thread
import logging
from queue import Queue
# SECURITY NOTE(review): this replaces the default HTTPS context factory with
# the unverified one, so TLS certificates are NOT checked for any urllib HTTPS
# request made by this process. Confirm this is acceptable before reusing the
# script outside a throwaway scraping setup.
ssl._create_default_https_context = ssl._create_unverified_context

# Launch command: mitmdump -p 8081 -s douyin1.py
keyword = 'shshs'
# Backslashes are escaped explicitly: '\d' is not a valid escape sequence and
# newer Python versions warn about it.
BASE_PATH = 'E:\\douyin'
STORE_PATH = '{}\\{}'.format(BASE_PATH, keyword)  # directory where videos are saved
# makedirs also creates BASE_PATH when it is missing (os.mkdir would fail in
# that case), and exist_ok avoids a race between the check and the creation.
os.makedirs(STORE_PATH, exist_ok=True)
# Text file (next to the video directory) that records every queued link.
TXT_FILENAME = STORE_PATH + '.txt'
# Request headers sent when downloading a video.
header = {
    "User-Agent": "com.ss.android.ugc.aweme/251 (Linux; U; Android 4.4.2; zh_CN; MI 6 ; Build/NMF26X; Cronet/58.0.2991.0)",
}
logging.basicConfig(
    filename='{}\\douyin.log'.format(BASE_PATH),
    level=logging.INFO,
    format="%(asctime)s-%(name)s-%(levelno)s-%(lineno)d-%(message)s",
)
logger = logging.getLogger(__name__)
# Shared work queue of "<video url>+<name hash>" strings, bounded to 2000 items.
q = Queue(maxsize=2000)
def response(flow):
    """mitmproxy ``response`` hook: queue the video links of a search response.

    When the request URL starts with one of the known search endpoints, the
    response body is parsed as JSON and, for every entry of its
    ``aweme_list``, the string ``"<video url>+<md5 of description>"`` is put
    on the module-level queue ``q``. Responses for other URLs are ignored.

    Args:
        flow: the intercepted flow; ``flow.request.url`` and
            ``flow.response.text`` are read.
    """
    # Only responses whose request URL starts with one of these are parsed.
    search_prefixes = (
        'https://aweme.snssdk.com/aweme/v1/search/',
        'https://api.amemv.com/aweme/v1/search/',
        # The original literal had a leading space, so it could never match.
        'https://aweme.snssdk.com/aweme/v1/general/search/',
        'https://aweme-hl.snssdk.com/aweme/v1/search/',
    )
    if not flow.request.url.startswith(search_prefixes):
        return

    logger.info(flow.request.url)
    try:
        data = json.loads(flow.response.text)
    except (TypeError, ValueError) as exc:
        # Missing or non-JSON body: nothing to extract from this response.
        logger.warning("could not parse search response: %s", exc)
        return

    # A response without an 'aweme_list' is treated as containing no videos.
    entries = data.get('aweme_list') if isinstance(data, dict) else None
    for aweme in entries or []:
        # Extract the video URL and the video name.
        try:
            video_url = aweme['video']['play_addr']['url_list'][0]
            video_name = aweme['desc']  # video name
        except (KeyError, IndexError, TypeError) as exc:
            # Skip a malformed entry instead of losing the rest of the list.
            logger.warning("skipping malformed aweme entry: %r", exc)
            continue
        # Video link; note that every occurrence of "play" is rewritten.
        real_url = video_url.replace("play", "playwm")
        # Hash the video name; the hash serves as the de-duplication key.
        name = hashlib.md5(video_name.encode()).hexdigest()
        q.put(real_url + "+" + name)
    print("The len of queue is **********************************************:", q.qsize())
def _record_and_split(item):
    """Append ``item`` to the link log file and split it into (url, name).

    Raises:
        OSError: if the log file cannot be written.
        ValueError: if ``item`` contains no "+" separator.
    """
    # Write every dequeued video link to the text file.
    with open(TXT_FILENAME, 'a') as f:
        f.write(item)
        f.write('\n')
    # Split on the LAST "+": the URL part may itself contain "+", whereas the
    # name part is whatever follows the final separator.
    url, sep, video_name = item.rpartition("+")
    if not sep:
        raise ValueError("malformed queue item: %r" % (item,))
    return url, video_name


def _save_video(url, filename):
    """Download ``url`` and store it at ``filename`` without leaving partial files.

    The body is written to a per-thread temporary file that is renamed onto
    ``filename`` only after the whole response has been written. A failed
    download therefore never leaves a truncated ``filename`` behind, which
    the exists-check in ``download`` would otherwise skip forever.
    """
    import threading  # local import: only ``Thread`` is imported at file level

    tmp_name = '{}.{}.part'.format(filename, threading.get_ident())
    request = Request(url, headers=header)
    try:
        # Both the HTTP response and the file are closed even on error.
        with urllib.request.urlopen(request) as resp, open(tmp_name, 'wb') as out:
            out.write(resp.read())
        os.replace(tmp_name, filename)
    except Exception:
        # Best-effort cleanup of the temporary file; the original error is
        # the one worth reporting, so a failed removal is ignored.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
        raise


def download(q):
    """Worker loop: take "<url>+<name>" items from ``q`` and download each video.

    Runs forever. Each item is logged to ``TXT_FILENAME``, then the video is
    saved under ``STORE_PATH`` unless the target file already exists. If the
    download step fails the item is put back on the queue (there is no retry
    limit). ``q.task_done()`` is called exactly once per dequeued item so
    that ``q.join()`` stays balanced.

    Args:
        q: the ``queue.Queue`` of work items to consume.
    """
    while True:
        item = q.get()
        try:
            try:
                url, video_name = _record_and_split(item)
            except (OSError, ValueError) as e:
                # An item that cannot be logged or parsed is dropped, as
                # before, but now the reason is reported.
                print("Error:", e)
                continue
            print("---------------queue has %s------------" % q.qsize())
            try:
                # The name is hashed again here; kept so that file names stay
                # the same as those of previously downloaded videos.
                filename = '{}\\{}.mp4'.format(STORE_PATH, hashlib.md5(video_name.encode()).hexdigest())
                if not os.path.exists(filename):
                    _save_video(url, filename)
                    print("download ok:", video_name)
                time.sleep(0.1)
            except Exception as e:
                # Re-queue the item so it is retried later.
                q.put(item)
                print("Error:", e)
                time.sleep(0.2)
        finally:
            # Always balance the q.get() above, including on drop and retry.
            q.task_done()
# Start 15 worker threads; each runs download() against the shared queue.
for _ in range(15):
    t = Thread(target=download, args=(q,))
    print("threading start")
    t.start()
# Block until every item put on the queue has been marked done.
# NOTE(review): this runs at module import time. If nothing has been queued
# yet, join() returns immediately and sys.exit(1) raises SystemExit with a
# non-zero status while the script is still being loaded - confirm that this
# is the intended behaviour under mitmdump.
q.join()
sys.exit(1)
# Previous article: "What is an anonymous object" (navigation text from the source web page)