【Python爬虫案例学习2】python多线程爬取youtube视频
程序员文章站
2022-12-22 13:09:52
转载:https://www.cnblogs.com/binglansky/p/8534544.html 开发环境: python2.7 + win10 开始先说一下,访问youtube需要那啥的,请自行解决,最好是全局代理。 实现代码: coding:utf 8 author : Corleone ......
转载:https://www.cnblogs.com/binglansky/p/8534544.html
开发环境:
- python2.7 + win10
开始先说一下,访问youtube需要那啥的,请自行解决,最好是全局代理。
实现代码:
# -*- coding: utf-8 -*-
# author : Corleone
#
# Multi-threaded YouTube search scraper / video downloader.
#   1. get_page workers scrape YouTube search-result pages for video URLs+titles.
#   2. download workers fetch a direct MP4 URL from a third-party API and save it.
#
# NOTE(review): the scraped source was an all-lowercase Python 2 paste. This is
# a reconstruction ported to Python 3 (python 2.7 is EOL: raw_input /
# reload(sys) / sys.setdefaultencoding removed). Mixed-case runtime strings
# (the User-Agent, sp=EgIIAg..., "webPageType", "YouTube") were restored to
# their plausible original casing -- verify against the linked original at
# https://www.cnblogs.com/binglansky/p/8534544.html before relying on them.
import json
import logging
import os
import queue
import random
import re
import sys
import threading

import lxml  # noqa: F401 -- guarantees the "lxml" parser used below is installed
import requests
from bs4 import BeautifulSoup
from requests.exceptions import (
    ChunkedEncodingError,
    ConnectionError,
    ConnectTimeout,
    MissingSchema,
    SSLError,
)

# --- logging: timestamped INFO messages to stdout ---------------------------
logger = logging.getLogger("AppName")
formatter = logging.Formatter('%(asctime)s %(levelname)-5s: %(message)s')
console_handler = logging.StreamHandler(sys.stdout)
console_handler.formatter = formatter
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)

q = queue.Queue()       # (video_url, title) pairs waiting to be downloaded
page_q = queue.Queue()  # search-result page numbers waiting to be scraped


def download(q, x, path):
    """Worker: pull (url, title) items off *q* and save each video under *path*.

    *x* is the worker index (kept for interface compatibility; unused).
    Runs forever -- workers are daemon threads killed at process exit.
    """
    # The conversion API requires a session hash embedded in this page's JS.
    urlhash = "https://weibomiaopai.com/"
    try:
        html = requests.get(urlhash).text
    except SSLError:
        logger.info(u"网络不稳定 正在重试")
        html = requests.get(urlhash).text
    hash_v = re.findall(r'var hash="(.*?)"', html, re.S)[0]
    while True:
        data = q.get()
        url = data[0]
        # "|" is illegal in Windows filenames -- strip it from the title.
        name = data[1].strip().replace("|", "")
        out_file = os.path.join(path, '%s.mp4' % name)
        api = "https://steakovercooked.com/api/video/?cached&hash=" + hash_v + "&video=" + url
        api2 = "https://helloacm.com/api/video/?cached&hash=" + hash_v + "&video=" + url
        try:
            result = json.loads(requests.get(api).text)
        except (ValueError, SSLError):
            # Primary mirror failed (bad JSON or TLS error) -- try the fallback once.
            try:
                result = json.loads(requests.get(api2).text)
            except (ValueError, SSLError):
                q.task_done()
                return False  # both mirrors dead: this worker gives up (original behaviour)
        vurl = result['url']
        logger.info(u"正在下载:%s" % name)
        try:
            r = requests.get(vurl)
        except SSLError:
            r = requests.get(vurl)  # one blind retry on a flaky TLS handshake
        except MissingSchema:
            # The API returned a malformed URL -- skip this item.
            q.task_done()
            continue
        try:
            with open(out_file, 'wb') as f:
                f.write(r.content)
        except IOError:
            # Title contains characters the filesystem rejects -- fall back to
            # a random placeholder name rather than losing the video.
            name = u'好开心么么哒 %s' % random.randint(1, 9999)
            out_file = os.path.join(path, '%s.mp4' % name)
            with open(out_file, 'wb') as f:
                f.write(r.content)
        logger.info(u"下载完成:%s" % name)
        q.task_done()


def get_page(keyword, page_q):
    """Worker: scrape one YouTube search-result page per *page_q* item.

    Pushes [video_url, title] pairs onto the global download queue ``q``.
    Titles for non-Chinese keywords are translated via iciba.com so the
    saved filenames are readable.
    """
    # Hoisted out of the loop -- the headers never change between requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) '
                      'Gecko/20100101 Firefox/57.0'
    }
    while True:
        page = page_q.get()
        url = ("https://www.youtube.com/results?sp=EgIIAg%253D%253D"
               "&search_query=" + keyword + "&page=" + str(page))
        try:
            html = requests.get(url, headers=headers).text
        except (ConnectTimeout, ConnectionError):
            print(u"不能访问youtube 检查是否已fq")
            os._exit(0)  # hard-exit every thread: nothing works without access
        result = re.findall(r'"url":"/watch\?v=(.*?)","webPageType"', html, re.S)
        logger.info(u"第 %s 页" % page)
        for vid in result:
            vurl = "https://www.youtube.com/watch?v=" + vid
            try:
                res = requests.get(vurl).text
            except (ConnectionError, ChunkedEncodingError):
                logger.info(u"网络不稳定 正在重试")
                try:
                    res = requests.get(vurl).text
                except SSLError:
                    continue
            name = re.findall(r"<title>(.*?)YouTube", res, re.S)[0].replace("-", "")
            # Original compared the whole keyword string against a single CJK
            # codepoint range, which misclassifies mixed keywords; check for
            # any CJK character instead.
            if any(u'\u4e00' <= ch <= u'\u9fff' for ch in keyword):
                q.put([vurl, name])
            else:
                # Non-Chinese keyword: translate the title via iciba.
                logger.info(u"正在翻译")
                url_js = "http://www.iciba.com/" + name
                html2 = requests.get(url_js).text
                soup = BeautifulSoup(html2, "lxml")
                try:
                    res2 = soup.select('.clearfix')[0].get_text()
                    title = res2.split("\n")[2]
                except IndexError:
                    # Translation page didn't have the expected layout.
                    title = u'好开心么么哒 %s' % random.randint(1, 9999)
                q.put([vurl, title])
        page_q.task_done()


def main():
    """Prompt for a keyword and thread count, then scrape and download."""
    keyword = input(u"请输入关键字:")
    threads = int(input(u"请输入线程数量(建议1-10): "))
    # Per-keyword output directory (raw string: Windows backslash path).
    path = r'D:\youtube\%s' % keyword
    if not os.path.exists(path):
        os.makedirs(path)
    # Phase 1: scrape the first 25 result pages concurrently.
    logger.info(u"开始解析网页")
    for page in range(1, 26):
        page_q.put(page)
    for _ in range(threads):
        t = threading.Thread(target=get_page, args=(keyword, page_q))
        t.daemon = True
        t.start()
    page_q.join()
    logger.info(u"共 %s 视频" % q.qsize())
    # Phase 2: download everything that was queued.
    logger.info(u"开始下载视频")
    for x in range(threads):
        t = threading.Thread(target=download, args=(q, x, path))
        t.daemon = True
        t.start()
    q.join()
    logger.info(u"全部视频下载完成!")


if __name__ == "__main__":
    main()
上一篇: Maven常用构建命令
下一篇: oracle之 ORA-00054: resource busy and acquire with NOWAIT specified or timeout expired
推荐阅读
-
【Python爬虫案例学习】Python爬取淘宝店铺和评论
-
python爬虫爬取快手视频多线程下载功能
-
【Python爬虫案例学习2】python多线程爬取youtube视频
-
Python爬虫学习教程 bilibili网站视频爬取!【附源码】
-
Python爬虫学习教程 猫眼电影网站视频爬取!【附源码】
-
python爬虫爬取快手视频多线程下载功能
-
【Python爬虫案例学习20】Python爬虫爬取智联招聘职位信息
-
python学习笔记(二十二)爬虫基础(2):模拟浏览器,ajax动态爬取,爬取数据写入文件、图片爬虫
-
【Python爬虫案例学习】python爬取淘宝里的手机报价并以价格排序
-
【Python爬虫案例学习2】python多线程爬取youtube视频