python实现爬虫下载漫画示例

程序员文章站 2022-04-11 21:47:12

...

代码如下:

#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Module-level configuration shared by the download functions below.
# (A ``global`` statement at module scope has no effect, so it is omitted;
# ``manhuaweb``, ``mutex`` and ``mutex2`` are assigned by the script driver.)
weburl = ''           # address of the comic index page
floder = ''           # destination directory prefix (spelling kept from the script)
chapterbegin = 0      # index of the first chapter to download
currentthreadnum = 0  # number of picture-download threads currently running
threadcount = 6       # maximum number of concurrent download threads


def parseargs(argv):
    """Extract the downloader settings from a ``sys.argv``-style list.

    Args:
        argv: ``[program, weburl, floder, chapterbegin?, threadcount?]``.

    Returns:
        ``None`` when fewer than two positional arguments are present,
        otherwise the tuple ``(weburl, floder, chapterbegin, threadcount)``
        where the two optional integers default to ``0`` and ``6``.

    Raises:
        ValueError: if an optional argument is not an integer literal.
    """
    if len(argv) < 3:
        return None
    begin = int(argv[3]) if len(argv) >= 4 else 0
    threads = int(argv[4]) if len(argv) >= 5 else 6
    return argv[1], argv[2], begin, threads


# Only touch sys.argv (and possibly exit) when run as a script, so that the
# module can be imported without terminating the importing process.
if __name__ == '__main__':
    parsedargs = parseargs(sys.argv)
    if parsedargs is None:
        print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
        sys.exit(0)
    weburl, floder, chapterbegin, threadcount = parsedargs

def jin(i, jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are rendered as decimal digits; larger digit values are
    rendered as consecutive characters starting at lowercase ``'a'``.

    Args:
        i: non-negative integer to convert.
        jinzhi: numeric base, at least 2.

    Returns:
        The digit string, most significant digit first (``'0'`` for zero).

    Raises:
        ValueError: if ``jinzhi`` is below 2 or ``i`` is negative.
    """
    if jinzhi < 2:
        raise ValueError('base must be at least 2, got %r' % (jinzhi,))
    if i < 0:
        raise ValueError('cannot convert a negative number: %r' % (i,))
    digits = []
    while True:
        # Integer divmod keeps large values exact; float division would
        # silently round them and yield wrong digits.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
def urlparse(p, a, c, k):
    """Expand the payload ``p`` by substituting its single-character tokens.

    A lookup table is built for the ``c`` indices ``c-1 .. 0``: the base-36
    spelling of each index (via ``jin``) maps to ``k[index]``, or to itself
    when ``k[index]`` is empty.  Every character of ``p`` in ``a-z`` or
    ``0-9`` is then replaced by its table entry; all other characters are
    copied unchanged.

    NOTE(review): the parameter names suggest this mirrors the unpack step
    of a ``p,a,c,k,e,d``-style packed script - confirm.  Substitution is
    per character, so only tokens that are one base-36 digit long are
    handled.  The upper bounds of the two character ranges were missing
    from the listing this was recovered from and are assumed to be ``'z'``
    and ``'9'``.

    Args:
        p: packed payload text.
        a: unused; kept for signature compatibility.
        c: number of entries of ``k`` to load into the table.
        k: sequence of replacement words, indexed by token value.

    Returns:
        The expanded text.

    Raises:
        IndexError: if ``c`` exceeds ``len(k)``.
        KeyError: if ``p`` contains a token character with no table entry.
    """
    table = {}
    while c:
        c = c - 1
        token = jin(c, 36)
        # An empty replacement means the token stands for itself.
        table[token] = k[c] if k[c] else token
    return ''.join(
        table[ch] if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ch
        for ch in p
    )
def meispower(s):
    """Extract the image path prefix from a packed ``eval(...)`` script line.

    The text from the first ``}(`` onward is taken, its last 19 characters
    are dropped, and the remainder is split into four comma-separated
    fields: payload, two integers and a ``|``-separated word list (whose
    leading quote character is removed).  The fields are handed to
    ``urlparse`` and the value following ``imgpath=`` is cut out of the
    expanded text.

    NOTE(review): the fixed offsets (19, 10, 2) presumably correspond to a
    ``'.split('|'),0,{}))`` tail and to ``imgpath=\\'`` / ``\\'`` delimiters
    in the expanded script - confirm against a real page.

    Args:
        s: the script text containing the packed call.

    Returns:
        The image path string found after ``imgpath=``.

    Raises:
        IndexError: if ``s`` has no ``}(`` section, fewer than four fields,
            or the expanded text has no ``imgpath=`` assignment.
        ValueError: if the two numeric fields are not integers.
    """
    tail = re.findall(r"(?=\}\().*", s, re.IGNORECASE)[0]
    tail = tail[0:(len(tail) - 19)]
    # Split from the right so that commas inside the payload (first field)
    # cannot shift the numeric fields and the word list.
    par = tail.rsplit(',', 3)
    words = par[3][1:].split('|')
    chapterpath = urlparse(par[0], int(par[1]), int(par[2]), words)
    allurl = re.findall('imgpath=[^;]*', chapterpath)[0]
    return allurl[10:(len(allurl) - 2)]
def pictofile(weburl, filename, loop=100, minsize=1):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists.  A timeout, any
    other exception, or a response body shorter than ``minsize`` bytes
    counts as a failed attempt and triggers a retry until the attempt
    budget is used up.

    NOTE(review): the size threshold of the original comparison was lost in
    the listing this was recovered from; the default ``minsize=1`` only
    rejects empty responses.  Raise it if truncated downloads must be
    retried too.

    Args:
        weburl: address of the picture.
        filename: path of the file to create.
        loop: remaining attempt budget; a negative value gives up at once.
        minsize: smallest response size, in bytes, accepted as a picture.

    Returns:
        None.
    """
    while True:
        if loop < 0:
            print('can\'t download the picture %s' % weburl)
            return
        loop = loop - 1
        if os.path.exists(filename):
            return
        try:
            # Context managers close the response and the file even when
            # reading or writing raises.
            with urllib.request.urlopen(weburl) as url:
                data = url.read()
            if len(data) < minsize:
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            with open('%s' % filename, 'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            print('error', e)
def downloadpic(url, loadpicdir, num):
    """Thread worker: save the picture at ``url`` into ``loadpicdir``.

    The directory is created if missing (serialised through ``mutex2``).
    The target file is ``<loadpicdir>//<num>-manhua<tail>`` where ``tail``
    is the trailing run of ``[0-9a-z.]`` characters of ``url``.  The shared
    ``currentthreadnum`` counter is decremented under ``mutex`` when the
    worker finishes, whether or not the download succeeded, so the caller
    that throttles on this counter is never left waiting.

    The process working directory is not changed: the picture path passed
    to ``pictofile`` already contains ``loadpicdir``.

    Args:
        url: address of the picture.
        loadpicdir: directory that receives the file.
        num: number used as the file-name prefix.

    Returns:
        None.
    """
    global currentthreadnum
    try:
        # The lock is held via ``with`` so that a failed directory creation
        # cannot leave it locked and stall the other workers.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create" % loadpicdir)
                try:
                    os.makedirs(loadpicdir, exist_ok=True)
                except OSError:
                    print("can't create floder %s" % loadpicdir)
                    return
                print('create floder succeed')
        name = re.findall(r'[0-9a-z.]*\Z', url)
        filename = 'manhua' + name[0]
        pictofile(url, loadpicdir + '//' + str(num) + '-' + filename)
    finally:
        # Always give the thread slot back.
        with mutex:
            currentthreadnum = currentthreadnum - 1
def downloadchapter(url, loadpicdir, num, begin=0):
    """Start one download thread per picture of a chapter.

    The chapter page ``manhuaweb + url`` is fetched, the chapter name is
    taken from its ``<title>`` (text up to the first ``_``) and the picture
    address prefix is decoded from the first ``eval...`` script line via
    ``meispower``.  For each index ``i`` in ``range(begin, num)`` a thread
    running ``downloadpic`` is started for ``<prefix><i>.jpg``; at most
    ``threadcount`` threads run at a time.

    NOTE(review): the two regular expressions were damaged in the listing
    this was recovered from and are reconstructed as ``<title>[^_]*`` and
    ``eval.*[^<>]`` - confirm against a real page.

    Args:
        url: chapter path appended to ``manhuaweb``.
        loadpicdir: directory prefix; the chapter name is appended to it.
        num: number of pictures; also passed to ``downloadpic`` as the
            file-name prefix.
        begin: index of the first picture to download.

    Returns:
        None.

    Raises:
        IndexError: if the page has no title or no ``eval`` script line.
    """
    global currentthreadnum
    print(manhuaweb + url)
    with urllib.request.urlopen(manhuaweb + url) as response:
        webdata = response.read()
    webdata = webdata.decode('UTF-8')
    chaptername = re.findall(r'<title>[^_]*', webdata)[0]
    chaptername = chaptername[7:]  # drop the 7-character '<title>' prefix
    webscrip = re.findall(r'eval.*[^<>]', webdata)
    chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])
    for i in range(begin, num):
        try:
            # Throttle: wait until a thread slot is free.
            while currentthreadnum >= threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum = currentthreadnum + 1
            threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), loadpicdir + chaptername, num),
            ).start()
        except socket.error:
            # Give the slot back.  (The original also did ``i=i-1`` here,
            # which has no effect on a ``for`` loop and is dropped.)
            with mutex:
                currentthreadnum = currentthreadnum - 1
        except Exception as error:
            # The slot was taken before the failure; release it so later
            # chapters are not throttled by a thread that never started.
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print(error, 'break')
            print('download chapter %d of picture make a error' % i)
            break


if __name__ == '__main__':
    # Script driver: read the index page, then download chapter by chapter.
    manhuaweb = r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex = threading.Lock()    # guards currentthreadnum
    mutex2 = threading.Lock()   # serialises directory creation
    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read()
    webdata = webdata.decode('UTF-8')
    meshmode = re.compile(r' <div class="detail_body_right_sec_con">.*</div>')
    meshdata = meshmode.findall(webdata)[0]
    indexmode = re.compile(r'([0-9]*页)')
    indexdata = indexmode.findall(meshdata)
    picurlmode = re.compile(r'/comic/[0-9/]*.html')
    picurldata = picurlmode.findall(meshdata)
    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')
    i = chapterbegin
    while i < chapterlength:
        # Entries are consumed from the end of the page lists backwards.
        manhuachapter = picurldata[chapterlength - i - 1]
        downloadchapter(
            manhuachapter,
            floder,
            int(nummode.findall(indexdata[chapterlength - i - 1])[0]),
        )
        i = i + 1

python实现爬虫下载漫画示例

基于Python实现多线程网页爬虫

Python实现类似jQuery使用中的链式调用的示例

Android基于自带的DownloadManager实现下载功能示例

Python实现ping指定IP的示例

python3 property装饰器实现原理与用法示例

python实现绘制树枝简单示例

Python实现带下标索引的遍历操作示例

Python实现的插入排序，冒泡排序，快速排序，选择排序算法示例

Python实现html转换为pdf报告(生成pdf报告)功能示例

Python实现数据结构线性链表（单链表）算法示例