
Python crawler example: downloading comics

The code is as follows:


#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2

weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6


if len(sys.argv)>=3:
    weburl=sys.argv[1]
    floder=sys.argv[2]
else:
    print("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
    sys.exit(0)
if len(sys.argv)>=4:
    chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
    threadcount=int(sys.argv[4])

def jin(i,jinzhi):
    # convert a non-negative integer i to a base-'jinzhi' string (digits, then a-z)
    finalans=""
    answer=i%jinzhi
    i=int(i/jinzhi)
    if answer>9:
        finalans=finalans+chr(ord('a')+(answer-10))
    else:
        finalans=finalans+str(answer)
    if i!=0:
        finalans=jin(i,jinzhi)+finalans
    return finalans
def urlparse(p,a,c,k):
    # python port of the javascript eval(function(p,a,c,k,e,d){...}) unpacker
    d={}
    e=lambda c: jin(c,36)
    if 1:
        while c:
            c=c-1
            if not k[c]:
                d[jin(c,36)]=jin(c,36)
            else:
                d[jin(c,36)]=k[c]
        k=[lambda e:d[e]]
        e=lambda c:'\\w+'
        c=1
    newstr=""
    while c:
        c=c-1
        if k[c]:
            for i in range(0,len(p)):
                tempi=p[i]
                tempi=ord(tempi)
                # the upper bounds of both comparisons were lost in the source;
                # ord('z') and ord('9') are assumed from the base-36 token alphabet
                if tempi>=ord('a') and tempi<=ord('z'):
                    newstr+=d[chr(tempi)]
                elif tempi>=ord('0') and tempi<=ord('9'):
                    newstr+=d[chr(tempi)]
                else:
                    newstr+=chr(tempi)
    return newstr
def meispower(s):
    p=re.compile(r"(?=\}\().*",re.IGNORECASE)
    s=p.findall(s)
    s=s[0]
    s=s[0:(len(s)-19)]
    par=s.split(',')
    par[3]=par[3][1:len(par[3])]
    answer=par[3].split('|')
    chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
    allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
    allurl=allurl[10:(len(allurl)-2)]
    return allurl
def pictofile(weburl,filename,loop=100):
    # download one picture to filename, retrying at most 'loop' times
    if loop<0:
        # the comparison here was lost in the source; loop<0 is assumed
        print('can\'t download the picture %s'%weburl)
        return
    loop=loop-1
    if os.path.exists(filename):
        return
    try:
        url=urllib.request.urlopen(weburl)
        data=url.read()
        if len(data)<2048:
            # the size threshold was lost in the source; a response this small
            # is assumed to be a failed download and is retried
            url.close()
            pictofile(weburl,filename,loop)
        else:
            print('download from %s name is %s\n'%(weburl,filename))
            myfile=open('%s'%filename,'wb')
            myfile.write(data)
            myfile.close()
            url.close()
    except socket.timeout:
        print('timeout')
        pictofile(weburl,filename,loop)
    except Exception as e:
        print('error',e)
        pictofile(weburl,filename,loop)
    finally:
        pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
        mutex2.acquire()
        os.chdir(loadpicdir)
        mutex2.release()
    except:
        print("can't open the floder %s will be create"%loadpicdir)
        try:
            if(mutex2.locked()):
                os.mkdir(loadpicdir)
                os.chdir(loadpicdir)
                mutex2.release()
                print('create floder succeed')
        except:
            print("can't create floder %s"%loadpicdir)
            if(mutex.acquire()):
                mutex.release()
            quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    webdata=urllib.request.urlopen(manhuaweb+url).read()
    webdata=webdata.decode('UTF-8')
    # the tag in this pattern was lost in the source; '<title>' is assumed,
    # matching the 7 characters stripped on the next line
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:len(chaptername)]
    # the character class here was truncated in the source; '<' is assumed
    webscrip=re.findall(r'eval.*[^<]',webdata)
    chapterurl=meispower(webscrip[0])
    chapterurl='http://mhimg.ali213.net'+chapterurl
    for i in range(begin,num):
        try:
            while(currentthreadnum>=threadcount):
                time.sleep(0.5)
            mutex.acquire()
            currentthreadnum=currentthreadnum+1
            mutex.release()
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            mutex.acquire()
            i=i-1
            currentthreadnum=currentthreadnum-1
            mutex.release()
        except Exception as error:
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break

if __name__=='__main__':
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()
    mutex2=threading.Lock()

    webfile=urllib.request.urlopen(weburl)
    webdata=webfile.read()
    webdata=webdata.decode('UTF-8')
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)

    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')

    i=chapterbegin
    while i<chapterlength:
        manhuachapter=picurldata[chapterlength-i-1]
        downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
        i=i+1
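
To run the script, save it as, for example, downloadmanhua.py (the filename and the comic address below are placeholders, not taken from the original article), then pass the comic's index page on manhua.ali213.net as the first argument and a download folder as the second; the optional third and fourth arguments set the starting chapter and the number of download threads, matching the usage string printed by the script:

python3 downloadmanhua.py http://manhua.ali213.net/comic/0000/ ./manhua 0 6

The index page is scanned for /comic/....html chapter links and their page counts, each chapter page's packed JavaScript is unpacked to recover the image path on mhimg.ali213.net, and the pictures are then fetched by up to threadcount worker threads.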