Python crawler project: 一见倾心 wallpapers
Method 1
import re
import urllib.request

def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImage(html, x):
    # Examples of the image URLs embedded in the page:
    # https://mmbiz.qpic.cn/mmbiz_jpg/ib55rg6wzuc3b16kiy3uu53nkcttdic8uea4wwbpahj8lpibvankps2fztyjrv7w7dbeenrhfvpuuyrenaxsldgja/640?wx_fmt=jpeg
    # https://mmbiz.qpic.cn/mmbiz_jpg/ib55rg6wzuc3b16kiy3uu53nkcttdic8uehqoci7r86nehl2neforaqvctiaeaiuwjtwpknxnnxipuuuqnujefkyw/640?wx_fmt=jpeg
    # The regular expression here is the key step
    reg = 'data-src="(.*?)"'
    pattern = re.compile(reg)
    imlist = pattern.findall(html.decode('utf-8'))
    print(imlist)
    for i in imlist:
        print(i)
        print(x)
        urllib.request.urlretrieve(i, '%s.jpg' % x)
        x += 1
    return x

x = 1
url = 'https://mp.weixin.qq.com/s/mvdcn0o3093olihmykqbia'
html = getHtml(url)
x = getImage(html, x)
print('Download finished')
# The downloaded images are saved in the same directory as this .py file
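The key step above is the data-src regular expression: the article page lazy-loads its images, so the usable URL sits in each img tag's data-src attribute rather than in src. Below is a minimal sketch of how that pattern behaves, run against a made-up HTML fragment (the tags and URLs are illustrative only, not taken from the real article page):

import re

# Hypothetical fragment imitating the <img> tags of a WeChat article;
# the image URL is stored in data-src, while src is left empty.
sample_html = (
    '<img class="rich_pages" data-src="https://example.com/pic1/640?wx_fmt=jpeg" src="">'
    '<img class="rich_pages" data-src="https://example.com/pic2/640?wx_fmt=jpeg" src="">'
)

reg = 'data-src="(.*?)"'   # non-greedy: capture up to the first closing quote
print(re.findall(reg, sample_html))
# ['https://example.com/pic1/640?wx_fmt=jpeg', 'https://example.com/pic2/640?wx_fmt=jpeg']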
Method 2: BeautifulSoup, to avoid writing a regular expression (because I don't know how)
import requests
import urllib.request
from bs4 import BeautifulSoup

url = "https://mp.weixin.qq.com/s/cm3bua0um1jbznr2de7twg"
r = requests.get(url)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Collect the data-src attribute of every <img> tag
piclist = []
for link in soup.find_all('img'):
    link_list = link.get('data-src')
    if link_list is not None:
        piclist.append(link_list)
# print(piclist)
# print(type(link_list))

x = 0
for http in piclist:
    print(http)
    # f:\桌面\pa is the save path; the folder must be created beforehand
    filesavepath = r'f:\桌面\pa\%s.jpg' % x
    urllib.request.urlretrieve(http, filesavepath)
    x += 1
    print('Saving image {:.0f}'.format(x))
print('Download finished')
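BeautifulSoup also accepts CSS selectors, so the find_all loop plus the None check can be collapsed into a single select call. The following is only a sketch of an equivalent extraction step under that assumption, not part of the original script:

import requests
from bs4 import BeautifulSoup

url = "https://mp.weixin.qq.com/s/cm3bua0um1jbznr2de7twg"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

# 'img[data-src]' matches only <img> tags that actually carry a data-src
# attribute, so no None check is needed.
piclist = [img['data-src'] for img in soup.select('img[data-src]')]
print(piclist)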