爬虫实战爬取校花图片
程序员文章站
2022-05-02 16:59:26
...
爬取校花图片
import requests as rq
from bs4 import BeautifulSoup as bf
import urllib.request
import os
# Root folder that will hold one sub-directory per person.
paths = "images"

# Profile-page URL template; pages 1..49 are collected into infourl.
urli = 'http://www.xiaohuar.com/p-1-{}.html'
infourl = [urli.format(page) for page in range(1, 50)]
# Get the person's name (and university) from a profile page.
def getname(url):
    """Fetch a profile page and return the page title minus its fixed suffix.

    The page <title> appears to look like "<name>_大学校花"; the constant
    suffix is removed to leave just the name part.

    BUG FIX: the original used ``title.rstrip("_大学校花")``, but ``rstrip``
    treats its argument as a *set of characters*, not a suffix — it would
    also strip any trailing 大/学/校/花/_ characters belonging to the name
    itself.  The suffix is now removed exactly, as a unit.

    :param url: profile-page URL (str)
    :return: the cleaned title text (str)
    """
    res = rq.get(url)
    res.encoding = 'gb2312'  # the site serves gb2312/GBK-encoded pages
    soup = bf(res.text, 'html.parser')
    title = soup.select('title')[0].text
    suffix = "_大学校花"
    # Remove the exact suffix only (never characters inside the name).
    if title.endswith(suffix):
        title = title[:-len(suffix)]
    return title
# Album-page URL template; pages 1..49 are collected into img_page_url.
urlm = "http://www.xiaohuar.com/s-1-{}.html#p1"
img_page_url = [urlm.format(page) for page in range(1, 50)]
print(img_page_url)
# Collect the photo URLs from one album page.
def getimg(pageurl):
    """Return a list of absolute photo URLs found on *pageurl*.

    Photos are the elements matching CSS class "image0"; each relative
    ``src`` is prefixed with the site root to form a full URL.
    """
    page = rq.get(pageurl)
    doc = bf(page.text, 'html.parser')
    return ["http://www.xiaohuar.com/" + tag['src']
            for tag in doc.select('.image0')]
# Create one folder per person: images/<index><name>.
p = []   # p[k] is the directory for the k-th profile page (used by the download loop)
z = 1    # 1-based index prefixed to each folder name
for url in infourl:  # iterate over the profile pages
    path = os.path.join(paths, str(z) + getname(url))
    # BUG FIX: os.mkdir crashed because the parent "images" directory is
    # never created by this script, and it also crashed on reruns when a
    # folder already existed.  makedirs creates missing parents and
    # exist_ok=True makes the script idempotent.
    os.makedirs(path, exist_ok=True)
    p.append(path)
    z += 1
# Download every photo into its owner's folder.
# img_page_url and p are parallel lists: the i-th album page belongs to
# the person whose folder is p[i].
for i, page in enumerate(img_page_url):       # enumerate replaces the manual i counter
    for n, img_url in enumerate(getimg(page)):  # n numbers the photos within one album
        # os.path.join instead of manual "/" concatenation for portability
        dest = os.path.join(p[i], str(n) + ".jpg")
        with open(dest, "wb") as f:
            f.write(rq.get(img_url).content)