欢迎您访问程序员文章站!本站旨在为大家分享程序员与计算机编程知识!
您现在的位置是: 首页

爬虫实战爬取校花图片

程序员文章站 2022-05-02 16:59:26
...

爬取校花图片

import requests as rq
from bs4 import BeautifulSoup as bf
import urllib.request
import os

# Root directory that will hold one sub-folder per person.
paths = "images"

# Profile-page URLs (pages 1..49), built up front so later loops can
# iterate over a plain list.
urli = 'http://www.xiaohuar.com/p-1-{}.html'
infourl = [urli.format(page) for page in range(1, 50)]


#获取校花姓名及所在大学
def getname(url):
    """Fetch a profile page and return its title without the site suffix.

    Parameters:
        url: profile-page URL (e.g. http://www.xiaohuar.com/p-1-1.html).

    Returns:
        The page <title> text with a trailing "_大学校花" suffix removed,
        used later as the person's folder name.
    """
    res = rq.get(url)
    # Site serves GB2312-encoded pages; requests guesses wrong without this.
    res.encoding = 'gb2312'
    soup = bf(res.text, 'html.parser')
    info = soup.select('title')
    name = info[0].text
    # BUG FIX: the original used name.rstrip("_大学校花"), but rstrip treats
    # its argument as a SET of characters and strips any of them from the
    # end — a name ending in e.g. "花" or "大" would be truncated.
    # Remove the exact suffix instead.
    suffix = "_大学校花"
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name

#个人相册的页面链接并存入列表img_page_url
# Album-page URLs (pages 1..49), one per person, matching infourl order.
urlm = "http://www.xiaohuar.com/s-1-{}.html#p1"
img_page_url = [urlm.format(page) for page in range(1, 50)]
print(img_page_url)

#获取校花照片链接
def getimg(pageurl):
    """Return the absolute photo URLs found on one album page.

    Parameters:
        pageurl: album-page URL (e.g. http://www.xiaohuar.com/s-1-1.html#p1).

    Returns:
        List of image URLs built from every element with class "image0".
    """
    response = rq.get(pageurl)
    soup = bf(response.text, 'html.parser')
    # src attributes are site-relative; prefix the host to make them fetchable.
    return ["http://www.xiaohuar.com/" + tag['src']
            for tag in soup.select('.image0')]


# 创建文件夹
p =[]
z = 1
for url in infourl:#遍历个人信息页面
    path = os.path.join(paths,str(z)+getname(url))
    os.mkdir(path)
    p.append(path)
    z+=1


# 下载图片
i =0
for j in img_page_url:
    n =0
    for k in getimg(j):
        with open(p[i]+"/"+str(n)+".jpg","wb") as f:
            rep = rq.get(k).content
            f.write(rep)
            n=n+1
    i=i+1

利用Scrapy框架爬取