Hands-on project: scraping the Mzitu (妹子图) site and saving the images locally
Key points:
1. Structure the crawler as def functions.
2. Use os.path.dirname("save path") so that each image set is saved in its own folder (a short sketch of this trick follows this list).
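Before the full scripts, here is how the folder trick works: os.path.dirname() strips the file name off a path string, and os.makedirs() then creates one directory per gallery title on demand. A minimal, self-contained sketch; the base path and title below are placeholder values, not taken from the site:

import os

path = os.path.dirname(r"d:\图片\mz\0.py")  # yields the directory part: d:\图片\mz
title = "example gallery title"  # placeholder for a scraped title
root_dir = os.path.join(path, title.replace(' ', ''))
if not os.path.exists(root_dir):
    os.makedirs(root_dir)  # create the per-set folder only if it is missing
print(root_dir)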
Method 1:
import requests
from lxml import etree
import os
import time

start = time.time()

def mz_spider(base_url, headers_one):
    res = requests.get(url=base_url, headers=headers_one)  # request the list page
    base_html = etree.HTML(res.text)  # parse the HTML
    img_src = base_html.xpath('//div[@class="postlist"]/ul/li/a/@href')
    for img_url in img_src:
        img_parse(img_url)

def img_parse(img_url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Referer": "https://www.mzitu.com/"
    }
    res_sec = requests.get(url=img_url, headers=headers)
    html_sec = etree.HTML(res_sec.text)
    try:
        # some pages raise "list index out of range", so wrap this in try; first get the title
        title = html_sec.xpath('//div[@class="content"]/h2/text()')[0]
        # get the total number of pages in this image set
        page_num = html_sec.xpath('//div[@class="pagenavi"]/a/span/text()')[-2]
        # build the detail-page URL of every image in the set
        for num in range(1, int(page_num) + 1):
            img_per_url = img_url + "/" + str(num)
            download_img(img_per_url, title)
    except Exception as e:
        print(e)

# download one image
def download_img(img_per_url, title):
    # headers_one is the module-level dict defined under __main__ below
    res_per = requests.get(url=img_per_url, headers=headers_one)
    html_per = etree.HTML(res_per.text)
    # extract the URL of the image itself
    img_down_url = html_per.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    # request the image URL to fetch the binary content
    res_down = requests.get(img_down_url, headers=headers_one)
    data = res_down.content
    # get the directory part of the path; the result is d:\图片\mz
    path = os.path.dirname(r"d:\图片\mz\0.py")
    img_name = img_down_url.split('/')[-1]
    # folder name comes from the set title, with spaces removed
    folder_name = title.replace(' ', '')
    # the save location is d:\图片\mz\<title>
    root_dir = path + "\\" + folder_name
    # create the per-set folder if it does not exist yet
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    # write the image to its absolute path
    with open(root_dir + "\\" + img_name, "wb") as f:
        f.write(data)
        f.flush()  # force the buffer to disk; the with-block closes the file
    print(img_name + "__downloaded successfully: " + title)

if __name__ == "__main__":
    headers_one = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)",
        "Referer": "https://www.mzitu.com/"
    }
    for i in range(1, 10):
        base_url = 'https://www.mzitu.com/page/{}/'.format(str(i))
        time.sleep(0.5)
        mz_spider(base_url, headers_one)
    print("All downloads finished, took %d s" % (time.time() - start))
Method 2:
import requests
from lxml import etree
import time
import os

start = time.time()

headers_one = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
    "Referer": "https://www.mzitu.com/"
}
headers_two = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Referer": "https://www.mzitu.com/"
}
headers_three = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Referer": "https://www.mzitu.com/"
}

# build every list-page URL to crawl
for i in range(1, 3):
    base_url = 'https://www.mzitu.com/page/{}/'.format(str(i))
    print(" ---- now crawling page {}: ".format(i) + base_url)
    base_response = requests.get(url=base_url, headers=headers_one)  # request the list page
    print(base_response)
    base_html = etree.HTML(base_response.text)  # parse the HTML
    # first layer: the list page, holding the link and details of each image set
    img_urls = base_html.xpath('//div[@class="postlist"]/ul/li/a/@href')
    for img_url in img_urls:
        print("page {}, img_url of this set: ".format(i) + img_url)
        # second layer: the detail page of each image set
        res_two = requests.get(url=img_url, headers=headers_two)
        html_sec = etree.HTML(res_two.text)
        try:
            # some pages raise "list index out of range", so wrap this in try
            # get the title
            title = html_sec.xpath('//div[@class="content"]/h2/text()')[0]
            # get the total number of pages in this image set
            page_num = html_sec.xpath('//div[@class="pagenavi"]/a/span/text()')[-2]
            print("this set has {} pages".format(page_num))
            page = int(page_num) + 1
            # build the detail-page URL of every image in the set
            for num in range(1, page):
                img_per_url = img_url + "/" + str(num)
                # parse the page hosting each image to get the image URL
                res_three = requests.get(url=img_per_url, headers=headers_three)
                html_url = etree.HTML(res_three.text)
                # extract the URL of the image itself
                img_down_url = html_url.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
                # third layer: request the image URL to fetch the binary content
                res_four = requests.get(img_down_url, headers=headers_three)
                data = res_four.content
                # get the directory part of the path; the result is c:/py_code/new_code/mz
                path = os.path.dirname("c:/py_code/new_code/mz/0.py")
                # image file name
                img_name = img_down_url.split('/')[-1]
                # folder name comes from the set title, with spaces removed
                folder = title.replace(' ', '')
                # the save location is c:/py_code/new_code/mz/<title>
                root_dir = path + "/" + folder
                # create the per-set folder if it does not exist yet
                if not os.path.exists(root_dir):
                    os.makedirs(root_dir)
                # write the image to its absolute path
                with open(root_dir + "/" + img_name, "wb") as f:
                    f.write(data)
                    f.flush()  # force the buffer to disk; the with-block closes the file
                print(img_name + "__downloaded successfully: " + title)
                time.sleep(0.5)
        except Exception as e:
            print(e)
            continue

print("Done, total time: %f s" % (time.time() - start))