图片下载器爬虫 ItemLoader
程序员文章站
2022-03-07 20:00:55
...
思路
http://www.meizitu.com/a/more_1.html
图片下载器爬虫
1.使用到的技术:scrapy ,requests(下载图片),ItemLoader
2.项目的创建:scrapy startproject Meizitu
创建爬虫命令:scrapy genspider meizi www.meizitu.com
3.爬取的目标:
帖子链接:url
帖子的标题:title
妹子详情里面的图片, 及图片链接
帖子里图片的所有链接:image_urls
图片保存的位置:image_paths
4. xpath的编写
帖子的标题:title xpath: //h2/a/text()
帖子里图片的所有链接:image_urls xpath: //div[@id="picture"]/p/img/@src
main.py
from scrapy import cmdline

if __name__ == "__main__":
    # Launch the "meizi" spider exactly as `scrapy crawl meizi` would.
    # The __main__ guard keeps the crawl from starting on a mere import.
    cmdline.execute("scrapy crawl meizi".split())
items.py
import scrapy
class MeizituItem(scrapy.Item):
    """Container for one forum post scraped from meizitu.com."""
    # Link of the post itself.
    url = scrapy.Field()
    # Title of the post.
    title = scrapy.Field()
    # All image links found inside the post.
    image_urls = scrapy.Field()
    # Local filesystem paths of the downloaded images (filled by the pipeline).
    image_paths = scrapy.Field()
meizi.py
import scrapy
# scrapy.contrib was deprecated in Scrapy 1.0 and removed in 1.6;
# ItemLoader lives in scrapy.loader.
from scrapy.loader import ItemLoader
from Meizitu.items import MeizituItem
class MeiziSpider(scrapy.Spider):
    """Walk the numbered list pages of meizitu.com and emit one item per post."""

    name = 'meizi'
    allowed_domains = ['www.meizitu.com']
    # List pages follow a simple numbering scheme:
    #   http://www.meizitu.com/a/more_2.html
    #   http://www.meizitu.com/a/more_3.html
    page = 1
    url = "http://www.meizitu.com/a/more_"
    start_urls = [url + str(page) + ".html"]

    def tiezi_detail(self, response):
        """Build a MeizituItem (url, title, image urls) from one post page."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        # The post URL is a plain value, not extracted from the page body.
        loader.add_value("url", response.url)
        loader.add_xpath("title", '//h2/a/text()')
        # A single post contains several images.
        loader.add_xpath("image_urls", '//div[@id="picture"]/p/img/@src')
        return loader.load_item()

    def parse(self, response):
        """Queue every post on the current list page, then follow the next page."""
        post_links = response.xpath(
            '//li[@class="wp-item"]//div[@class="pic"]//a[1]/@href').extract()
        for post_link in post_links:
            yield scrapy.Request(post_link, callback=self.tiezi_detail)
        # Pagination: the site has 72 list pages in total.
        if self.page < 72:
            self.page += 1
            next_url = self.url + str(self.page) + ".html"
            yield scrapy.Request(next_url, callback=self.parse)
pipelines
import os
import requests
import json
#1.保存文本-json数据
class MeizituPipeline(object):
    """Download each item's images into ./images and append the item
    as one JSON line to 妹子图.json.
    """

    def open_spider(self, spider):
        # One JSON object per line; UTF-8 + ensure_ascii=False keeps the
        # Chinese titles human-readable in the output file.
        self.file = open("妹子图.json", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        """Download item["image_urls"], record the local paths in
        item["image_paths"], persist the item as JSON, and return it.
        """
        # Images live under <cwd>/images.  (The original code called
        # os.path.realpath("__file__") with the *string* "__file__", which
        # also resolved relative to the current working directory.)
        image_store = os.path.join(os.getcwd(), "images")
        # Race-free "create if missing" instead of exists() + makedirs().
        os.makedirs(image_store, exist_ok=True)

        # Local paths of this item's downloaded images.
        image_paths = []
        for image_url in item["image_urls"]:
            # http://mm.chinasareview.com/wp-content/uploads/2017a/06/14/03.jpg
            # -> mm.chinasareview.com_wp-content_uploads_2017a_06_14_03.jpg
            file_name = "_".join(image_url.split("/")[2:])
            # os.path.join is portable; the original hard-coded "\\" and only
            # worked on Windows.
            image_path = os.path.join(image_store, file_name)
            # Skip the download when the file is already on disk.
            if os.path.exists(image_path):
                image_paths.append(image_path)
                continue
            # stream=True so iter_content actually streams; timeout so a dead
            # server cannot hang the pipeline forever.
            response = requests.get(image_url, stream=True, timeout=30)
            if response.status_code == 200:
                # "with" closes the file even if a chunk write fails
                # (the original leaked the handle).
                with open(image_path, "wb") as image_file:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        image_file.write(block)
                image_paths.append(image_path)

        item["image_paths"] = image_paths
        # Append the item as a single JSON line.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item
settings.py
# Register the image-download pipeline (priority 300).
ITEM_PIPELINES = {
'Meizitu.pipelines.MeizituPipeline': 300,
}
# Do not obey robots.txt for this crawl.
ROBOTSTXT_OBEY = False
下一篇: Why Kotlin?