
Image downloader spider with ItemLoader


Approach

http://www.meizitu.com/a/more_1.html

Image downloader spider
1. Technologies used: Scrapy, requests (for downloading the images), ItemLoader
2. Creating the project: scrapy startproject Meizitu
   Command to create the spider: scrapy genspider meizi www.meizitu.com
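After these two commands the project layout looks roughly like this (it may differ slightly by Scrapy version); main.py shown further below is added manually at the project root:

Meizitu/
    scrapy.cfg
    main.py              # added manually, runs the spider from an IDE
    Meizitu/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            meizi.py     # generated by scrapy genspider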


3. Scraping targets:

  Post link: url
  Post title: title

  Images on a post's detail page and their links:
  All image links in a post: image_urls
  Locations where the images are saved on disk: image_paths

4. Writing the XPath expressions (they can be checked interactively, as sketched below)
  Post title: title                          xpath: //h2/a/text()
  All image links in a post: image_urls      xpath: //div[@id="picture"]/p/img/@src
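Before wiring these into the spider, you can verify them with scrapy shell. A minimal sketch; the detail-page URL below is only a placeholder, use any real post link taken from a listing page:

scrapy shell "http://www.meizitu.com/a/<some-post>.html"            # placeholder: a post detail page
>>> response.xpath('//h2/a/text()').extract()                       # the post title
>>> response.xpath('//div[@id="picture"]/p/img/@src').extract()     # all image links in the post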

main.py

# run the spider from an IDE or script instead of the command line
from scrapy import cmdline
cmdline.execute("scrapy crawl meizi".split())

items.py

import scrapy

class MeizituItem(scrapy.Item):
    # define the fields for your item here like:

    # post link
    url = scrapy.Field()
    # post title
    title = scrapy.Field()
    # all image links in a post
    image_urls = scrapy.Field()
    # locations where the downloaded images are saved
    image_paths = scrapy.Field()

meizi.py

import scrapy
from scrapy.loader import ItemLoader
from Meizitu.items import MeizituItem


class MeiziSpider(scrapy.Spider):

    name = 'meizi'
    allowed_domains = ['www.meizitu.com']

    # building the listing-page URLs:
    # http://www.meizitu.com/a/more_2.html
    # http://www.meizitu.com/a/more_3.html

    page = 1
    url = "http://www.meizitu.com/a/more_"
    start_urls = [url+str(page)+".html"]

    # parse the data of a single post
    def tiezi_detail(self, response):

        url = response.url

        item = ItemLoader(item=MeizituItem(), response=response)
        # add a literal value directly
        item.add_value("url", url)
        # add the title
        item.add_xpath("title", '//h2/a/text()')
        # a post contains several images
        item.add_xpath("image_urls", '//div[@id="picture"]/p/img/@src')

        return item.load_item()



    def parse(self, response):
        # collect the links to all posts on this listing page
        for tiezi_url in response.xpath('//li[@class="wp-item"]//div[@class="pic"]//a[1]/@href').extract():
            yield scrapy.Request(tiezi_url, callback=self.tiezi_detail)

        # request the next listing page (72 pages in total)
        if self.page < 72:
            self.page += 1
            next_url = self.url + str(self.page) + ".html"
            yield scrapy.Request(next_url, callback=self.parse)
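Note that ItemLoader collects every field as a list by default, so url and title end up as one-element lists in the output. If you prefer plain strings you can attach output processors in items.py. A minimal sketch, assuming the classic import path (newer Scrapy versions expose the same processors from the itemloaders package):

import scrapy
from scrapy.loader.processors import TakeFirst  # or: from itemloaders.processors import TakeFirst

class MeizituItem(scrapy.Item):
    # keep only the first extracted value for single-valued fields
    url = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    # these stay lists: one entry per image
    image_urls = scrapy.Field()
    image_paths = scrapy.Field()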

pipelines.py

import os
import requests
import json

# 1. save the scraped text as JSON
class MeizituPipeline(object):

    def open_spider(self,spider):
        self.file = open("妹子图.json","w",encoding="utf-8")

    def close_spider(self,spider):
        self.file.close()


    def process_item(self, item, spider):
        print("item===", item)
        # directory for the downloaded images, next to this file
        image_store = os.path.join(os.path.dirname(os.path.realpath(__file__)), "images")

        # create the /images directory if it does not exist yet
        if not os.path.exists(image_store):
            os.makedirs(image_store)

        # local paths of the downloaded images (locations on disk)
        image_paths = []

        for image_url in item["image_urls"]:

            # http://mm.chinasareview.com/wp-content/uploads/2017a/06/14/03.jpg
            # --> mm.chinasareview.com_wp-content_uploads_2017a_06_14_03.jpg
            # build the file name the image will be saved under
            file_name = image_url.split("/")[2:]
            file_name = "_".join(file_name)
            print(file_name)

            # if the image already exists on disk, skip the download
            image_path = os.path.join(image_store, file_name)

            if os.path.exists(image_path):
                # record the existing path in image_paths
                image_paths.append(image_path)
                continue  # skip this download

            # the image is not on disk yet, so download it with requests
            response = requests.get(image_url, stream=True)

            if response.status_code == 200:
                # write the image to disk in 1024-byte chunks
                with open(image_path, "wb") as image_file:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        image_file.write(block)

                # download finished, record the path
                image_paths.append(image_path)

        item["image_paths"] = image_paths


        # write the item as one JSON line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")

        return item
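As an alternative to downloading with requests, Scrapy's built-in ImagesPipeline (it requires Pillow) can handle the downloading and storage itself. A minimal sketch of what that swap might look like; the class name and settings values here are illustrative, not part of the original project:

# pipelines.py -- alternative based on Scrapy's built-in ImagesPipeline
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class MeizituImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # one download request per image URL collected by the spider
        for image_url in item["image_urls"]:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # keep the relative paths of the successfully downloaded files
        item["image_paths"] = [res["path"] for ok, res in results if ok]
        return item

# settings.py -- point ITEM_PIPELINES at the new class and tell it where to store files
# ITEM_PIPELINES = {'Meizitu.pipelines.MeizituImagesPipeline': 1}
# IMAGES_STORE = 'images'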

settings.py

ITEM_PIPELINES = {
   'Meizitu.pipelines.MeizituPipeline': 300,
}

# do not obey robots.txt
ROBOTSTXT_OBEY = False
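Two settings that are not part of the original configuration but are commonly added for an image-heavy site like this; the values are only illustrative:

# illustrative additions, not from the original article
DOWNLOAD_DELAY = 1          # pause between requests to avoid hammering the site
USER_AGENT = 'Mozilla/5.0'  # a browser-like User-Agent string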

