
360 Wallpapers


Target URL: https://image.so.com/ (360 image search; the spider below uses its JSON API)

1. Create the Scrapy project
scrapy startproject images
2. Write the items file (the fields we need)
import scrapy

class ImagesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    title = scrapy.Field()
    pic = scrapy.Field()
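Before writing the spider, it helps to peek at the JSON the 360 image API returns. Below is a minimal sketch for that check only (not part of the project; it assumes the requests library is installed, and the ch/sn parameters match the spider in the next step):

import json
import requests  # assumption: installed separately, used only for this quick check

resp = requests.get(
    'https://image.so.com/zjl?ch=wallpaper&sn=0&pn=30',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# 'title' and 'qhimg_url' are the keys mapped onto the item fields above
for entry in json.loads(resp.text).get('list', [])[:3]:
    print(entry.get('title'), entry.get('qhimg_url'))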
3. Spider file (parse the data, follow the next URL)
# -*- coding: utf-8 -*-
import scrapy
import json
from ..items import ImagesItem

class ImagesSpiderSpider(scrapy.Spider):
    name = 'images_spider'
    # allowed_domains = ['image.so.com']
    # JSON API of 360 image search: ch selects the channel (wallpaper),
    # sn is the offset into the result list, pn the page size
    start_urls = ['https://image.so.com/zjl?ch=wallpaper&sn=0&listtype=new&temp=1']
    offset = 0

    def parse(self, response):
        # the response is JSON; the image entries live under the "list" key
        data_list = json.loads(response.text).get('list')
        if not data_list:
            # an empty list means we have paged past the last result, so stop
            return

        for data in data_list:
            item = ImagesItem()
            item['title'] = data['title']
            item['pic'] = data['qhimg_url']
            yield item

        # advance the offset by one page (30 entries) and follow the next page
        self.offset += 30
        next_url = 'https://image.so.com/zjl?ch=wallpaper&sn={}&pn=30'.format(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)
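For a quick test you may not want to crawl the whole channel. An optional sketch using Scrapy's built-in CloseSpider extension setting, added as a class attribute on the spider above (the item count 60 is an arbitrary choice):

    # stop automatically after roughly two pages of items; handled by the
    # CloseSpider extension, so no extra code is needed
    custom_settings = {
        'CLOSESPIDER_ITEMCOUNT': 60,
    }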

4. Pipeline storage (image download)
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from .settings import IMAGES_STORE


class ImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # ask the images pipeline to download the picture URL collected by the spider
        img_url = item["pic"]
        yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples; collect the stored paths
        img_path = [x["path"] for ok, x in results if ok]
        try:
            # the file is saved as IMAGES_STORE/full/<sha1>.jpg; rename it to "<title>.jpg"
            old_path = os.path.join(IMAGES_STORE, img_path[0])
            new_path = os.path.join(IMAGES_STORE, item["title"] + ".jpg")
            os.rename(old_path, new_path)
        except Exception:
            # download failed, or the title contains characters not allowed in file names
            pass
        return item
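As an alternative to renaming after the fact, newer Scrapy versions let the pipeline choose the file name up front. A sketch assuming Scrapy >= 2.4 (where file_path receives the item as a keyword argument); the class name TitleImgPipeline is made up for illustration:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class TitleImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(item["pic"])

    def file_path(self, request, response=None, info=None, *, item=None):
        # store the image directly as "<title>.jpg" under IMAGES_STORE;
        # titles containing characters invalid in file names would still need cleaning
        return "{}.jpg".format(item["title"])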
5. Save to MySQL
import pymysql


class MysqlPipeline(object):

    def __init__(self, db, host, port, user, password):
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        # connection parameters are read from settings.py (see step 7)
        return cls(
            user=crawler.settings.get('USER'),
            password=crawler.settings.get('PASSWORD'),
            host=crawler.settings.get('HOST'),
            port=crawler.settings.get('PORT'),
            db=crawler.settings.get('DB'),
        )

    def open_spider(self, spider):
        # open one connection per spider run
        self.client = pymysql.connect(host=self.host, user=self.user, db=self.db,
                                      password=self.password, port=self.port)
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        sql = 'insert into images360(title, pic) values(%s, %s)'
        try:
            self.cursor.execute(sql, (item["title"], item["pic"]))
            self.client.commit()
        except Exception:
            # undo the failed insert and keep the spider running
            self.client.rollback()
        return item

    def close_spider(self, spider):
        self.client.close()
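The pipeline assumes the images360 table already exists. A one-off setup sketch (not from the original post; column sizes are assumptions) using the same connection values as settings.py in step 7:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='qwe123',
                       db='spider', port=3306, charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS images360 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            pic VARCHAR(512)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()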
6. Save to MongoDB
import pymongo


class MongoPipeline(object):

    collection_name = '360'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MongoDB connection info is read from settings.py (see step 7)
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # connect using the URI from settings instead of a hard-coded localhost
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.collection_name]

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert()
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
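A quick optional check, separate from the pipeline, that the documents actually landed in MongoDB; the database and collection names follow MONGO_DB = '360' and collection_name = '360' above:

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['360']['360']
print(collection.count_documents({}))   # how many items were stored
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc.get('title'), doc.get('pic'))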
7. Settings file (settings.py)
# -*- coding: utf-8 -*-

# Scrapy settings for images project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'images'

SPIDER_MODULES = ['images.spiders']
NEWSPIDER_MODULE = 'images.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'images (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'images.middlewares.ImagesSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'images.middlewares.ImagesDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'images.pipelines.ImgPipeline': 300,
   'images.pipelines.MongoPipeline': 400,
   'images.pipelines.MysqlPipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


IMAGES_STORE = '360images\\'

MONGO_URI = 'localhost'
MONGO_DB = '360'


USER = 'root'
PASSWORD = 'qwe123'
DB = 'spider'
PORT = 3306
HOST = 'localhost'
8. Run script
from scrapy import cmdline

cmdline.execute('scrapy crawl images_spider'.split())
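An equivalent way to run the spider programmatically, sketched with Scrapy's CrawlerProcess (run it from inside the project directory so get_project_settings can find settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('images_spider')
process.start()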
9. Problems encountered

(1) A MySQL table name should not start with a digit, hence images360 rather than 360images.
(2) The MySQL port must be the integer 3306, not the string '3306' (pymysql expects an int).
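If the port ever arrives as a string (for example from an environment variable), cast it when reading the settings; a one-line sketch for the from_crawler method in step 5:

port=crawler.settings.getint('PORT', 3306),  # settings.getint() guards against a string '3306'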
