360 Wallpapers
1. Create the crawler project
scrapy startproject images
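Once the project exists, the spider skeleton used in step 3 can be generated inside it (a sketch; the domain argument just seeds allowed_domains):

cd images
scrapy genspider images_spider image.so.com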
2. Write the items file (the fields we need)
import scrapy


class ImagesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # image title
    pic = scrapy.Field()    # image URL
3. The spider file (parse the data, follow the next-page URL)
# -*- coding: utf-8 -*-
import scrapy
import json

from ..items import ImagesItem


class ImagesSpiderSpider(scrapy.Spider):
    name = 'images_spider'
    # allowed_domains = ['image.so.com']
    # ch=wallpaper is the wallpaper channel; the original start URL used ch=beauty,
    # which did not match the next-page URL built below
    start_urls = ['https://image.so.com/zjl?ch=wallpaper&sn=0&listtype=new&temp=1']
    offset = 0

    def parse(self, response):
        data_list = json.loads(response.body).get('list')
        if not data_list:
            # an empty list means we have paged past the last screen
            return
        for data in data_list:
            item = ImagesItem()
            item['title'] = data['title']
            item['pic'] = data['qhimg_url']
            yield item
        # each screen holds 30 images; request the next one
        self.offset += 30
        next_url = 'https://image.so.com/zjl?ch=wallpaper&sn={}&pn=30'.format(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)
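For reference, parse() relies on the endpoint returning JSON of roughly this shape; only the keys read above ('list', 'title', 'qhimg_url') are certain, the rest of the structure is an assumption:

{
    "list": [
        {"title": "...", "qhimg_url": "https://..."},
        {"title": "...", "qhimg_url": "https://..."}
    ]
}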
4. Pipeline storage (downloading the images)
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # request the download of the image the item points at
        img_url = item["pic"]
        yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples
        img_path = [x["path"] for ok, x in results if ok]
        store = info.spider.settings.get('IMAGES_STORE')
        try:
            # the file is stored under a hash name; rename it to the image title
            old_path = os.path.join(store, img_path[0])
            new_path = os.path.join(store, item["title"] + ".jpg")
            os.rename(old_path, new_path)
        except Exception:
            pass  # download failed, or a file with this title already exists
        return item
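Renaming after download can collide when two images share a title; an alternative sketch (not from the original article; the class name TitleNamePipeline is hypothetical) overrides file_path so the file is written under the right name in the first place:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class TitleNamePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # carry the title along with the download request
        yield scrapy.Request(item["pic"], meta={"title": item["title"]})

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # save directly as <title>.jpg instead of the default hash name
        return '{}.jpg'.format(request.meta["title"])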
5. Saving to MySQL
import pymysql


class MysqlPipeline(object):
    def __init__(self, db, host, port, user, password):
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection details from settings.py (see step 7)
        return cls(
            user=crawler.settings.get('USER'),
            password=crawler.settings.get('PASSWORD'),
            host=crawler.settings.get('HOST'),
            port=crawler.settings.get('PORT'),
            db=crawler.settings.get('DB'),
        )

    def open_spider(self, spider):
        self.client = pymysql.connect(host=self.host, user=self.user, db=self.db,
                                      password=self.password, port=self.port)
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        sql = 'insert into images360(title, pic) values(%s, %s)'
        try:
            self.cursor.execute(sql, (item["title"], item["pic"]))
            self.client.commit()
        except Exception:
            self.client.rollback()
        # return the item either way so later pipelines still receive it
        return item

    def close_spider(self, spider):
        self.client.close()
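The pipeline assumes the images360 table already exists in the spider database; a minimal sketch for creating it with the step 7 credentials (the id column and varchar sizes are assumptions):

import pymysql

client = pymysql.connect(host='localhost', user='root', password='qwe123',
                         db='spider', port=3306)
cursor = client.cursor()
cursor.execute(
    'create table if not exists images360('
    'id int primary key auto_increment, '
    'title varchar(255), '
    'pic varchar(512))'
)
client.commit()
client.close()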
6. Saving to MongoDB
import pymongo


class MongoPipeline(object):
    collection_name = '360'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # actually use the configured URI (the original hard-coded a default client)
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.collection_name]

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3.x; use insert_one()
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
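A quick way to check the writes from a Python shell (a sketch, reusing the MONGO_URI and MONGO_DB values from step 7):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['360']['360']
print(collection.count_documents({}))
print(collection.find_one())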
7. Settings file
# -*- coding: utf-8 -*-
# Scrapy settings for images project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'images'
SPIDER_MODULES = ['images.spiders']
NEWSPIDER_MODULE = 'images.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'images (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'images.middlewares.ImagesSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'images.middlewares.ImagesDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'images.pipelines.ImgPipeline': 300,
'images.pipelines.MongoPipeline': 400,
'images.pipelines.MysqlPipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# custom settings used by the pipelines above
IMAGES_STORE = '360images\\'  # download directory for ImagesPipeline (Windows-style path)
MONGO_URI = 'localhost'
MONGO_DB = '360'
USER = 'root'
PASSWORD = 'qwe123'
DB = 'spider'
PORT = 3306  # an int, not the string '3306' (see step 9)
HOST = 'localhost'
8. Run script
from scrapy import cmdline
cmdline.execute('scrapy crawl images_spider'.split())
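Equivalently, the crawl can be started through Scrapy's API instead of the CLI wrapper:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('images_spider')
process.start()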
9. Problems encountered
(1) A MySQL table name cannot start with a digit, hence images360 rather than 360images.
(2) The MySQL PORT must be the integer 3306, not the string '3306'.