Scrapy

Create a project

# scrapy startproject <project_name>
scrapy startproject scrapy_demo
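
For reference, startproject generates a project skeleton roughly like the following (the exact contents may vary slightly across Scrapy versions):

scrapy_demo/
    scrapy.cfg                # deploy configuration
    scrapy_demo/
        __init__.py
        items.py              # Item definitions
        middlewares.py        # spider / downloader middlewares
        pipelines.py          # item pipelines
        settings.py           # project settings
        spiders/
            __init__.py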

Generate a spider

# scrapy genspider <spider_name> <domain>
cd scrapy_demo
scrapy genspider csdn "csdn.net"
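
genspider writes a skeleton spider into scrapy_demo/spiders/csdn.py, roughly like this (the generated template differs slightly between Scrapy versions):

import scrapy

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['http://csdn.net/']

    def parse(self, response):
        pass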

Start a spider

# scrapy crawl <spider_name>
scrapy crawl baidu
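
A spider can also be started from a plain Python script instead of the CLI; a minimal sketch using Scrapy's CrawlerProcess (the script is assumed to sit next to scrapy.cfg so the project settings are found):

# run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("baidu")  # same spider name as `scrapy crawl baidu`
process.start()         # blocks until the crawl finishes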

GET requests

import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        # meta carries data from this request to the callback's response
        yield scrapy.Request(
            "http://www.baidu.com",
            callback=self.parse_detail,
            meta={"hello": "world"}
        )

    def parse_detail(self, response):
        """Detail page"""
        hello = response.meta["hello"]
        logger.warning(hello)
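
Since Scrapy 1.7, cb_kwargs is the preferred channel for passing data to a callback (meta remains useful for keys that middlewares inspect). A minimal sketch of the same hand-off:

    def parse(self, response):
        yield scrapy.Request(
            "http://www.baidu.com",
            callback=self.parse_detail,
            cb_kwargs={"hello": "world"},  # arrives as a keyword argument
        )

    def parse_detail(self, response, hello):
        logger.warning(hello)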

POST requests

import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # start_urls = ['https://www.csdn.net']

    def start_requests(self):
        post_data = dict(
            login_name="username",
            password="123456"
        )

        # FormRequest sends the data as a form-encoded POST body
        yield scrapy.FormRequest(
            "http://www.baidu.com",
            callback=self.parse_detail,
            meta={"hello": "world"},
            formdata=post_data
        )

    def parse_detail(self, response):
        """Detail page"""
        hello = response.meta["hello"]
        logger.warning(hello)
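
When the POST target is an HTML login form, scrapy.FormRequest.from_response can build the request from the page itself, picking up hidden fields automatically; a minimal sketch (the field names are placeholders, not a real endpoint):

    def parse(self, response):
        # response is assumed to contain the login <form>
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"login_name": "username", "password": "123456"},
            callback=self.parse_detail,
        )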

Logging

Basic logging

# scrapy_demo/settings.py
# Set the log level
LOG_LEVEL = "WARNING"
# Emit log output -- scrapy_demo/spiders/csdn.py
import scrapy
import logging

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    # URLs fetched first
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        print("======response======")
        # Log output
        logging.warning("======warning======")

Set a log output file

# scrapy_demo/settings.py
# Write log output to a file
LOG_FILE = "./log.log"
# Emit log output -- scrapy_demo/spiders/csdn.py
import scrapy
import logging

logger = logging.getLogger(__name__)

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        print("======response======")
        logger.warning("======warning======")

Log format settings

# Log message format
LOG_FORMAT = "%(asctime)s [%(levelname)s]: %(message)s"
# Other logging settings
# Whether logging is enabled (default: True)
# LOG_ENABLED = False
# Log file encoding (default: utf-8)
# LOG_ENCODING = "gbk"

Logging setup in a plain Python project

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # let the handler level do the filtering
formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
fh = logging.FileHandler("./scrapy_demo/log.log", encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

if __name__ == "__main__":
    print("log")
    logger.error("error")
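
To see the same messages on the console as well, a StreamHandler can be attached to the logger built above, alongside the FileHandler:

sh = logging.StreamHandler()
sh.setLevel(logging.DEBUG)
sh.setFormatter(formatter)
logger.addHandler(sh)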

%(levelno)s: numeric log level
%(levelname)s: log level name
%(pathname)s: full path of the source file where the logging call was made
%(filename)s: file name portion of the path
%(funcName)s: name of the function containing the logging call
%(lineno)d: line number of the logging call
%(asctime)s: time of the log record
%(thread)d: thread ID
%(threadName)s: thread name
%(process)d: process ID
%(message)s: the logged message
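
Combining a few of these placeholders, a quick standalone check of the resulting output:

import logging

logging.basicConfig(
    format="%(asctime)s %(filename)s:%(lineno)d [%(levelname)s] %(message)s",
    level=logging.DEBUG,
)
logging.warning("hello")
# prints something like: 2022-01-20 23:14:49,123 demo.py:8 [WARNING] hello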

Custom Item

# scrapy_demo/items.py
import scrapy

class ScrapyDemoItem(scrapy.Item):
    name = scrapy.Field()

Usage

import scrapy
import logging
from scrapy_demo.items import ScrapyDemoItem

logger = logging.getLogger(__name__)

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        item = ScrapyDemoItem()
        item["name"] = "hello"
        logger.warning(item.get("name"))
        yield item  # hand the item to the item pipelines
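
Yielded items are handed to the item pipelines. A minimal sketch of one (the class name here is illustrative; pipelines are enabled via ITEM_PIPELINES, where the number sets the running order):

# scrapy_demo/pipelines.py
class ScrapyDemoPipeline:
    def process_item(self, item, spider):
        # Called once per yielded item; must return the item (or raise
        # DropItem) so that later pipelines still receive it.
        print(item.get("name"))
        return item

# scrapy_demo/settings.py
ITEM_PIPELINES = {
    'scrapy_demo.pipelines.ScrapyDemoPipeline': 300,
}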

Setting variables in settings.py

# scrapy_demo/settings.py
MONGO_HOST = 'localhost'

# scrapy_demo/spiders/csdn.py
import scrapy
import logging

logger = logging.getLogger(__name__)

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        # self.settings exposes the project settings inside a spider
        logger.warning(self.settings.get("MONGO_HOST"))
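
Outside a spider, e.g. in a pipeline, settings are usually reached through the crawler. A minimal sketch (MongoPipeline is an illustrative name):

# scrapy_demo/pipelines.py
class MongoPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings is the fully populated settings object
        return cls(mongo_host=crawler.settings.get("MONGO_HOST"))

    def __init__(self, mongo_host):
        self.mongo_host = mongo_host

    def process_item(self, item, spider):
        return item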

CrawlSpider

Generate the spider

# scrapy genspider -t crawl <spider_name> <domain>
scrapy genspider -t crawl csdn "csdn.net"

Rule

# scrapy_demo/spiders/csdn.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class CsdnSpider(CrawlSpider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net/']

    rules = (
        # LinkExtractor defines a URL extraction rule; several Rule
        # entries may be listed.
        # allow: regex that extracted URLs must match
        # callback: method that handles the matched responses
        # follow: whether links found in those responses are run
        # through the rules again
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass
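
Rules compose, and a common pattern pairs a follow-only rule for list/pagination pages with a parsing rule for detail pages; the regexes below are placeholders:

    rules = (
        # No callback: these pages are only mined for further links
        # (follow defaults to True when callback is None).
        Rule(LinkExtractor(allow=r'list_\d+\.html'), follow=True),
        # With a callback, follow defaults to False.
        Rule(LinkExtractor(allow=r'article/\d+'), callback='parse_item'),
    )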

Downloader middleware

# Configuration -- scrapy_demo/settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_demo.middlewares.ScrapyDemoDownloaderMiddleware': 543,
}

# scrapy_demo/middlewares.py
class ScrapyDemoDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called before the request is downloaded; request headers can
        # be modified here. Returning None continues normal processing.
        return None

    def process_response(self, request, response, spider):
        # Called after the download finishes; must return a Response
        # (or a Request to reschedule).
        return response
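
A typical job for process_request is stamping headers onto every outgoing request, e.g. rotating the User-Agent; a minimal sketch (the UA strings are illustrative):

# scrapy_demo/middlewares.py
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(USER_AGENTS)
        return None  # let downloading continue normally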