Scrapy
Create a project
# scrapy startproject <project_name>
scrapy startproject scrapy_demo
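For reference, startproject generates a skeleton roughly like the following (recent Scrapy versions; minor details may vary):

scrapy_demo/
    scrapy.cfg              # deploy configuration
    scrapy_demo/            # the project's Python module
        __init__.py
        items.py            # item definitions
        middlewares.py      # spider / downloader middlewares
        pipelines.py        # item pipelines
        settings.py         # project settings
        spiders/            # spiders live here
            __init__.py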
Generate a spider
# scrapy genspider <spider_name> <domain>
cd scrapy_demo
scrapy genspider csdn "csdn.net"
Run a spider
# scrapy crawl <spider_name>
scrapy crawl csdn
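The built-in feed exports can also write scraped items straight to a file while crawling:

# export items to JSON (feed exports; format inferred from the extension)
scrapy crawl csdn -o items.json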
GET requests
import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        # Issue a follow-up GET request; meta carries data to the callback
        yield scrapy.Request(
            "http://www.baidu.com",
            callback=self.parse_detail,
            meta={"hello": "world"}
        )

    def parse_detail(self, response):
        """Detail page"""
        hello = response.meta["hello"]
        logger.warning(hello)
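Since Scrapy 1.7 there is also cb_kwargs, which delivers values as keyword arguments of the callback instead of going through meta; a minimal sketch (the spider name is made up):

import scrapy

class CbKwargsSpider(scrapy.Spider):
    name = 'cb_kwargs_demo'  # hypothetical spider name
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        yield scrapy.Request(
            "http://www.baidu.com",
            callback=self.parse_detail,
            cb_kwargs={"hello": "world"}  # passed as a keyword argument
        )

    def parse_detail(self, response, hello):
        self.logger.warning(hello)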
POST
import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # start_urls = ['https://www.csdn.net']

    def start_requests(self):
        post_data = dict(
            login_name="username",
            password="123456"
        )
        # FormRequest sends the data as an application/x-www-form-urlencoded POST
        yield scrapy.FormRequest(
            "http://www.baidu.com",
            callback=self.parse_detail,
            meta={"hello": "world"},
            formdata=post_data
        )

    def parse_detail(self, response):
        """Detail page"""
        hello = response.meta["hello"]
        logger.warning(hello)
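If the endpoint expects a JSON body rather than form fields, a plain scrapy.Request with method="POST" works; a sketch (URL and spider name are placeholders):

import json
import scrapy

class JsonPostSpider(scrapy.Spider):
    name = 'json_post_demo'  # hypothetical

    def start_requests(self):
        payload = {"login_name": "username", "password": "123456"}
        yield scrapy.Request(
            "http://www.baidu.com",  # placeholder URL
            method="POST",
            body=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            callback=self.parse_detail
        )

    def parse_detail(self, response):
        self.logger.warning(response.status)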
Logging
Basic logging
# scrapy_demo/settings.py
# Set the log level
LOG_LEVEL = "WARNING"
# Emit log output: scrapy_demo/spiders/csdn.py
import scrapy
import logging

class BaiduSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    # The first URL(s) to request
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        print("======response======")
        # Log output
        logging.warning("======warning======")
Writing logs to a file
# scrapy_demo/settings.py
# Route log output to a file
LOG_FILE = "./log.log"
# Emit log output: scrapy_demo/spiders/csdn.py
import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        print("======response======")
        logger.warning("======warning======")
Log format settings
# Set the log line format
LOG_FORMAT = "%(asctime)s [%(levelname)s]: %(message)s"
# Other log settings
# Enable/disable logging (default: True)
# LOG_ENABLED = False
# Log file encoding (default: utf-8)
# LOG_ENCODING = "gbk"
Logging setup in a plain Python project
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # let DEBUG and above reach the handlers
formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
fh = logging.FileHandler("./scrapy_demo/log.log", encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

if __name__ == "__main__":
    print("log")
    logger.error("error")
%(levelno)s: numeric log level
%(levelname)s: log level name
%(pathname)s: path of the running program (essentially sys.argv[0])
%(filename)s: filename of the running program
%(funcName)s: function that issued the log call
%(lineno)d: line number of the log call
%(asctime)s: time of the log call
%(thread)d: thread ID
%(threadName)s: thread name
%(process)d: process ID
%(message)s: the log message
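A quick sketch combining several of these placeholders via logging.basicConfig:

import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(filename)s:%(lineno)d [%(levelname)s] %(funcName)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

def do_work():
    # filename, line number and function name are filled in automatically
    logging.debug("inside do_work")

do_work()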
Custom items
# scrapy_demo/items.py
import scrapy

class ScrapyDemoItem(scrapy.Item):
    name = scrapy.Field()
Usage
import scrapy
import logging
from scrapy_demo.items import ScrapyDemoItem

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        item = ScrapyDemoItem()
        item["name"] = "hello"
        logger.warning(item.get("name"))
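In a real project the spider would yield the item so it flows into the item pipelines; a minimal sketch (the pipeline class name and the priority 300 are arbitrary choices):

# scrapy_demo/pipelines.py
class ScrapyDemoPipeline:
    def process_item(self, item, spider):
        # Inspect or persist the item here; return it (or raise DropItem)
        spider.logger.warning(item.get("name"))
        return item

# scrapy_demo/settings.py
ITEM_PIPELINES = {
    'scrapy_demo.pipelines.ScrapyDemoPipeline': 300,
}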
Defining variables in settings.py
# scrapy_demo/settings.py
MONGO_HOST = 'localhost'
# scrapy_demo/spiders/csdn.py
import scrapy
import logging

logger = logging.getLogger(__name__)

class BaiduSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net']

    def parse(self, response):
        # Spiders can read project settings through self.settings
        logger.warning(self.settings.get("MONGO_HOST"))
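Outside a spider (for example in a pipeline), the usual pattern is from_crawler, which receives the running crawler and its settings; a sketch (MongoPipeline is a hypothetical name):

# scrapy_demo/pipelines.py
class MongoPipeline:
    def __init__(self, mongo_host):
        self.mongo_host = mongo_host

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this when building the pipeline
        return cls(mongo_host=crawler.settings.get("MONGO_HOST"))

    def process_item(self, item, spider):
        spider.logger.warning(self.mongo_host)
        return item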
CrawlSpider
Generate a spider
# scrapy genspider -t crawl <spider_name> <domain>
scrapy genspider -t crawl csdn "csdn.net"
Rule
# scrapy_demo/spiders/csdn.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class CsdnSpider(CrawlSpider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net/']

    rules = (
        # LinkExtractor defines the URL extraction rule; multiple Rules are allowed
        # callback: method called for each matched response
        # allow: regex the extracted URLs must match
        # follow: whether links extracted from these responses are run through the rules again
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass
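A slightly fuller sketch with two rules, where list pages are only followed and detail pages get a callback (the allow patterns and spider name are made-up placeholders):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class NewsSpider(CrawlSpider):
    name = 'news'  # hypothetical
    allowed_domains = ['csdn.net']
    start_urls = ['https://www.csdn.net/']

    rules = (
        # Follow pagination/list links; no callback needed
        Rule(LinkExtractor(allow=r'/list/\d+'), follow=True),
        # Parse article detail pages
        Rule(LinkExtractor(allow=r'/article/\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}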
Downloader middleware
# Enable the middleware
# scrapy_demo/settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_demo.middlewares.ScrapyDemoDownloaderMiddleware': 543,
}
# scrapy_demo/middlewares.py
class ScrapyDemoDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called before the request is downloaded; a good place to add headers
        return None

    def process_response(self, request, response, spider):
        # Called once the download finishes
        return response
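For example, a middleware that rotates the User-Agent on every request (the UA strings are just stand-ins):

# scrapy_demo/middlewares.py
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Pick a User-Agent at random for each outgoing request
        request.headers["User-Agent"] = random.choice(USER_AGENTS)
        return None

# scrapy_demo/settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_demo.middlewares.RandomUserAgentMiddleware': 544,
}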