Scraping Douban Movies with Scrapy
I had just finished going through the Scrapy framework and wanted a small project to practice on. A recent Django project of mine happened to need data for movie recommendations, so scraping Douban's movie listings was the natural choice. The URL to crawl is:
https://movie.douban.com/top250
Create the project: scrapy startproject doubanMovie
Then change into the project directory and generate a spider: scrapy genspider movie movie.douban.com
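After both commands, the generated layout should look roughly like this (exact contents vary slightly by Scrapy version):

doubanMovie/
    scrapy.cfg
    doubanMovie/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            movie.py    # created by genspider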
The code for each .py file follows:
# settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'doubanMovie'
SPIDER_MODULES = ['doubanMovie.spiders']
NEWSPIDER_MODULE = 'doubanMovie.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Set a browser User-Agent so Douban does not reject the requests outright
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

# Obey robots.txt rules
# This is a personal practice spider, so robots.txt is ignored for now
ROBOTSTXT_OBEY = False

# Override the default request headers:
# Enable the default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the item pipeline
ITEM_PIPELINES = {
    'doubanMovie.pipelines.DoubanmoviePipeline': 300,
}
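To reduce the risk of the 403 bans described at the end of this post, it also helps to throttle the crawl here in settings.py. These are standard Scrapy settings; the values below are just reasonable starting points, not tuned numbers:

# Slow the crawl down to make an IP ban less likely
DOWNLOAD_DELAY = 2                  # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True     # jitter the delay (0.5x to 1.5x)
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # one in-flight request per domain
AUTOTHROTTLE_ENABLED = True         # adapt the delay to observed latency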
# items.py
# -*- coding: utf-8 -*-
import scrapy

class DoubanmovieItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()          # movie title
    release_time = scrapy.Field()  # release date
    director = scrapy.Field()      # director
    length = scrapy.Field()        # running time
    imdb_link = scrapy.Field()     # IMDb link
    mark = scrapy.Field()          # rating
    cover_link = scrapy.Field()    # cover image URL
    summary = scrapy.Field()       # synopsis
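The spider below actually fills a plain dict rather than this Item class; Scrapy accepts either. A minimal sketch of the Item-based alternative, with placeholder values:

from doubanMovie.items import DoubanmovieItem

item = DoubanmovieItem()
item['name'] = 'Example Title'  # placeholder value
item['mark'] = '9.0'            # placeholder value
# Note: json.dumps cannot serialize a scrapy.Item directly; the pipeline
# below would then need json.dumps(dict(item), ...) instead.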
# movie.py -- the spider itself
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse

class MovieSpider(scrapy.Spider):
    name = 'movie'  # spider name
    # allowed_domains = ['movie.douban.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }
    page = 1  # pagination counter

    def start_requests(self):
        url = 'https://movie.douban.com/top250'
        # yield scrapy.Request(url=url, headers=self.headers, meta={'proxy': 'http://118.117.136.19:9000'})
        yield scrapy.Request(url=url, headers=self.headers)

    # Parse a listing page
    def parse(self, response):
        movie_ol = response.xpath('//ol[@class="grid_view"]/li/div/div[2]')
        for div in movie_ol:
            # movie title and rating
            item = {
                'name': div.xpath('.//a/span[1]/text()').extract_first(),
                'mark': div.xpath('.//span[@class="rating_num"]/text()').extract_first(),
            }
            # URL of the movie's detail page
            detail_url = div.xpath('.//a/@href').extract_first()
            # fetch the detail page, passing the partial item along via meta
            yield scrapy.Request(url=detail_url, callback=self.parse_info,
                                 meta={'item': item}, dont_filter=True)
        # Pagination: the Top 250 spans 10 pages of 25 entries each.
        # start_requests already fetched the first page (start=0),
        # so the next offset is page * 25.
        if self.page < 10:
            data = parse.urlencode({'start': self.page * 25})
            next_url = 'https://movie.douban.com/top250?' + data
            self.page += 1
            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)

    # Parse the detail page of each movie
    def parse_info(self, response):
        item = response.meta['item']
        # The positional XPaths (span[10], span[13]) depend on the exact
        # layout of the #info block and break easily if Douban changes it.
        director = response.xpath('//div[@class="article"]//div[@id="info"]/span[1]/span[2]/a/text()').extract_first()
        release_time = response.xpath('//div[@class="article"]//div[@id="info"]/span[10]/text()').extract_first()
        length = response.xpath('//div[@class="article"]//div[@id="info"]/span[13]/text()').extract_first()
        imdb_link = response.xpath('//div[@class="article"]//div[@id="info"]/a[1]/@href').extract_first()
        cover_link = response.xpath('//div[@class="article"]//div[@id="mainpic"]//img/@src').extract_first()
        summary = response.xpath('//div[@class="article"]//div[@id="link-report"]/span[1]/text()').extract_first()
        item['director'] = director
        item['release_time'] = release_time
        item['length'] = length
        item['imdb_link'] = imdb_link
        item['cover_link'] = cover_link
        item['summary'] = summary
        # Throttling belongs in settings.py (DOWNLOAD_DELAY); calling
        # time.sleep here would block Scrapy's async event loop.
        yield item
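Because the detail-page extraction leans on those positional XPaths, it is worth verifying them interactively before a full crawl. scrapy shell is handy for this; since Douban rejects Scrapy's default User-Agent, pass a browser one with -s (the UA string below is abbreviated):

scrapy shell -s USER_AGENT='Mozilla/5.0 (Windows NT 10.0; WOW64)' 'https://movie.douban.com/top250'
>>> response.xpath('//span[@class="rating_num"]/text()').extract_first()

If the XPath is still valid, this prints the first film's rating string.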
# pipelines.py
# -*- coding: utf-8 -*-
import json

class DoubanmoviePipeline(object):
    # open the output file once, when the spider starts
    def open_spider(self, spider):
        self.fp = open('movie.json', 'w', encoding='utf-8')

    # close the file when the spider finishes
    def close_spider(self, spider):
        self.fp.close()

    # The item parameter is whatever the parse methods in movie.py yield;
    # this method is called once per yielded item.
    def process_item(self, item, spider):
        # the spider yields plain dicts, so json.dumps works directly
        string = json.dumps(item, ensure_ascii=False, indent=4)
        self.fp.write(string + '\n')
        return item
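As an aside, Scrapy's built-in feed export can write the items out without a custom pipeline at all (the output format differs slightly: one JSON array rather than concatenated objects):

scrapy crawl movie -o movie.json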
Run the spider: scrapy crawl movie
The first few test runs returned results, but after a few more Douban apparently banned my IP and every request came back with a 403 error. Most likely the requests were going out too fast; adding headers did not help, and switching to a proxy IP finally fixed it. If you hit the same problem while running the program, just change your IP.
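A minimal sketch of the proxy fix, using the meta['proxy'] mechanism hinted at in the commented-out line of start_requests; Scrapy's built-in HttpProxyMiddleware picks this key up per request. The address is a placeholder and must be replaced with a live proxy of your own:

    def start_requests(self):
        url = 'https://movie.douban.com/top250'
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            meta={'proxy': 'http://118.117.136.19:9000'},  # placeholder proxy
        )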