Scraping Douban Movie Data with a Python Web Crawler
1. Douban movie data is loaded dynamically, so JSON parsing is needed
Take https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action= as an example (the ranking page for the 剧情 / drama category).
Open the page in Firefox and bring up the developer tools: in the Network panel, filter for XHR requests and click through the ranking pages until the data request appears.
That request's address is the JSON URL that loads the dynamic data. As you click through different pages, you will see its query string change: start is the index of the first movie to return, and limit is how many movies are returned per page. To keep the parsing simple, the program below sets limit=1 and drives start with a loop whose upper bound is the total number of movies in this ranking (654 here). A quick check of the endpoint is sketched below.
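As a quick sanity check, you can request the endpoint once and confirm that it returns a JSON array (a minimal sketch; the start/limit values and the trimmed user-agent are just placeholders):

# Minimal check of the endpoint found above
import requests

url = ("https://movie.douban.com/j/chart/top_list"
       "?type=11&interval_id=100%3A90&action=&start=0&limit=20")
response = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, verify=False)
print(response.json()[:2])  # the body is a JSON array of movie objects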
After formatting, the JSON data has the shape sketched below; the program extracts its information from these fields.
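This is an illustrative sketch only: the field names match the keys used in the code that follows, but every value is a placeholder. After json.loads, each element of the returned array is a dict shaped like this:

# Shape of one element of the parsed JSON array (placeholder values)
{
    "id": "1000000",                                    # movie id
    "title": "...",                                     # title
    "regions": ["..."],                                 # countries/regions
    "types": ["..."],                                   # genres
    "release_date": "2000-01-01",                       # release date
    "score": "9.0",                                     # Douban rating
    "cover_url": "https://img1.doubanio.com/...",       # poster image URL
    "url": "https://movie.douban.com/subject/1000000/", # detail page URL
    "actors": ["...", "..."]                            # cast list
}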
Source code:
#coding:utf-8
import csv
import json

import jsonpath
import requests


# Parse the JSON response and append the extracted records to a CSV file
def parseContent(url, headers):
    # Douban is served over https; verify=False skips SSL certificate verification
    response = requests.get(url, headers=headers, verify=False)
    # Read the response body
    html = response.text
    # Convert the JSON string into a Python object (a list of movie dicts)
    html = json.loads(html)
    # Extract each field and append it to the CSV file
    with open('movie_data.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Pull each field out of the JSON with a jsonpath expression
        id = jsonpath.jsonpath(html, '$..id')
        title = jsonpath.jsonpath(html, '$..title')
        regions = jsonpath.jsonpath(html, '$..regions')
        types = jsonpath.jsonpath(html, '$..types')
        release_date = jsonpath.jsonpath(html, '$..release_date')
        score = jsonpath.jsonpath(html, '$..score')
        cover_url = jsonpath.jsonpath(html, '$..cover_url')
        url = jsonpath.jsonpath(html, '$..url')
        actors = jsonpath.jsonpath(html, '$..actors')
        print(id, title, regions, types, release_date, score, cover_url, url, actors)
        # Write one row to the CSV file
        writer.writerow([id, title, regions, types, release_date, score, cover_url, url, actors])


if __name__ == "__main__":
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.120 Safari/537.36'
    }
    with open('movie_data.csv', 'w+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the CSV header row (same column order as the data rows)
        writer.writerow(['id', 'title', 'regions', 'types', 'release_date', 'score', 'cover_url', 'url', 'actors'])
    for start in range(79, 654):
        url = ("https://movie.douban.com/j/chart/top_list"
               "?type=11&interval_id=100%3A90&action=&start={}&limit=1".format(start))
        print(url)
        parseContent(url, headers)
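One side note on the code above: with verify=False, requests emits an InsecureRequestWarning on every call. If the warnings bother you, urllib3 (which requests depends on) provides a switch to silence them:

# Optional: suppress the InsecureRequestWarning caused by verify=False
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)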
2. Shortcomings of the source code
1. If you want the information for every movie ranking on Douban, you have to paste in a new page's URL each time.
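One way around this, sketched below, is to parameterize the category id (type) in the JSON URL instead of hard-coding type=11. Only type=11 (剧情) is confirmed by the example above; any other category ids, and each category's movie total, would have to be read off the corresponding typerank page. The sketch reuses parseContent and headers from the code above:

# Sketch: loop over several ranking categories instead of editing the URL by hand
CATEGORY_IDS = [11]  # 11 = 剧情; extend with ids collected from other typerank URLs

for type_id in CATEGORY_IDS:
    for start in range(0, 654):  # 654 is the drama total; other categories differ
        url = ("https://movie.douban.com/j/chart/top_list"
               "?type={}&interval_id=100%3A90&action=&start={}&limit=1"
               .format(type_id, start))
        parseContent(url, headers)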
3. Optimized code
#coding:utf-8
import csv
import json

import requests


# Parse the JSON response and append the extracted records to a CSV file
def parseContent(url, headers):
    # Douban is served over https; verify=False skips SSL certificate verification
    response = requests.get(url, headers=headers, verify=False)
    # Read the response body
    html = response.text
    # Convert the JSON string into a Python object (a list of movie dicts)
    html = json.loads(html)
    # Extract each field and append it to the CSV file
    with open('movie_data1.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Iterate over the list of movies directly instead of using jsonpath
        for movie in html:
            id = movie["id"]
            title = movie["title"]
            regions = movie["regions"]
            types = movie["types"]
            release_date = movie["release_date"]
            score = movie["score"]
            cover_url = movie["cover_url"]
            url = movie["url"]
            actors = movie["actors"]
            print(id, title, regions, types, release_date, score, cover_url, url, actors)
            # Write one row to the CSV file
            writer.writerow([id, title, regions, types, release_date, score, cover_url, url, actors])


if __name__ == "__main__":
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.120 Safari/537.36'
    }
    with open('movie_data1.csv', 'w+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the CSV header row (same column order as the data rows)
        writer.writerow(['id', 'title', 'regions', 'types', 'release_date', 'score', 'cover_url', 'url', 'actors'])
    for start in range(0, 654):
        url = ("https://movie.douban.com/j/chart/top_list"
               "?type=11&interval_id=100%3A90&action=&start={}&limit=1".format(start))
        print(url)
        print(start)
        parseContent(url, headers)
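Finally, an efficiency note: with limit=1 the loop above makes 654 separate requests. Assuming the endpoint honors larger limit values (it appears to, but treat that as an assumption to verify), the same data can be pulled in a handful of requests, since parseContent already iterates over every movie in a response:

# Sketch: fetch 100 movies per request instead of 1
for start in range(0, 654, 100):
    url = ("https://movie.douban.com/j/chart/top_list"
           "?type=11&interval_id=100%3A90&action=&start={}&limit=100"
           .format(start))
    parseContent(url, headers)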