(Crawler Basics) Scraping the Maoyan Top 100 Movies
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Date: 2019/11/11 0011 12:40
# Author: Mijiu
# Version: 1.0
import csv
import os

import requests
from lxml import etree


# Fetch one page of the Maoyan Top 100 board (10 movies per page,
# paginated via the "offset" query parameter) by Rosny 2019-11-11
def Get_page(num=0):
    pr = {
        "offset": num * 10
    }
    url = 'https://maoyan.com/board/4'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    }
    response = requests.get(url, headers=header, params=pr)
    return response.text


# Parse the page with XPath and clean the data along the way by Rosny 2019-11-11
def Get_data(data):
    html = etree.HTML(data)
    data_MV = html.xpath('//body//img[@class="board-img"]')
    # Movie title and poster URL
    MV_list = list()
    Net_list = list()
    for i in data_MV:
        MV_list.append(i.xpath("./@alt"))
        Net_list.append(i.xpath("./@data-src"))
    # Cast
    data_name = html.xpath('//p[@class="star"]')
    name = list()  # list of cast strings
    for j in data_name:
        name.append(j.xpath("./text()")[0].strip())
    # Release date (the <p> immediately following the cast paragraph)
    time_MV = list()
    for y in data_name:
        time_MV.append(y.xpath("./following-sibling::p[1]")[0].xpath("./text()"))
    # Rating
    grade = list()
    data_grade = html.xpath('//i[@class="integer"]/text()')    # integer part
    data_grade2 = html.xpath('//i[@class="fraction"]/text()')  # decimal part
    num = 0
    for i in data_grade:
        grade.append(i + data_grade2[num])
        num += 1
    top = list()
    for i in range(10):
        data__ = f'{MV_list[i][0]}${Net_list[i][0]}${name[i]}${time_MV[i][0]}${grade[i]}'
        top.append(data__)
    top2_0 = list()  # final list of rows, e.g. [[...], [...]]
    for i in top:
        top2_0.append(i.split("$"))
    return top2_0


# Store the cleaned data as CSV
def Csv_data(data):
    os.makedirs("maoyan_top100", exist_ok=True)  # make sure the output directory exists
    with open("maoyan_top100/top100.csv", "a", newline="", encoding="utf-8") as cf:
        wf = csv.writer(cf)
        wf.writerows(data)


# Download the poster images
def Get_img():
    data = list()
    for i in range(10):
        for j in Get_data(Get_page(i)):
            data.append(j)
    os.makedirs("maoyan_top100/img", exist_ok=True)
    for i in data:
        response = requests.get(i[1])
        with open(f"maoyan_top100/img/{i[0]}.jpg", "wb") as f:
            f.write(response.content)


# Main entry point
def main():
    for i in range(10):
        Csv_data(Get_data(Get_page(i)))
    Get_img()


if __name__ == '__main__':
    main()
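
The XPath logic above can be sanity-checked offline before hitting the site. The snippet below is a minimal sketch, not part of the original script: it runs the same selectors against a hand-written stand-in for one board entry (the markup and class names are an assumption modelled on the expressions in Get_data, not a copy of the real Maoyan page) and prints the fields they return.

# Offline check of the XPath selectors used in Get_data.
# The sample markup below is an assumed, simplified stand-in for one <dd> entry.
from lxml import etree

sample = '''
<dd>
  <img class="board-img" alt="霸王别姬" data-src="https://example.com/poster.jpg"/>
  <p class="star">
    主演：张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <i class="integer">9.</i><i class="fraction">5</i>
</dd>
'''

html = etree.HTML(sample)
title = html.xpath('//img[@class="board-img"]/@alt')[0]
poster = html.xpath('//img[@class="board-img"]/@data-src')[0]
star = html.xpath('//p[@class="star"]/text()')[0].strip()
# release date: the <p> immediately following the cast paragraph, as in Get_data
release = html.xpath('//p[@class="star"]/following-sibling::p[1]/text()')[0]
score = (html.xpath('//i[@class="integer"]/text()')[0]
         + html.xpath('//i[@class="fraction"]/text()')[0])
print([title, poster, star, release, score])
# prints roughly: ['霸王别姬', 'https://example.com/poster.jpg', '主演：张国荣,张丰毅,巩俐',
#                  '上映时间：1993-01-01', '9.5']

These are the same five fields, in the same order, that Get_data joins with "$" and then splits back into CSV rows.
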
Using regular expressions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Date: 2019/11/7 0007 14:18
# Author: Mijiu
# Version: 1.0
import csv
import os
import re

import requests


# Fetch one page of the Maoyan Top 100 board by Rosny 2019-11-7
def Get_page(num=0):
    pr = {
        "offset": num * 10
    }
    url = 'https://maoyan.com/board/4'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    }
    response = requests.get(url, headers=header, params=pr)
    return response.text


# Parse the page with a regular expression by Rosny 2019-11-7
def Get_data(data):
    reobj = re.findall(r'<dd.*?title="(.*?)".*?data-src="(.*?)".*?star">(.*?)<.*?time">(.*?)<.*?ger">(.*?)<.*?tion">(\d)', data, re.S)
    return reobj


# Data cleaning by Rosny 2019-11-7
def Data_cleaning(data):
    data_lis = list()
    for i in data:
        data_lis.append(list(i))
    for j in data_lis:
        j[2] = j[2].strip()   # strip whitespace around the cast string
        j[4] = j[4] + j[5]    # join the integer and decimal parts of the rating
        j.pop()               # drop the now-redundant decimal field
    return data_lis


# Store the cleaned data as CSV
def Csv_data(data):
    with open("file.csv", "a", newline="", encoding="utf-8") as cf:
        wf = csv.writer(cf)
        wf.writerows(data)


# Download the poster images
def Get_img():
    data = list()
    for i in range(10):
        for j in Data_cleaning(Get_data(Get_page(i))):
            data.append(j)
    os.makedirs("img", exist_ok=True)  # make sure the output directory exists
    num = 1
    for i in data:
        response = requests.get(i[1])
        with open(f"img/{num}.jpg", "wb") as f:
            f.write(response.content)
        num += 1


# Main entry point
def main():
    for i in range(10):
        Csv_data(Data_cleaning(Get_data(Get_page(i))))


if __name__ == '__main__':
    # main()
    Get_img()
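
As with the XPath version, the regular expression can be tested offline. The snippet below is a minimal sketch, not part of the original script: it runs the same pattern against an assumed, hand-written <dd> block (the markup is modelled on the groups the pattern expects, not copied from the real page) to show what the raw capture tuples look like before Data_cleaning runs.

# Offline check of the regular expression used in the regex version of Get_data.
# The sample markup is an assumed, simplified stand-in for one <dd> entry.
import re

sample = '''
<dd>
  <a title="霸王别姬" href="/films/1203">
    <img class="board-img" alt="霸王别姬" data-src="https://example.com/poster.jpg"/>
  </a>
  <p class="star">
    主演：张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <i class="integer">9.</i><i class="fraction">5</i>
</dd>
'''

pattern = r'<dd.*?title="(.*?)".*?data-src="(.*?)".*?star">(.*?)<.*?time">(.*?)<.*?ger">(.*?)<.*?tion">(\d)'
rows = re.findall(pattern, sample, re.S)
print(rows)
# prints roughly: [('霸王别姬', 'https://example.com/poster.jpg',
#                   '\n    主演：张国荣,张丰毅,巩俐\n  ',
#                   '上映时间：1993-01-01', '9.', '5')]
# Data_cleaning then strips the whitespace around the cast field
# and merges '9.' + '5' into a single '9.5' rating column.
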