(Crawler Basics) Scraping the Maoyan Top 100 Movies
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Date: 2019/11/11 0011 12:40
# Author: Mijiu
# Version: 1.0
import csv
import os

import requests
from lxml import etree


# Fetch one page of the Maoyan Top 100 board (10 movies per page,
# paginated via the "offset" query parameter) by Rosny 2019-11-11
def Get_page(num=0):
    pr = {
        "offset": num * 10
    }
    url = 'https://maoyan.com/board/4'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    }
    response = requests.get(url, headers=header, params=pr)
    return response.text


# Parse the page with XPath and clean the data along the way by Rosny 2019-11-11
def Get_data(data):
    html = etree.HTML(data)
    data_MV = html.xpath('//body//img[@class="board-img"]')
    # Movie title and poster URL
    MV_list = list()
    Net_list = list()
    for i in data_MV:
        MV_list.append(i.xpath("./@alt"))
        Net_list.append(i.xpath("./@data-src"))
    # Cast
    data_name = html.xpath('//p[@class="star"]')
    name = list()  # list of cast strings
    for j in data_name:
        name.append(j.xpath("./text()")[0].strip())
    # Release date (the <p> immediately following the cast paragraph)
    time_MV = list()
    for y in data_name:
        time_MV.append(y.xpath("./following-sibling::p[1]")[0].xpath("./text()"))
    # Rating
    grade = list()
    data_grade = html.xpath('//i[@class="integer"]/text()')    # integer part
    data_grade2 = html.xpath('//i[@class="fraction"]/text()')  # decimal part
    num = 0
    for i in data_grade:
        grade.append(i + data_grade2[num])
        num += 1
    top = list()
    for i in range(10):
        data__ = f'{MV_list[i][0]}${Net_list[i][0]}${name[i]}${time_MV[i][0]}${grade[i]}'
        top.append(data__)
    top2_0 = list()  # final list of rows, e.g. [[...], [...]]
    for i in top:
        top2_0.append(i.split("$"))
    return top2_0


# Store the cleaned data as CSV
def Csv_data(data):
    os.makedirs("maoyan_top100", exist_ok=True)  # make sure the output directory exists
    with open("maoyan_top100/top100.csv", "a", newline="", encoding="utf-8") as cf:
        wf = csv.writer(cf)
        wf.writerows(data)


# Download the poster images
def Get_img():
    data = list()
    for i in range(10):
        for j in Get_data(Get_page(i)):
            data.append(j)
    os.makedirs("maoyan_top100/img", exist_ok=True)
    for i in data:
        response = requests.get(i[1])
        with open(f"maoyan_top100/img/{i[0]}.jpg", "wb") as f:
            f.write(response.content)


# Main entry point
def main():
    for i in range(10):
        Csv_data(Get_data(Get_page(i)))
    Get_img()


if __name__ == '__main__':
    main()
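
The XPath logic above can be sanity-checked offline before hitting the site. The snippet below is a minimal sketch, not part of the original script: it runs the same selectors against a hand-written stand-in for one board entry (the markup and class names are an assumption modelled on the expressions in Get_data, not a copy of the real Maoyan page) and prints the fields they return.

# Offline check of the XPath selectors used in Get_data.
# The sample markup below is an assumed, simplified stand-in for one <dd> entry.
from lxml import etree

sample = '''
<dd>
  <img class="board-img" alt="霸王别姬" data-src="https://example.com/poster.jpg"/>
  <p class="star">
    主演：张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <i class="integer">9.</i><i class="fraction">5</i>
</dd>
'''

html = etree.HTML(sample)
title = html.xpath('//img[@class="board-img"]/@alt')[0]
poster = html.xpath('//img[@class="board-img"]/@data-src')[0]
star = html.xpath('//p[@class="star"]/text()')[0].strip()
# release date: the <p> immediately following the cast paragraph, as in Get_data
release = html.xpath('//p[@class="star"]/following-sibling::p[1]/text()')[0]
score = (html.xpath('//i[@class="integer"]/text()')[0]
         + html.xpath('//i[@class="fraction"]/text()')[0])
print([title, poster, star, release, score])
# prints roughly: ['霸王别姬', 'https://example.com/poster.jpg', '主演：张国荣,张丰毅,巩俐',
#                  '上映时间：1993-01-01', '9.5']

These are the same five fields, in the same order, that Get_data joins with "$" and then splits back into CSV rows.
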
Using regular expressions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Date: 2019/11/7 0007 14:18
# Author: Mijiu
# Version: 1.0
import csv
import os
import re

import requests


# Fetch one page of the Maoyan Top 100 board by Rosny 2019-11-7
def Get_page(num=0):
    pr = {
        "offset": num * 10
    }
    url = 'https://maoyan.com/board/4'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    }
    response = requests.get(url, headers=header, params=pr)
    return response.text


# Parse the page with a regular expression by Rosny 2019-11-7
def Get_data(data):
    reobj = re.findall(r'<dd.*?title="(.*?)".*?data-src="(.*?)".*?star">(.*?)<.*?time">(.*?)<.*?ger">(.*?)<.*?tion">(\d)', data, re.S)
    return reobj


# Data cleaning by Rosny 2019-11-7
def Data_cleaning(data):
    data_lis = list()
    for i in data:
        data_lis.append(list(i))
    for j in data_lis:
        j[2] = j[2].strip()   # strip whitespace around the cast string
        j[4] = j[4] + j[5]    # join the integer and decimal parts of the rating
        j.pop()               # drop the now-redundant decimal field
    return data_lis


# Store the cleaned data as CSV
def Csv_data(data):
    with open("file.csv", "a", newline="", encoding="utf-8") as cf:
        wf = csv.writer(cf)
        wf.writerows(data)


# Download the poster images
def Get_img():
    data = list()
    for i in range(10):
        for j in Data_cleaning(Get_data(Get_page(i))):
            data.append(j)
    os.makedirs("img", exist_ok=True)  # make sure the output directory exists
    num = 1
    for i in data:
        response = requests.get(i[1])
        with open(f"img/{num}.jpg", "wb") as f:
            f.write(response.content)
        num += 1


# Main entry point
def main():
    for i in range(10):
        Csv_data(Data_cleaning(Get_data(Get_page(i))))


if __name__ == '__main__':
    # main()
    Get_img()
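
As with the XPath version, the regular expression can be tested offline. The snippet below is a minimal sketch, not part of the original script: it runs the same pattern against an assumed, hand-written <dd> block (the markup is modelled on the groups the pattern expects, not copied from the real page) to show what the raw capture tuples look like before Data_cleaning runs.

# Offline check of the regular expression used in the regex version of Get_data.
# The sample markup is an assumed, simplified stand-in for one <dd> entry.
import re

sample = '''
<dd>
  <a title="霸王别姬" href="/films/1203">
    <img class="board-img" alt="霸王别姬" data-src="https://example.com/poster.jpg"/>
  </a>
  <p class="star">
    主演：张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <i class="integer">9.</i><i class="fraction">5</i>
</dd>
'''

pattern = r'<dd.*?title="(.*?)".*?data-src="(.*?)".*?star">(.*?)<.*?time">(.*?)<.*?ger">(.*?)<.*?tion">(\d)'
rows = re.findall(pattern, sample, re.S)
print(rows)
# prints roughly: [('霸王别姬', 'https://example.com/poster.jpg',
#                   '\n    主演：张国荣,张丰毅,巩俐\n  ',
#                   '上映时间：1993-01-01', '9.', '5')]
# Data_cleaning then strips the whitespace around the cast field
# and merges '9.' + '5' into a single '9.5' rating column.
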