欢迎您访问程序员文章站。本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

(爬虫基础)爬取猫眼Top100电影

程序员文章站 2022-05-02 16:55:57
...
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
# Date: 2019/11/11 0011 12:40
# Author: Mijiu
# Version: 1.0

import requests,re,csv
from lxml import etree

# 获取页面源码 (猫眼电影top100) by Rosny 2019-11-11
def Get_page(num=0):
    """Fetch one page of the Maoyan Top100 board and return its HTML text.

    The board is paginated 10 movies per page; ``num`` selects the page
    (offset = num * 10).  A browser User-Agent is sent because the site
    rejects the default requests UA.
    """
    response = requests.get(
        'https://maoyan.com/board/4',
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
        },
        params={"offset": num * 10},
    )
    return response.text

# 分析数据 (Xpath验证) 顺手清理数据 by Rosny 2019-11-11
def Get_data(data):
    """Parse one Maoyan board page into cleaned movie rows.

    Returns a list of 5-element lists:
    [title, poster_url, star_line, release_time, score_string].

    Fixes over the original:
    - iterates over the actual number of movies found instead of a
      hard-coded ``range(10)``, so a short (or anti-bot) page no longer
      raises IndexError;
    - builds the rows directly rather than joining fields with ``$`` and
      splitting again, which corrupted any field containing a ``$``.
    """
    html = etree.HTML(data)
    # One <img class="board-img"> per movie; alt = title, data-src = poster.
    imgs = html.xpath('//body//img[@class="board-img"]')
    # <p class="star"> holds the cast line; its following sibling <p>
    # holds the release time.
    stars = html.xpath('//p[@class="star"]')
    # Score is split across two <i> elements: integer part + fraction part.
    grade_int = html.xpath('//i[@class="integer"]/text()')
    grade_frac = html.xpath('//i[@class="fraction"]/text()')

    rows = []
    for idx, img in enumerate(imgs):
        title = img.xpath("./@alt")[0]
        poster = img.xpath("./@data-src")[0]
        star = stars[idx].xpath("./text()")[0].strip()
        release = stars[idx].xpath("./following-sibling::p[1]")[0].xpath("./text()")[0]
        grade = grade_int[idx] + grade_frac[idx]
        rows.append([title, poster, star, release, grade])
    return rows


# CSV格式存储清洗过后的数据
# CSV格式存储清洗过后的数据
def Csv_data(data):
    """Append cleaned movie rows to maoyan_top100/top100.csv.

    ``data`` is an iterable of row lists as produced by Get_data().
    The target directory must already exist.
    """
    # newline="" stops the csv module from emitting a blank line after
    # every row on Windows; utf-8 keeps the Chinese titles readable.
    with open("maoyan_top100/top100.csv", "a", newline="", encoding="utf-8") as cf:
        csv.writer(cf).writerows(data)

# 获取图片
# 获取图片
def Get_img():
    """Download every Top100 poster image into maoyan_top100/img/.

    Re-fetches all 10 board pages, then saves each poster as
    ``<movie title>.jpg``.  Fixes over the original: the dead ``num``
    counter (incremented but never used) is removed, and the output
    directory is created up front instead of crashing on first run.
    """
    import os  # local import: keeps this self-contained script's header unchanged

    rows = list()
    for page in range(10):
        for row in Get_data(Get_page(page)):
            rows.append(row)
    os.makedirs("maoyan_top100/img", exist_ok=True)
    for row in rows:
        # row[0] is the movie title, row[1] the poster URL.
        response = requests.get(row[1])
        with open(f"maoyan_top100/img/{row[0]}.jpg", "wb") as f:
            f.write(response.content)

# 主函数
def main():
    """Scrape all 10 pages of the Maoyan Top100 board: append the parsed
    rows to the CSV file, then download every poster image.

    NOTE(review): Get_img() re-fetches all 10 pages itself, so every page
    is downloaded twice per run — consider reusing the rows collected in
    the loop below.
    """
    for i in range(10):
        # Each page holds 10 movies; Get_page turns i into offset=i*10.
        Csv_data(Get_data(Get_page(i)))
    Get_img()

if __name__ == '__main__':

    main()

使用正则

#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
# Date: 2019/11/7 0007 14:18
# Author: Mijiu
# Version: 1.0

import requests,re,csv

# 获取页面源码 (猫眼电影top100) by Rosny 2019-11-7
def Get_page(num=0):
    """Fetch one page of the Maoyan Top100 board and return its HTML text.

    The board shows 10 movies per page; ``num`` selects the page via the
    ``offset`` query parameter (offset = num * 10).  A browser User-Agent
    is sent because the site blocks the default requests UA.
    """
    response = requests.get(
        'https://maoyan.com/board/4',
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
        },
        params={"offset": num * 10},
    )
    return response.text

# 分析数据 (正则验证) by Rosny 2019-11-7
# 分析数据 (正则验证) by Rosny 2019-11-7
def Get_data(data):
    """Extract movie records from a Maoyan board page with one regex.

    Returns a list of 6-tuples:
    (title, img_url, star_line, release_time, score_integer_part,
    score_fraction_digit).

    Fix: the pattern is now a raw string — the original non-raw ``'\\d'``
    is an invalid escape sequence (DeprecationWarning, and a SyntaxWarning
    on newer Python versions).
    """
    # re.S lets ".*?" span line breaks in the HTML source.
    pattern = (r'<dd.*?title="(.*?)".*?data-src="(.*?)".*?star">(.*?)<'
               r'.*?time">(.*?)<.*?ger">(.*?)<.*?tion">(\d)')
    return re.findall(pattern, data, re.S)

# 数据清洗  by Rosny 2019-11-7
# 数据清洗  by Rosny 2019-11-7
def Data_cleaning(data):
    """Normalise raw regex tuples into CSV-ready rows.

    Each input item is a 6-tuple (title, img_url, star_line, release_time,
    score_integer_part, score_fraction_digit); the output row is a
    5-element list with the star field stripped of surrounding whitespace
    and the two score parts joined into a single string.
    """
    return [
        [title, img, star.strip(), show_time, whole + frac]
        for title, img, star, show_time, whole, frac in data
    ]

# CSV格式存储清洗过后的数据
# CSV格式存储清洗过后的数据
def Csv_data(data):
    """Append cleaned movie rows to file.csv in the working directory.

    ``data`` is an iterable of row lists as produced by Data_cleaning().
    """
    # newline="" stops the csv module from writing a blank line after
    # every row on Windows; utf-8 keeps the Chinese titles readable.
    with open("file.csv", "a+", newline="", encoding="utf-8") as cf:
        csv.writer(cf).writerows(data)

# 获取图片
# 获取图片
def Get_img():
    """Download every Top100 poster image into img/ as 1.jpg … 100.jpg.

    Re-fetches all 10 board pages, then saves each poster numbered by
    board position.  Fix: the ``img/`` directory is created up front —
    the original crashed with FileNotFoundError on a fresh checkout.
    """
    import os  # local import: keeps this self-contained script's header unchanged

    rows = []
    for page in range(10):
        rows.extend(Data_cleaning(Get_data(Get_page(page))))
    os.makedirs("img", exist_ok=True)
    for num, row in enumerate(rows, start=1):
        # row[1] is the poster URL extracted by the regex.
        response = requests.get(row[1])
        with open(f"img/{num}.jpg", "wb") as f:
            f.write(response.content)

# 主函数
def main():
    """Scrape all 10 pages of the Maoyan Top100 board and append the
    cleaned rows to file.csv.

    NOTE(review): the guard below has main() commented out and calls
    Get_img() directly, so running this script downloads images only and
    never writes the CSV — confirm that is intentional.
    """
    for i in range(10):

        # Each page holds 10 movies; Get_page turns i into offset=i*10.
        Csv_data(Data_cleaning(Get_data(Get_page(i))))


if __name__ == '__main__':

    # main()
    Get_img()
相关标签: python