欢迎您访问程序员文章站！本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

爬取猫眼电影榜单100

程序员文章站 2022-05-02 16:56:21
...

本文代码是我初学时根据崔老师的教程编写的。

import json
import requests
from requests.exceptions import RequestException
import re
# Fetch the HTML source of a page.
def get_url(url, timeout=10):
    """Download ``url`` and return the response body as text.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, ``None`` for any
        other status code, and ``False`` when the request raises a
        ``RequestException`` (timeouts included).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Network-level failure (connection error, timeout, ...).
        return False
    if response.status_code == 200:
        return response.text
    return None
# Parse the board entries out of the page with a regular expression.
def get_content(html):
    """Yield one dict per ``<dd>...</dd>`` entry matched in ``html``.

    Args:
        html: Page source as a string. A falsy value (``None``, ``False``
            or an empty string, e.g. the result of a failed download)
            produces no entries instead of raising ``TypeError``.

    Yields:
        A dict with the keys '编号', '图片', '片名', '主演', '时间' and
        '评分', all holding strings captured from the markup.
    """
    if not html:
        return
    pattern = re.compile(
        r'<dd>.*?board-index.*?>'
        r'(.*?)</i>.*?data-src="(.*?)"'
        r'.*?><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">'
        r'(.*?)</p>.*?integer">(.*?)'
        r'</i>.*?fraction">(.*?)</i>'
        r'.*?</dd>',
        re.S,  # entries span several lines, so '.' must match newlines
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            '编号': index,
            '图片': image,
            '片名': title,
            # Drop the first three characters after trimming whitespace
            # (presumably the "主演：" label -- confirm against the page).
            '主演': star.strip()[3:],
            # Drop the first five characters after trimming whitespace
            # (presumably the "上映时间：" label -- confirm against the page).
            '时间': release.strip()[5:],
            # The score is split into integer and fraction parts in the markup.
            '评分': integer + fraction,
        }
# Save one record to the output file.
def get_txt(content):
    """Append ``content`` as one JSON line to '猫眼Top100.txt'.

    Args:
        content: A JSON-serialisable object. It is written with
            ``ensure_ascii=False`` so non-ASCII text stays readable.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open('猫眼Top100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Fetch one board page, then print and save every entry parsed from it.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    # The literal must end in "offset=": a trailing "0" here would produce
    # "offset=00", "offset=010", ... instead of "offset=0", "offset=10", ...
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_url(url)
    if not html:
        # get_url returns None/False on failure; skip this page rather than
        # feeding a non-string into the regex parser.
        print('Failed to fetch ' + url)
        return
    for movie in get_content(html):
        print(movie)
        get_txt(movie)
if __name__ == '__main__':
    # Request the board at offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(page_offset)
相关标签: python