欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Python爬取豆瓣电影TOP250,结果输出为Excel,保存海报图片。

程序员文章站 2022-05-02 17:46:27
...

使用requests获取网页内容,beautifulsoup煲汤,xlsxwriter导出excel表格。完全是新手练习作品,欢迎交流。

import requests as req
import bs4
import xlsxwriter as xs
import os

def get_content(page_url):  # fetch the pages
    """Download every URL in *page_url* and return the responses.

    page_url: iterable of URL strings.
    Returns: list of requests.Response objects, in the same order.
    """
    # Custom User-Agent — presumably so Douban does not reject the
    # request as a bot (TODO confirm against the site's behavior).
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36'}
    # Comprehension replaces the manual append loop; the redundant
    # `url = page_url` alias of the original is gone.
    return [req.get(each, headers=headers) for each in page_url]

def analysis_content(html):  # parse the pages
    """Extract movie data from the downloaded Douban pages.

    html: list of response objects, each exposing the page HTML via
        ``.text`` (as produced by ``get_content``).
    Returns: dict mapping movie name -> [rating, quote].
    Side effect: (re)populates the module-level ``image_url`` dict
        (movie name -> poster URL), consumed later by ``image_save``.
    """
    global image_url
    image_url = {}   # poster name -> poster URL
    movies_all = {}
    for page in html:
        # Parse the HTML string (.text), not the response object itself.
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        # One div.info per movie holds name / rating / quote.
        for info in soup.find_all('div', class_='info'):
            name = info.a.span.text
            star = info.find('span', class_='rating_num')  # rating
            quote = info.find('span', class_='inq')        # one-line review
            # Some entries have no quote; test for None explicitly instead
            # of the original bare `except`, which also hid real bugs.
            quote_text = quote.text if quote is not None else ' '
            # Build the row directly — no shared list to reset each pass.
            movies_all[name] = [star.text, quote_text]

        # Poster <img> tags live under div.item, not div.info.
        for item in soup.find_all('div', class_='item'):
            image_url[item.img['alt']] = item.img['src']

    return movies_all

def how_many_pages(pages=None):   # scraping depth
    """Build the list of TOP250 page URLs to scrape.

    pages: number of 25-movie pages to fetch (the site has 10).  When
        None (the default, preserving the original interactive
        behavior) the user is prompted on stdin.
    Returns: list of page URL strings.
    """
    if pages is None:
        pages = input('How many pages do you want to scrap?(total:10):')
    # Each page lists 25 movies; `start` is the 0-based movie offset.
    return ['https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
            for i in range(int(pages))]

def excel_export(movies_all):  # export to Excel
    """Write the scraped movies to ``Top250Movies.xlsx``.

    movies_all: dict mapping movie name -> [rating, quote]; its
        insertion order is taken as the ranking order.
    """
    workbook = xs.Workbook('Top250Movies.xlsx')
    worksheet = workbook.add_worksheet()
    # Real booleans instead of the strings 'True'/'Ture' of the original
    # (the typo happened to work because any non-empty string is truthy).
    text_format = workbook.add_format({'center_across': True, 'text_wrap': True})
    capture_format = workbook.add_format({'bold': True, 'center_across': True})
    worksheet.set_column(1, 5, 20)

    # Header row (captions: rank / name / rating / quote).
    worksheet.write('A1', '排名', capture_format)
    worksheet.write('B1', '名字', capture_format)
    worksheet.write('C1', '评分', capture_format)
    worksheet.write('D1', '评论', capture_format)

    # Data rows start on spreadsheet row 1 (second visible row); the
    # 1-based rank always equals the row index, so one counter suffices.
    for row, (name, (rating, quote)) in enumerate(movies_all.items(), start=1):
        worksheet.write(row, 0, row, text_format)    # rank
        worksheet.write(row, 1, name, text_format)
        worksheet.write(row, 2, rating, text_format)
        worksheet.write(row, 3, quote, text_format)
    workbook.close()

def image_save():
    """Download every poster listed in the module-level ``image_url``.

    Creates an output folder, chdirs into it (preserved from the
    original behavior), and writes one ``<movie name>.jpg`` per entry.
    """
    # Find an unused folder name: '豆瓣top250', then '豆瓣top250(1)', ...
    # The original only handled a single collision and never actually
    # created the fallback folder, so os.chdir crashed; retry in a loop
    # and catch only FileExistsError rather than a bare except.
    folder = '豆瓣top250'
    g = 1
    while True:
        try:
            os.mkdir(folder)
            break
        except FileExistsError:
            folder = '豆瓣top250(' + str(g) + ')'
            g += 1
    os.chdir(folder)

    for name in image_url:
        # Fetch first, then write — so a failed download does not leave
        # an empty .jpg behind from a pre-opened file handle.
        img = req.get(image_url[name])
        with open(name + '.jpg', 'wb') as f:
            f.write(img.content)

            

def main():
    """Drive the full pipeline: scrape, export to Excel, save posters."""
    urls = how_many_pages()
    responses = get_content(urls)
    movie_table = analysis_content(responses)
    excel_export(movie_table)
    print('表格输出完成!')
    image_save()
    print('图片输出完成!')
    

if __name__ == "__main__":
    main()