Python爬取豆瓣电影TOP250,结果输出为Excel,保存海报图片。
程序员文章站
2022-05-02 17:46:27
...
使用requests获取网页内容,beautifulsoup煲汤,xlsxwriter导出excel表格。完全是新手练习作品,欢迎交流。
import requests as req
import bs4
import xlsxwriter as xs
import os
def get_content(page_url):
    """Download every TOP250 page and return the responses.

    Args:
        page_url: iterable of page URLs to fetch.

    Returns:
        list of requests.Response objects, in input order.
    """
    # Mobile UA so Douban serves the page content to a non-browser client.
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36'}
    responses = []
    for url in page_url:
        # timeout added: requests.get without one can block forever on a
        # stalled connection
        responses.append(req.get(url, headers=headers, timeout=10))
    return responses
def analysis_content(html):
    """Parse the downloaded pages into movie data.

    Args:
        html: list of requests.Response objects (one per TOP250 page).

    Returns:
        dict mapping movie name -> [rating, quote].

    Side effects:
        Rebuilds the module-level ``image_url`` dict (poster alt-text ->
        poster src URL), consumed later by ``image_save``.
    """
    global image_url
    image_url = {}    # poster name -> poster image URL
    movies_all = {}   # movie name  -> [rating, quote]
    for page in html:
        # parse the response body (.text), not the Response object itself
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        for info in soup.find_all('div', class_='info'):
            name = info.a.span.text
            rating = info.find('span', class_='rating_num').text  # score
            quote_tag = info.find('span', class_='inq')  # one-line review
            # Some movies have no quote; explicit None check replaces the
            # original bare except, which could mask unrelated errors.
            quote = quote_tag.text if quote_tag is not None else ' '
            movies_all[name] = [rating, quote]
        # collect poster URLs for the later download step
        for item in soup.find_all('div', class_='item'):
            image_url[item.img['alt']] = item.img['src']
    return movies_all
def how_many_pages():
    """Ask the user for the crawl depth and build the page URLs.

    Returns:
        list of TOP250 page URLs; each page lists 25 movies, so the
        start offsets are 0, 25, 50, ...
    """
    pages = input('How many pages do you want to scrap?(total:10):')
    # comprehension replaces the append loop; the original also called
    # int(i) on an already-int loop variable
    return ['https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
            for i in range(int(pages))]
def excel_export(movies_all):
    """Write the scraped movies to ``Top250Movies.xlsx``.

    Args:
        movies_all: dict mapping movie name -> [rating, quote].

    One movie per row; columns are rank / name / rating / quote.
    """
    workbook = xs.Workbook('Top250Movies.xlsx')
    worksheet = workbook.add_worksheet()
    # Booleans, not strings: the original passed 'True' and the typo
    # 'Ture' — any non-empty string is truthy so it happened to work,
    # but the XlsxWriter format API expects real bools.
    text_format = workbook.add_format({'center_across': True, 'text_wrap': True})
    capture_format = workbook.add_format({'bold': True, 'center_across': True})
    worksheet.set_column(1, 5, 20)  # widen columns B..F
    for col, title in enumerate(('排名', '名字', '评分', '评论')):
        worksheet.write(0, col, title, capture_format)
    # Data rows start on the second spreadsheet row; dicts preserve
    # insertion order, so enumerate doubles as the rank counter.
    for rank, name in enumerate(movies_all, start=1):
        worksheet.write(rank, 0, rank, text_format)
        worksheet.write(rank, 1, name, text_format)
        worksheet.write(rank, 2, movies_all[name][0], text_format)  # rating
        worksheet.write(rank, 3, movies_all[name][1], text_format)  # quote
    workbook.close()
def image_save():
    """Download every poster listed in the module-level ``image_url`` dict.

    Creates a folder named '豆瓣top250' (or '豆瓣top250(N)' if that name is
    taken) and saves one ``<movie>.jpg`` per entry into it.
    """
    folder = '豆瓣top250'
    suffix = 1
    # Keep trying numbered names until mkdir succeeds. The original only
    # renamed once inside a bare except and never actually created the
    # fallback folder, so the following os.chdir would crash.
    while True:
        try:
            os.mkdir(folder)
            break
        except FileExistsError:
            folder = '豆瓣top250(' + str(suffix) + ')'
            suffix += 1
    os.chdir(folder)
    for name in image_url:
        with open(name + '.jpg', 'wb') as f:
            # timeout added so one dead poster URL cannot hang the run
            f.write(req.get(image_url[name], timeout=10).content)
def main():
    """Drive the crawl: fetch the pages, parse them, export the Excel
    sheet, then download the posters."""
    urls = how_many_pages()
    responses = get_content(urls)
    movie_data = analysis_content(responses)
    excel_export(movie_data)
    print('表格输出完成!')
    image_save()
    print('图片输出完成!')


if __name__ == "__main__":
    main()