爬取豆瓣电影排行top250
程序员文章站
2022-06-06 22:02:20
功能描述: 爬取豆瓣电影排行top250 使用的库 1、time 2、json 3、requests 4、BeautifulSoup 5、RequestException 上机实验室: 补充说明: 1、 ......
功能描述:
爬取豆瓣电影排行top250
使用的库
1、time
2、json
3、requests
4、BeautifulSoup
5、RequestException
上机实验室:
"""
作者:李舵
日期:2019-4-27
功能:抓取豆瓣电影top250
版本:v1.0
"""
import json
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_one_page(url, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException``.
    """
    # Browser-like User-Agent sent in place of the requests default.
    headers = {'user-agent': 'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/74.0.3729.108 safari/537.36'}
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Yield one dict per movie entry found in a list page.

    Args:
        html: HTML text of a list page. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        A dict with the keys ``'index'``, ``'title'``, ``'actor'`` and
        ``'score'``. Each value is the stripped text of the first matching
        element inside one ``<li>`` of the ``<ol class="grid_view">`` list.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    ol_list = soup.find('ol', {'class': 'grid_view'})
    if ol_list is None:
        # The page does not contain the expected list; nothing to yield.
        return
    # Iterate over the entries actually present instead of assuming a fixed
    # count, so a short or empty page does not raise IndexError.
    for movie in ol_list.find_all('li'):
        yield {
            'index': movie.find('em', {'class': ''}).text.strip(),
            'title': movie.find('span', {'class': 'title'}).text.strip(),
            'actor': movie.find('p', {'class': ''}).text.strip(),
            'score': movie.find('span', {'class': 'rating_num'}).text.strip(),
        }
def write_to_file(content):
    """Append one record to ``result.txt`` as a single JSON line.

    Args:
        content: A JSON-serialisable object (the caller passes one movie
            dict per call).

    The file is opened in append mode with UTF-8 encoding in the current
    working directory, and non-ASCII characters are written as-is rather
    than as ``\\uXXXX`` escapes.
    """
    # Serialise once; the earlier debug print of the dump's type is dropped.
    line = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def main(start):
    """Fetch, parse, print and save one list page.

    Args:
        start: Value appended to the URL as the ``start`` query parameter;
            the script entry point passes multiples of 25.
    """
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None; skip this page
        # instead of handing None to the parser.
        print('failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # 250 entries at 25 per page: start offsets 0, 25, ..., 225 (10 pages).
    # The previous range(11) also requested start=250, one page past the end.
    for offset in range(0, 250, 25):
        main(start=offset)
        # Pause between consecutive requests.
        time.sleep(1)
补充说明:
1、