爬取猫眼电影榜单100
程序员文章站
2022-05-02 16:56:21
...
初学，根据崔老师的教程写的。
import json
import requests
from requests.exceptions import RequestException
import re
# Fetch the page source
def get_url(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, ``None`` for any
        other status code, and ``False`` when the request raises a
        ``RequestException``.
    """
    # A browser-like User-Agent header is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return False
# Parse the page content with a regular expression
def get_content(html):
    """Yield one dict per ``<dd>`` entry matched in *html*.

    Args:
        html: Page source as a string. A falsy value (``None``, ``False``
            or an empty string) produces no items instead of raising
            inside ``re``.

    Yields:
        A dict with the keys '编号', '图片', '片名', '主演', '时间' and
        '评分', filled from the seven regex groups in order. The last two
        groups (integer and fraction parts) are concatenated into '评分'.
    """
    if not html:
        return
    # re.S lets "." span newlines, since one <dd> entry covers many lines.
    pattern = re.compile(r'<dd>.*?board-index.*?>'
                         r'(.*?)</i>.*?data-src="(.*?)"'
                         r'.*?><a.*?>(.*?)</a>.*?star">'
                         r'(.*?)</p>.*?releasetime">'
                         r'(.*?)</p>.*?integer">(.*?)'
                         r'</i>.*?fraction">(.*?)</i>'
                         r'.*?</dd>', re.S)
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            '编号': index,
            '图片': image,
            '片名': title,
            # Drops the first 3 characters after stripping whitespace
            # (presumably a "主演：" label -- confirm against the page markup).
            '主演': star.strip()[3:],
            # Drops the first 5 characters after stripping whitespace
            # (presumably a "上映时间：" label -- confirm against the page markup).
            '时间': release.strip()[5:],
            '评分': integer + fraction,
        }
# Save to file
def get_txt(content, path='猫眼Top100.txt'):
    """Append *content* to a text file as one line of JSON.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; it is opened in append mode with UTF-8
            encoding.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Fetch one board page, then print and save every entry found on it.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    # The offset goes directly after "offset="; a stray "0" in the literal
    # used to turn 10 into "010" and 0 into "00".
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_url(url)
    if not html:
        # get_url returns None or False when the page could not be fetched;
        # skip this page rather than passing that on to the parser.
        print('Failed to fetch ' + url)
        return
    for item in get_content(html):
        print(item)
        get_txt(item)
if __name__ == '__main__':
    # Call main() once for each offset 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)