Python:爬取猫眼电影 TOP100 榜单
程序员文章站
2022-05-02 18:05:42
...
'''
爬取猫眼电影上TOP100的电影
技术路线:requests bs4 re
'''
import requests
from bs4 import BeautifulSoup
import re
def getHTML(url):  # fetch one page
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (network error, timeout, or a non-success HTTP status). A failure
        message is printed in that case.
    """
    hd = {'User-Agent': 'Mozilla/5.0'}  # present a browser-like User-Agent
    try:
        # A timeout keeps a stalled connection from blocking the run forever.
        r = requests.get(url, headers=hd, timeout=10)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        # Decode using the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here; KeyboardInterrupt
        # and programming errors are allowed to propagate.
        print('爬取失败')
        return None
def parsePage(ulist, html):  # parse one HTML page
    """Extract the film records from one board page and append them to ``ulist``.

    Each appended record is a list ``[rank, title, actors, release]``:
    the rank comes from a regular expression over the raw HTML, the other
    three fields come from ``<p>`` tags with class ``name``, ``star`` and
    ``releasetime`` respectively.

    Args:
        ulist: List that receives the records (mutated in place).
        html: Page source as a string. ``None`` or an empty string is
            accepted and yields no records.

    Returns:
        None.
    """
    if not html:
        # A failed download produces no HTML; there is nothing to parse.
        return

    soup = BeautifulSoup(html, "html.parser")

    # Film titles: <p class="name"> elements.
    films = soup.find_all('p', attrs={'class': 'name'})

    # Ranks: the capture group makes findall return only the digits that
    # follow "board-index-" rather than the whole match.
    rank = re.findall(r'board-index-(\d*)', html, re.S)

    # Leading actors: <p class="star"> elements. The text contains newlines
    # and spaces, which are stripped so it can be printed on one line.
    actors = []
    for item in soup.find_all('p', attrs={'class': 'star'}):
        # .string is None when the tag does not hold exactly one text node.
        text = item.string or ''
        actors.append(text.replace('\n', '').replace(' ', ''))

    # Release date / region: <p class="releasetime"> elements.
    rate = soup.find_all('p', attrs={'class': "releasetime"})

    # zip stops at the shortest sequence, so a page that is missing one of
    # the fields cannot trigger an IndexError.
    for number, film, actor, release in zip(rank, films, actors, rate):
        ulist.append([number, film.string, actor, release.string])
def PrintList(ulist):
    """Print a header line followed by one aligned line per record.

    Args:
        ulist: Iterable of records; the first four elements of each record
            are printed as rank, film, actors and release date.
    """
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character for the three text columns.
    fill = chr(12288)
    row_format = '{0:<4}\t{1:{4}<15}\t{2:{4}<30}{3:{4}<50}'

    def show(rank, film, actors, released):
        # Render a single row with the shared template and fill character.
        print(row_format.format(rank, film, actors, released, fill))

    show('排名', "电影", '主演', '上映日期')
    for record in ulist:
        show(record[0], record[1], record[2], record[3])
def Savetxt(ulist):  # save as a txt file
    """Write the records to ``films.txt`` as comma-separated lines.

    The file is opened in write mode with GBK encoding, so existing content
    is replaced. Only the first four elements of each record are written.

    Args:
        ulist: Iterable of records with at least four elements each.
    """
    with open('films.txt', 'w', encoding="gbk") as out:
        for record in ulist:
            rank, film, actors, released = record[0], record[1], record[2], record[3]
            # print() appends the line terminator after the formatted row.
            print('{},{},{},{}'.format(rank, film, actors, released), file=out)
def SaveExcle(ulist, filename='films.xls'):  # save as an xls file
    """Write the records as tab-separated text to ``filename``.

    The output is plain GBK-encoded text with one record per line and a tab
    after every cell (including the last one); it is not a binary Excel
    workbook. The file is opened in write mode, so existing content is
    replaced.

    Args:
        ulist: Iterable of records; every element of a record is converted
            with ``str`` and written as one cell.
        filename: Destination path. Defaults to ``films.xls`` (the previous
            hard-coded name ``fimls.xls`` was a typo).
    """
    with open(filename, 'w', encoding='gbk') as f:
        for row in ulist:
            # The tab after each cell moves to the next spreadsheet column.
            f.write(''.join(str(cell) + '\t' for cell in row))
            f.write('\n')
def main():
    """Scrape the ten board pages, then print and save the collected records.

    Pages are requested with offsets 0, 10, ..., 90. Pages that fail to
    download are skipped. The accumulated records are passed to PrintList,
    SaveExcle and Savetxt.
    """
    # The offset is appended to this prefix. The prefix must end in
    # "offset=" (not "offset=0"), otherwise the requests become
    # offset=00, offset=010, ... instead of offset=0, offset=10, ...
    base_url = 'http://maoyan.com/board/4?offset='
    ulist = []

    for offset in range(0, 100, 10):
        html = getHTML(base_url + str(offset))
        if not html:
            # getHTML returns None on failure and has already reported it.
            continue
        parsePage(ulist, html)

    PrintList(ulist)
    SaveExcle(ulist)
    Savetxt(ulist)
# Module-level entry point: main() runs whenever this file is executed or imported.
main()
要学会灵活运用 BeautifulSoup 与正则表达式。
上一篇: PHP常用函数备用