用Python爬虫实现爬取豆瓣电影Top250
程序员文章站
2022-11-28 09:33:30
用Python爬虫实现爬取豆瓣电影Top250
#爬取 豆瓣电影Top250
#250个电影 ,分为10个页显示,1页有25个电影
import urllib.request
用Python爬虫实现爬取豆瓣电影Top250
#爬取 豆瓣电影Top250 #250个电影 ,分为10个页显示,1页有25个电影 import urllib.request from bs4 import BeautifulSoup url = "https://movie.douban.com/top250" headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'} #headers 要根据自己的网页抓取情况修改 targetPath = "storage path" #填写自己想要存储的地址 def saveText(f,text): f.write(text) #获取网页源码 def getData(url,headers): req = urllib.request.Request(url = url , headers = headers) res = urllib.request.urlopen(req) data = res.read() return data #解析网页 def praseHtml(f,url,headers): currenturl = url i = 1 #序号 #flag = True while currenturl : #解析当前页,获取想要的内容 html = getData(currenturl,headers) soup = BeautifulSoup(html,'lxml') moveList = soup.find('ol',attrs = {'class':'grid_view'}) for moveLi in moveList.find_all('li'): detail = moveLi.find('p',attrs = {'class':'hd'}) moveName = detail.find('span',attrs = {'class':'title'}) saveText(f,str(i)+ moveName.getText()+'\n') i += 1 print(moveName.getText()) #下一页 nextpage = soup.find('span',attrs = {'class':'next'}).find('a') #next = nextpage['href'] #这样写报错:NoneType object is not subscriptable if nextpage: currenturl = url + nextpage['href'] else : currenturl = None f = open(targetPath,"w") praseHtml(f,url,headers)
下一篇: Python编程之yaml文件读写教程