欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页  >  后端开发

Python抓取电影天堂电影信息的代码

程序员文章站 2022-06-09 21:56:01
...
运行环境:Python 2.7,Mac OS

抓取的是电影天堂里面最新电影的页面。链接地址: http://www.dytt8.net/html/gndy/dyzz/index.html

获取页面中的电影详情页链接

import urllib2
import os
import re
import string


# Collection of movie URLs
movieUrls = []


# Fetch the movie list
def queryMovieList():
    """Download the index page and append movie URLs to ``movieUrls``.

    The page is fetched with ``urllib2``, decoded as GB2312 and re-encoded
    as UTF-8 (undecodable bytes are ignored). A first regex is applied to
    the page and its captures are joined into one string; a second regex is
    applied to that string, and the first capture group of every match is
    prefixed with ``http://www.dytt8.net`` and appended to the module-level
    ``movieUrls`` list. Nothing is returned.

    NOTE(review): both regex literals are reproduced from the text that
    survives in this copy of the file. They look like they have lost their
    HTML tag anchors (the first literal was also split across several
    physical lines, which is a syntax error). As written they will not
    isolate real links -- restore the anchors from the live page markup
    before relying on this function.
    """
    url = 'http://www.dytt8.net/html/gndy/dyzz/index.html'

    # urllib2 responses are not context managers on Python 2, so close
    # the connection explicitly even if read() raises.
    response = urllib2.urlopen(url)
    try:
        raw_page = response.read()
    finally:
        response.close()
    page = raw_page.decode('gb2312', 'ignore').encode('utf-8', 'ignore')

    # NOTE(review): pattern text is incomplete -- see the docstring.
    section_pattern = re.compile('.*?>' + '(.*?) ', re.S)
    sections = re.findall(section_pattern, page)
    # Named ``listing`` rather than ``str`` to avoid shadowing the builtin.
    listing = ''.join(sections)

    # NOTE(review): pattern text is incomplete -- see the docstring.
    link_pattern = re.compile('(.*?).*?(.*?)', re.S)
    for match in re.findall(link_pattern, listing):
        # The first capture group is used as the site-relative path.
        movieUrls.append('http://www.dytt8.net' + match[0])

抓取详情页中的电影数据

def queryMovieInfo(movieUrls):

 for index, item in enumerate(movieUrls):

 print('电影URL: ' + item)

 conent = urllib2.urlopen(item)
 conent = conent.read()
 conent = conent.decode('gb2312','ignore').encode('utf-8','ignore') 


 movieName = re.findall(r'

(.*?)

', conent, re.S) if (len(movieName) > 0): movieName = movieName[0] + "" # 截取名称 movieName = movieName[movieName.find("《") + 3:movieName.find("》")] else: movieName = "" print("电影名称: " + movieName.strip()) movieContent = re.findall(r'
(.*?)',conent , re.S) pattern = re.compile('
相关标签: Python 电影天堂