Python爬虫: 用urllib2+beautifulsoup写的抓取网页内容的示例
程序员文章站
2022-05-04 11:21:54
...
BeautifulSoup是一个可以解析HTML或XML内容的Python库,和Java的dom4j有点类似。用urllib2抓取到网页的HTML源码之后,调用BeautifulSoup的API就可以解析HTML,比自己写正则表达式简单高效得多。
安装BeautifulSoup: sudo apt-get install python-bs4 (注意是:python-bs4 )
检查是否安装成功: 在Python解释器中执行 from bs4 import BeautifulSoup ,没有报错即表示安装成功
代码:
# coding=utf-8
import io
import os
import re
import sys
import urllib2

from bs4 import BeautifulSoup
# Work around garbled Chinese output by switching the default string encoding
# to UTF-8. Chinese text handed to write() must first be converted to unicode,
# e.g. write(unicode(text)).
# NOTE(review): presumably reload(sys) is what makes setdefaultencoding
# callable again at this point -- confirm before touching these two lines.
reload(sys)
sys.setdefaultencoding('utf-8')
class Spider(object):
    '''
    Simple crawler that collects the entry titles listed under the
    "good-night quotes" tag of www.vikilife.com.

    Each listing page is downloaded, parsed with BeautifulSoup, and the
    titles found on it are written to ``log/wananxinyu_<page>.log`` below the
    current working directory.
    '''

    def __init__(self):
        # Listing-page URL prefix (the tag name is percent-encoded); the page
        # number is appended to it by load_url().
        self.url = 'http://www.vikilife.com/tag/%E6%99%9A%E5%AE%89%E5%BF%83%E8%AF%AD/page/'

    def load_url(self, pageNum):
        '''
        Download one listing page, extract its titles and log them.

        pageNum -- page number appended to the base URL; also used in the
                   name of the log file.

        Errors raised by urllib2.urlopen propagate to the caller.
        '''
        current_url = self.url + str(pageNum)
        # Send a browser-like User-Agent with the request.
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
        headers = {'User-Agent': user_agent}

        req = urllib2.Request(current_url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()

        result_list = self.analysis_html(html)
        self.write_result_log(result_list, pageNum)

    def analysis_html(self, html):
        '''
        Parse the HTML of a listing page and return the list of titles.

        A title is the text of the first <a> element inside every
        <div class="indexs">. Divs that contain no <a> are skipped instead of
        aborting the whole page.
        '''
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.title)

        result_list = []
        # The second positional argument of find_all() filters on CSS class.
        for entry in soup.find_all("div", "indexs"):
            anchor = entry.find('a')
            if anchor is None:
                continue
            result_list.append(anchor.text)
        return result_list

    @staticmethod
    def _to_text(value):
        '''Return value as unicode text; byte strings are decoded as UTF-8.'''
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return u'%s' % (value,)

    def write_result_log(self, result_list, pageNum):
        '''
        Write every entry of result_list on its own line to
        ``<cwd>/log/wananxinyu_<pageNum>.log`` (UTF-8 encoded).

        The ``log`` directory is created when it does not exist yet, and an
        existing log file for the same page is overwritten.
        '''
        log_dir = os.path.join(os.getcwd(), 'log')
        try:
            os.makedirs(log_dir)
        except OSError:
            # Already existing is fine; anything else is a real error.
            if not os.path.isdir(log_dir):
                raise

        file_path = os.path.join(log_dir, 'wananxinyu_' + str(pageNum) + '.log')
        # io.open encodes explicitly, so the output does not rely on the
        # process-wide default encoding; "with" closes the file on errors too.
        with io.open(file_path, 'w', encoding='utf-8') as log_file:
            for result in result_list:
                log_file.write(self._to_text(result))
                log_file.write(u'\n')
if __name__ == '__main__':
    # Crawl listing pages 1 through 9; every page gets its own log file.
    spider = Spider()
    page_numbers = range(1, 10)
    for page in page_numbers:
        spider.load_url(page)
上一篇: SpringBoot 配置