Python 读取 HTML 中指定元素并生成 Excel 文件示例
使用 Python 2.7 编写:读取 HTML 中的指定元素,并生成 Excel 文件
#coding=gbk
import codecs
import os
import string
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook
class logmsg:
    """Small file logger built on the root ``logging`` logger.

    A ``FileHandler`` writing to *logfile* is attached to the root logger and
    the root logger's level is set from *level*.  Because the root logger is
    shared, creating several instances makes every message go to every
    attached file.

    Parameters:
        logfile: path of the log file to append to.
        level: numeric logging level (10, 20, 30, 40 or 50).  Any other value
            selects ``logging.NOTSET``.

    Any failure while setting up, writing or closing prints a short message
    and terminates the process with exit status 1.
    """

    def __init__(self, logfile, level=0):
        try:
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]: %(message)s", "%Y%m%d %H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            known_levels = (
                logging.DEBUG,
                logging.INFO,
                logging.WARNING,
                logging.ERROR,
                logging.CRITICAL,
            )
            # Unknown numeric levels fall back to NOTSET, as documented above.
            if level in known_levels:
                self.logger.setLevel(level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception as exc:
            # Report the cause too: a bare message hides what actually failed.
            print("log init error! %s" % exc)
            raise SystemExit(1)

    def output(self, loginfo):
        """Write *loginfo* at the logger's current effective level.

        When the effective level is not one of the five standard levels
        (for example ``NOTSET``), the message is written at INFO.
        """
        try:
            import logging

            level = self.logger.getEffectiveLevel()
            standard = (
                logging.DEBUG,
                logging.INFO,
                logging.WARNING,
                logging.ERROR,
                logging.CRITICAL,
            )
            if level not in standard:
                level = logging.INFO
            self.logger.log(level, loginfo)
        except Exception as exc:
            print("log output error! %s" % exc)
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the root logger and close the file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() alone leaves the underlying file open.
            self.hdlr.close()
        except Exception as exc:
            print("log closed error! %s" % exc)
            raise SystemExit(1)
# Run-wide configuration.  The clock is read once so the log-file date and the
# spreadsheet timestamp cannot disagree when a run starts right at midnight.
_now = time.localtime()
# Second-resolution stamp that makes each run's spreadsheet name unique.
logtime = time.strftime("%Y%m%d%H%M%S", _now)
# Date-only stamp: the log file name changes once per day.
logfiletime = time.strftime("%Y%m%d", _now)
logfile = '/data/pyexample/logs/htmlparser_%s.log' % logfiletime
# 20 is the numeric value of logging.INFO.
log = logmsg(logfile, 20)
# Directory scanned for *.html input; the generated .xls is written here too.
datapath = '/data/pyexample/'
xlsname = 'dangjian_' + logtime + '.xls'
# Column titles of the import template, written to row 0 in this order.
_HEADERS = [
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
]

# Positions, in document order, of the <td> cells that feed each column.
# NOTE(review): folder name and keyword both read cell 6 -- confirm that this
# is intended and not a copy/paste slip.
_IDX_CONTENT_NAME = 4
_IDX_FOLDER = 6
_IDX_KEYWORD = 6
_IDX_VIDEO_NAME = 10
_IDX_DURATION = 16
_IDX_DESCRIPTION = 36


def _duration_seconds(text):
    """Return the digits found in *text*, read as minutes, as a seconds string.

    All ASCII digits in *text* are concatenated and multiplied by 60.  A text
    without any digit yields ``'0'`` instead of raising ``ValueError``.
    """
    digits = ''.join(ch for ch in text if ch in '0123456789')
    if not digits:
        return '0'
    return '%i' % (int(digits) * 60)


def _read_cells(path):
    """Return the GBK-encoded text of every <td> in the HTML file at *path*.

    The file is parsed one line at a time, so cell positions follow the order
    in which <td> tags appear in the file.
    """
    cells = []
    with codecs.open(path, 'r', 'gbk') as htmlfile:
        lines = htmlfile.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        if not line.strip():
            log.output('该处是空行')
            continue
        soup = BeautifulSoup(line, 'html.parser')
        for cell in soup.find_all('td'):
            # Strip spaces from the extracted text, not from the raw markup:
            # removing them from the markup would glue tag names to their
            # attributes.  U+00A0 (&nbsp;) is dropped as well because GBK
            # cannot encode it.
            text = cell.text.replace(u' ', u'').replace(u'\xa0', u'')
            cells.append(text.encode('gbk'))
    return cells


if __name__ == '__main__':
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    # Sorted so that row order does not depend on directory listing order.
    for f in sorted(os.listdir(datapath)):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        content = _read_cells(os.path.join(datapath, f))

        for idx, cell in enumerate(content):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        # The layout below relies on fixed cell positions; skip pages that
        # do not have enough cells rather than aborting the whole run.
        if len(content) <= _IDX_DESCRIPTION:
            log.output('skip %s: only %d td cells found' % (f, len(content)))
            continue

        foldername = content[_IDX_FOLDER]
        contentname = content[_IDX_CONTENT_NAME]
        str_duration = _duration_seconds(content[_IDX_DURATION])
        keyword = content[_IDX_KEYWORD]
        desciption = content[_IDX_DESCRIPTION]
        videoname_1 = content[_IDX_VIDEO_NAME]

        for value in (foldername, contentname, str_duration,
                      keyword, desciption, videoname_1):
            print(value)

        # One spreadsheet row, in the same column order as _HEADERS.
        row = [
            '',
            foldername,
            '',
            contentname,
            str_duration,
            keyword,
            desciption,
            '管理员',
            '华数编辑',
            videoname_1,
            '',
        ]
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(os.path.join(datapath, xlsname))
    print('=========================================')