欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

Python读取HTML中指定元素生成Excel文件示例

程序员文章站 2023-10-17 21:11:29
Python 2.7编写的读取HTML中指定元素,并生成Excel文件。复制代码 代码如下:#coding=gbk import string import codecs imp...

使用Python 2.7编写的读取HTML中指定元素,并生成Excel文件的示例。

复制代码 代码如下:

#coding=gbk
import codecs
import os,time
import string

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook

class logmsg:
    """Small file logger built on the root ``logging`` logger.

    A ``FileHandler`` for the given path is attached to the root logger.
    Messages passed to ``output`` are emitted at the logger's own effective
    level, so the logger's level check never filters them out.
    """

    def __init__(self, logfile, level=0):
        """Attach a file handler for *logfile* and set the logger level.

        logfile -- path of the log file handed to ``logging.FileHandler``.
        level   -- numeric level: 10, 20, 30, 40 or 50 select DEBUG, INFO,
                   WARNING, ERROR or CRITICAL; any other value selects NOTSET.

        If the handler cannot be set up (for example the directory is
        missing) a message is printed and the process exits with status 1.
        """
        try:
            import logging

            known_levels = {
                10: logging.DEBUG,
                20: logging.INFO,
                30: logging.WARNING,
                40: logging.ERROR,
                50: logging.CRITICAL,
            }
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            # %H:%M:%S is hour:minute:second; the lowercase forms mean
            # month-abbreviation / month / (non-portable) epoch seconds.
            # NOTE(review): %y is a two-digit year; use %Y if four digits
            # are wanted.
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(known_levels.get(level, logging.NOTSET))
        except Exception:
            # Single-argument parenthesised print is valid in Python 2 and 3.
            print("log init error!")
            raise SystemExit(1)

    def output(self, loginfo):
        """Write *loginfo* to the log at the logger's effective level.

        When the effective level is not one of the five standard levels
        (e.g. NOTSET) the message is logged with ``info``.  Any failure
        prints a message and exits with status 1.
        """
        try:
            level = self.logger.getEffectiveLevel()
            emitters = {
                10: self.logger.debug,
                20: self.logger.info,
                30: self.logger.warning,
                40: self.logger.error,
                50: self.logger.critical,
            }
            emitters.get(level, self.logger.info)(loginfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Any failure prints a message and exits with status 1.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler alone leaves the underlying file open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)

# Module-level configuration: timestamps, log file and output locations.
# One clock reading is shared so the log-file date and the workbook timestamp
# cannot fall on different sides of a second/midnight boundary.
_now = time.localtime()
# %H%M%S are hour, minute and second.  The lowercase %h/%m/%s mean month
# abbreviation, month and (platform-specific) epoch seconds.
# NOTE(review): %y is a two-digit year; switch to %Y if four digits are wanted.
logtime = time.strftime("%y%m%d%H%M%S", _now)
logfiletime = time.strftime("%y%m%d", _now)
logfile = '/data/pyexample/logs/htmlparser_%s.log' % logfiletime
# Level 20 selects INFO in logmsg.
log = logmsg(logfile, 20)


# Directory scanned for .html input; the generated workbook is saved here too.
datapath = '/data/pyexample/'
xlsname = 'dangjian_' + logtime + '.xls'


# Titles written to row 0 of the sheet, one per column.
_HEADER = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)

# Positions, in the flat list of <td> texts collected from one page, of the
# cells that are copied into the sheet.
_IDX_CONTENT_NAME = 4
_IDX_FOLDER = 6        # also used as the keyword column
_IDX_VIDEO_NAME = 10
_IDX_DURATION = 16
_IDX_DESCRIPTION = 36  # highest index read; pages with fewer cells are skipped


def _read_cells(path):
    """Return the GBK-encoded text of every <td> in the GBK file at *path*.

    The file is parsed one line at a time, so only <td> elements that open
    and close on the same line are collected.
    """
    cells = []
    # 'with' closes the file even if decoding raises.
    with codecs.open(path, 'r', 'gbk') as htmlfile:
        lines = htmlfile.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        # strip() removes the newline too, so a blank line becomes ''.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): this removes every space from the raw markup, which
        # also glues attributes to tag names (<td class=..> -> <tdclass=..>).
        # It may have been meant to drop '&nbsp;' only -- confirm.
        line = line.replace(' ', '')
        # Naming the parser keeps results independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(line, 'html.parser')
        for td in soup.find_all('td'):
            cells.append(td.text.encode("gbk"))
    return cells


def main():
    """Build one sheet row per .html file in ``datapath`` and save the workbook.

    Files with too few <td> cells, or whose duration cell contains no digits,
    are logged and skipped so one malformed page does not lose the others.
    """
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADER):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(datapath):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        content = _read_cells(os.path.join(datapath, f))

        # enumerate gives the true position even when cell texts repeat.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        if len(content) <= _IDX_DESCRIPTION:
            log.output('skip %s: need %d td cells, found %d'
                       % (f, _IDX_DESCRIPTION + 1, len(content)))
            continue

        digits = ''.join(ch for ch in content[_IDX_DURATION] if ch.isdigit())
        if not digits:
            log.output('skip %s: no digits in duration cell' % f)
            continue

        foldername = content[_IDX_FOLDER]
        contentname = content[_IDX_CONTENT_NAME]
        # presumably minutes converted to seconds -- confirm with the template
        str_duration = "%i" % (int(digits) * 60)
        keyword = content[_IDX_FOLDER]
        desciption = content[_IDX_DESCRIPTION]
        videoname_1 = content[_IDX_VIDEO_NAME]

        for value in (foldername, contentname, str_duration,
                      keyword, desciption, videoname_1):
            print(value)

        row = ['', foldername, '', contentname, str_duration, keyword,
               desciption, '管理员', '华数编辑', videoname_1, '']
        # Same text as the sheet row, comma separated, with one extra
        # trailing comma.
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(datapath + xlsname)

    print('=========================================')


if __name__ == '__main__':
    main()