使用 Python 将 XML 解析成对应 HTML 的示例分享
本示例使用 SAX 将 ddt.xml 解析成 HTML(输出文件为 ddt_sax.html)。当然,如果已经有该 XML 对应的 XSL 文件,也可以直接用 libxml2 将其转换成 HTML。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
# 程序:xml解析器
# 版本:01.0
# 作者:mupeng
# 日期:2013-12-18
# 语言:python 2.7
# 功能:将xml解析成对应的html
# 注解:该程序用xml.sax模块的parse函数解析xml,并生成事件;
# website继承ContentHandler并重写其事件处理函数;
# dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
import io
from xml.sax import parse
from xml.sax.handler import ContentHandler
class dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element ``<foo>`` the start event is routed to ``startFoo`` and
    the end event to ``endFoo``. When no such method exists the event goes
    to ``defaultStart`` / ``defaultEnd`` instead, and is silently ignored if
    that is missing too. Every name is also tried in all-lowercase form
    (``startfoo``, ``defaultstart``), so handlers written in either spelling
    are found. Handlers are called with no arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for one element event, if there is one.

        Args:
            prefix: ``'start'`` or ``'end'``.
            name: Element name as reported by the parser.
            attrs: Element attributes. Accepted so callers can pass them
                through, but currently not forwarded to the handler.
        """
        handler = self._find_handler(prefix + name.capitalize())
        if handler is None:
            handler = self._find_handler('default' + prefix.capitalize())
        if handler is not None:
            handler()

    def _find_handler(self, method_name):
        """Return the callable attribute named *method_name*, or None.

        The exact spelling is tried first, then the all-lowercase spelling.
        Attributes that exist but are not callable are skipped.
        """
        for candidate in (method_name, method_name.lower()):
            method = getattr(self, candidate, None)
            if callable(method):
                return method
        return None

    def startElement(self, name, attrs):
        """SAX callback: an element has been opened."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: an element has been closed."""
        self.dispatch('end', name)

    # xml.sax only ever calls the camelCase names above; the lowercase
    # spellings are kept so existing direct callers keep working.
    startelement = startElement
    endelement = endElement
class website(dispatcher, ContentHandler):
    """SAX content handler that writes an RSS channel out as an HTML page.

    Element events reach this class through ``dispatch`` (inherited from
    ``dispatcher``), which calls the per-tag handler with no arguments.
    Character data is accumulated in ``self.temp`` and consumed by whichever
    ``end*`` handler fires next.

    The generated markup depends on document order: the banner is assembled
    from the channel ``<title>``, then ``<image>`` (its ``<link>`` before its
    ``<url>``), then the channel ``<description>`` and channel ``<link>``.
    Each ``<item>`` afterwards contributes one group of table rows.

    NOTE(review): feed text is written into the page without HTML escaping
    (item descriptions presumably carry markup that is meant to render).
    Only run this on feeds you trust.
    """

    # Emitted right after the channel title text: closes <head>, defines the
    # two helper scripts used by the banner links, and opens the banner table.
    _HEAD_END = u'''
</title>\n</head>\n<body>\n<center>\n
<script>\n
function copylink()
{
clipboardData.setData("text",window.location.href);
alert("rss链接已经复制到剪贴板");
}
function subscibelink()
{
var str = window.location.pathname;
while(str.match(/^\\//))
{
str = str.replace(/^\\//,"");
}
window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");
}
</script>\n
<table width="750" cellpadding="0" cellspacing="0">\n
<tr>\n
<td align="right" style="padding-right:15px;" valign="bottom">\n
'''

    # Emitted after the channel description: closes the banner table, adds
    # the "copy link" / "embed this list" row, and opens the items table.
    _BANNER_END = u'''
</td></tr>
<tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copylink();">复制此页链接</a> <a href="javascript:subscibelink();">我要嵌入该新闻列表到我的页面(简单、快速、实时、免费)</a></b></td></tr>
</table>
<table width="750" cellpadding="0" cellspacing="0">
'''

    # Emitted at </channel>: closes the items table and appends the script
    # that rewrites every <span> (the raw pubDate) into a localized
    # "published at ... (N days/hours/minutes ago)" string.
    _PAGE_END = u'''
<tr><td height="20"></td></tr>
</table>
</center>
<script>
function gettimediff(str)
{
if(str == '')
{
return '';
}
var pubdate = new Date(str);
var nowdate = new Date();
var diffmilseconds = nowdate.valueOf()-pubdate.valueOf();
var days = diffmilseconds/86400000;
days = parseInt(days);
diffmilseconds = diffmilseconds-(days*86400000);
var hours = diffmilseconds/3600000;
hours = parseInt(hours);
diffmilseconds = diffmilseconds-(hours*3600000);
var minutes = diffmilseconds/60000;
minutes = parseInt(minutes);
diffmilseconds = diffmilseconds-(minutes*60000);
var seconds = diffmilseconds/1000;
seconds = parseInt(seconds);
var returnstr = "北京发布时间:" + pubdate.toLocaleString();
if(days > 0)
{
returnstr = returnstr + " (距离现在" + days + "天" + hours + "小时" + minutes + "分钟)";
}
else if (hours > 0)
{
returnstr = returnstr + " (距离现在" + hours + "小时" + minutes + "分钟)";
}
else if (minutes > 0)
{
returnstr = returnstr + " (距离现在" + minutes + "分钟)";
}
return returnstr;
}
function getspantext()
{
var pubdate;
var pubdatearray;
var spanarray = document.getElementsByTagName("span");
for(var i = 0; i < spanarray.length; i++)
{
pubdate = spanarray[i].innerHTML;
document.getElementsByTagName("span")[i].innerHTML = gettimediff(pubdate);
}
}
getspantext();
</script>
</body>
</html>
'''

    def __init__(self, output_path='ddt_sax.html', encoding='gb2312'):
        """Open the output file and reset all parser state.

        Args:
            output_path: File the HTML page is written to.
            encoding: Encoding of the output file. It is also written into
                the page's charset declaration, so it should be a name that
                is valid both as a Python codec and as an HTML charset.
        """
        ContentHandler.__init__(self)
        self.encoding = encoding
        # One encoding step at the file boundary instead of per-write
        # .encode() calls. Characters the target encoding cannot represent
        # become numeric character references rather than raising.
        self.fout = io.open(output_path, 'w', encoding=encoding,
                            errors='xmlcharrefreplace')
        self.imagein = False   # True while inside <image>
        self.desflag = False   # unused; kept for compatibility
        self.item = False      # True while inside <item>
        self.title = u''
        self.link = u''
        self.guid = u''
        self.url = u''
        self.pubdate = u''
        self.description = u''
        self.temp = u''        # character data collected since the last end tag
        self.prx = u''         # unused; kept for compatibility

    def _emit(self, *fragments):
        """Write the given text fragments to the output file, in order."""
        self.fout.write(u''.join(fragments))

    def _take_text(self):
        """Return the buffered character data and clear the buffer."""
        text, self.temp = self.temp, u''
        return text

    def close(self):
        """Close the output file; safe to call more than once."""
        if not self.fout.closed:
            self.fout.close()

    def endDocument(self):
        """SAX callback: make sure the file is closed even without </channel>."""
        self.close()

    def characters(self, chars):
        """SAX callback: buffer character data, skipping whitespace-only chunks."""
        if chars.strip():
            self.temp += chars

    def startchannel(self):
        """Begin the page; the channel title is appended to the <title> next."""
        self._emit(
            u'<html>\n<head>\n'
            u'<meta http-equiv="Content-Type" content="text/html; charset=',
            self.encoding,
            u'">\n<title> rss-')

    def endchannel(self):
        """Write the page footer and close the output file."""
        self._emit(self._PAGE_END)
        self.close()

    def starttitle(self):
        """Open the title row for an item; channel/image titles need no prefix."""
        if self.item:
            self._emit(u'''
<tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<b>
''')

    def endtitle(self):
        """Write the collected title for an item or for the channel."""
        text = self._take_text()
        if self.item:
            self.title = text
            self._emit(text, u'''
</b>
</td>
</tr>
<tr bgcolor="#eeeeee">
<td style="padding-left:5px;">
''')
        elif not self.imagein:
            self.title = text
            self._emit(text, self._HEAD_END)
        # Otherwise this is the <image> title: it is not rendered, and its
        # text has already been dropped from the buffer above so it cannot
        # leak into the image link that follows.

    def startimage(self):
        """Enter the <image> element."""
        self.imagein = True

    def endimage(self):
        """Leave the <image> element."""
        self.imagein = False

    def startlink(self):
        """Open the anchor that wraps the channel image."""
        if self.imagein:
            self._emit(u'<a href=" ')

    def endlink(self):
        """Store the link; write it out for the image and for the channel.

        Inside an item the link is only stored; ``enditem`` writes it.
        """
        self.link = self._take_text()
        if self.imagein:
            self._emit(self.link, u'" target="_blank">\n ')
        elif not self.item:
            # Channel link: completes the banner anchor opened by endurl,
            # then the channel title, description, and the toolbar row.
            self._emit(
                self.link,
                u' " target="_blank"> ',
                self.title,
                u''' </a></b></td>
</tr>
<tr><td colspan="2" align="center">
''',
                self.description,
                self._BANNER_END)

    def starturl(self):
        """Open the <img> tag for the channel image."""
        if self.imagein:
            self._emit(u'<img src=" ')

    def endurl(self):
        """Finish the channel image and open the anchor for the channel link."""
        self.url = self._take_text()
        if self.imagein:
            self._emit(self.url, u'''" border="0">\n
</a>
</td>
<td align="left" valign="bottom" style="padding-bottom:8px;"><b><a href="
''')

    def defaultstart(self):
        """Start of an element with no dedicated handler: nothing to do."""
        pass

    def defaultend(self):
        """End of an element with no dedicated handler: discard its text."""
        self.temp = u''

    def startdescription(self):
        """Nothing to write until the description text is complete."""
        pass

    def enddescription(self):
        """Store the description; inside an item, write it immediately."""
        self.description = self._take_text()
        if self.item:
            self._emit(self.description)

    def endguid(self):
        """Store the item's guid (shown as the link text in ``enditem``)."""
        # Clearing the buffer here keeps the guid from being prepended to
        # the text of whatever element comes next.
        self.guid = self._take_text()

    def endpubdate(self):
        """Store the item's publication date string."""
        # No "starts with http" filtering is needed: the guid no longer
        # lingers in the buffer, so the text here is the date alone.
        self.pubdate = self._take_text()

    def startitem(self):
        """Enter an <item> and forget the previous item's fields."""
        self.item = True
        # Without this an item lacking <link>/<guid>/<pubDate> would show
        # the values of the item before it.
        self.link = u''
        self.guid = u''
        self.pubdate = u''

    def enditem(self):
        """Leave the <item> and write its link, guid and pubDate rows."""
        self.item = False
        self._emit(
            u'''
</td>
</tr>
<tr bgcolor="#eeeeee">
<td style="padding-top:5px;padding-left:5px;">
<a href="''',
            self.link,
            u' " target="_blank"> ',
            self.guid,
            u'''
</a>
</td>
</tr>
<tr bgcolor="#eeeeee">
<td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''',
            self.pubdate,
            u'''</span></td>
</tr>
<tr height="10"><td></td></tr>''')

    # dispatch() builds handler names as prefix + name.capitalize()
    # (e.g. "startChannel", "endPubdate"), so expose every handler under
    # that spelling as well as the lowercase one.
    startChannel = startchannel
    endChannel = endchannel
    startTitle = starttitle
    endTitle = endtitle
    startImage = startimage
    endImage = endimage
    startLink = startlink
    endLink = endlink
    startUrl = starturl
    endUrl = endurl
    defaultStart = defaultstart
    defaultEnd = defaultend
    startDescription = startdescription
    endDescription = enddescription
    endGuid = endguid
    endPubdate = endpubdate
    startItem = startitem
    endItem = enditem
# Program entry point: parse ddt.xml from the current directory and let the
# website handler write the resulting HTML page.
if __name__ == '__main__':
    parse('ddt.xml', website())
上一篇: cad中怎么按比例缩放图纸? cad比例多放的两种方法
下一篇: SQL 实现定期备份数据库