python htmllib.HTMLParser处理A标签
程序员文章站
2022-06-10 23:14:02
...
#!/usr/bin/python
# -*- coding: utf-8 -*-
import htmllib,urllib,formatter,string
'''''
import chardet,sys
type = sys.getdefaultencoding()
'''
class GetLinks(htmllib.HTMLParser):
def __init__(self):
self.links = {}
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f)
def anchor_bgn(self, href, name, type):
self.save_bgn()
self.link = href
def anchor_end(self):
text = string.strip(self.save_end())
if self.link and text:
self.links[text] = self.link#self.links.get(text, []) + [self.link]
#print self.links
#exit()
fp = urllib.urlopen("http://www.baidu.com")
data = fp.read()
fp.close()
linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for href, link in linkdemo.links.items():
print href, "=>", link
# -*- coding: utf-8 -*-
import htmllib,urllib,formatter,string
'''''
import chardet,sys
type = sys.getdefaultencoding()
'''
class GetLinks(htmllib.HTMLParser):
def __init__(self):
self.links = {}
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f)
def anchor_bgn(self, href, name, type):
self.save_bgn()
self.link = href
def anchor_end(self):
text = string.strip(self.save_end())
if self.link and text:
self.links[text] = self.link#self.links.get(text, []) + [self.link]
#print self.links
#exit()
fp = urllib.urlopen("http://www.baidu.com")
data = fp.read()
fp.close()
linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for href, link in linkdemo.links.items():
print href, "=>", link
上一篇: python:装饰器