欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

python htmllib.HTMLParser处理A标签

程序员文章站 2022-06-10 23:14:02
...
#!/usr/bin/python 
#encoding='utf-8' 
import htmllib,urllib,formatter,string 
'''''
import chardet,sys
type = sys.getdefaultencoding()
''' 
class GetLinks(htmllib.HTMLParser): 
    def __init__(self):
        self.links = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f) 
 
    def anchor_bgn(self, href, name, type): 
        self.save_bgn() 
        self.link = href 
 
    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.link and text: 
            self.links[text] = self.link#self.links.get(text, []) + [self.link] 
            #print self.links 
            #exit() 
fp = urllib.urlopen("http://www.baidu.com")
data = fp.read() 
fp.close() 
 
linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close() 
 
for href, link in linkdemo.links.items():
    print href, "=>", link 
相关标签: python