简单的网络爬虫的python实现

# coding=utf-8

import HTMLParser
import urllib
import sys
import re
import os


# 定义HTML解析器
class parseLinks(HTMLParser.HTMLParser):
    """HTML parser that records the ``href`` of selected start tags.

    Every ``href`` value found on an ``<a>``, ``<li>`` or ``<link>`` start
    tag is written, one per line, to the module-level ``link_file`` object.
    ``link_file`` must therefore be open for writing before ``feed()`` is
    called on an instance.
    """

    # Tags whose href attribute is recorded.
    LINK_TAGS = ('a', 'li', 'link')

    def handle_starttag(self, tag, attrs):
        """Handle a start tag such as ``<div id="main">``.

        :param tag: name of the tag being opened.
        :param attrs: list of ``(name, value)`` pairs for the tag's
            attributes (href, name, id, onClick, ...).
        """
        if tag not in self.LINK_TAGS:
            return

        for name, value in attrs:
            # A valueless attribute (e.g. ``<a href>``) is reported with a
            # value of None; writing None to the file would raise TypeError,
            # so such attributes are skipped.
            if name == 'href' and value is not None:
                link_file.write(value)
                link_file.write("\n")

def search_info(link, key):
    name = key
    text = urllib.urlopen(link).read()
    file_object = open("text.txt", "w")
    file_object.write(text)
    file_object.close()

    file_read = open("text.txt", "r")
    for line in file_read:
        if re.search(name, line):
            print line
            file_result.write(line)
            file_result.write("\n")
    file_read.close()


def deep_search(link, depth):
    """Fetch *link* and feed its HTML to the module-level ``lParser``.

    :param link: URL of the page to download and parse.
    :param depth: accepted for interface compatibility but currently
        unused; only the single page at *link* is fetched, no sub-links
        are followed here.
    """
    response = urllib.urlopen(link)
    try:
        lParser.feed(response.read())
    finally:
        # Close the connection whether or not reading/parsing succeeded.
        response.close()

if __name__ == "__main__":
    #处理输入
    website = raw_input("请输入需要搜索的网站（exp:http://www.baidu.com）： ")
    key = raw_input("请输入需要搜索的关键字： ")
    print "需要查找的网站是：", website
    print "我知道了主人，您需要找关键字：", key
    # 创建HTML解析器的实例
    lParser = parseLinks()
    # 深度搜索子链接
    link_file = open("sub_link.txt", "w")
    deep_search("http://www.baidu.com", 10)
    link_file.close()

    # 查找子链接中的信息
    sub_link = open("sub_link.txt", "r")
    file_result = open("result.txt", "w")
    for sublink in sub_link:
        #print sublink
        if re.search("http", sublink):
            search_info(sublink, key)
    file_result.close()
    sub_link.close()

    lParser.close()

简单的网络爬虫的python实现

简单介绍的几个企业网络推广模式方法

操作Windows注册表的简单的Python程序制作教程

使用python实现语音文件的特征提取方法

Python实现在线程里运行scrapy的方法

Python 实现中值滤波、均值滤波的方法

编写简单的Python程序来判断文本的语种

Python实现简易过滤删除数字的方法小结

Python反爬虫技术之防止IP地址被封杀的讲解

Python实现的简单文件传输服务器和客户端

Python实现从脚本里运行scrapy的方法