【Python真的很强大】开发简易在线搜索 博客分类: python / Model python / http server / html parser / open local default browser
程序员文章站
2024-03-13 10:04:45
...
- 需求如下:开发一个简易的搜索引擎(即提供查询关键字的服务)
- 程序思路及模型: python构建一个Http Server;提供用户输入的一个静态页面;用户提交请求后,把请求再转发到其他站点,最后把结果(动态页面)传给用户
- 下面是程序原型模型(使用python 2.7编写,没有使用第三方library/module),没有任何优化,主要演示Http Server构建过程,Html Parser使用,调用本地浏览器进行体验等。
-
# For ease of testing, all of the code lives in this single file.
# Results are not saved to a local database (SQLite).
# Result pages are not downloaded recursively.
# Runs on Python 2.7 and Python 3 using only the standard library.
import logging
import os
import subprocess
import sys

try:  # Python 3 module layout
    from html import escape as html_escape
    from html.parser import HTMLParser
    from http.server import SimpleHTTPRequestHandler
    from socketserver import TCPServer
    from urllib.parse import parse_qs, quote_plus, urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2.7 module layout
    from cgi import escape as html_escape
    from HTMLParser import HTMLParser
    from SimpleHTTPServer import SimpleHTTPRequestHandler
    from SocketServer import TCPServer
    from urllib import quote_plus
    from urllib2 import urlopen
    from urlparse import parse_qs, urljoin

# Port used when none is given on the command line.
DEFAULT_PORT = 8899
# Site that actually performs the search; relative result links resolve
# against it.
SEARCH_SITE = "http://uic.edu.hk"
SEARCH_URL = (SEARCH_SITE +
              "/en/component/search/?searchword=%s&searchphrase=all&Itemid=108")
# Seconds to wait for the upstream site; the server handles one request at a
# time, so a hung upstream would otherwise block every client.
SEARCH_TIMEOUT = 10

# Page template. 'UIC' (the default value of the keyword box) and
# 'search:result' are placeholders filled in by render_page().
htmlContent = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<style type="text/css">
.myform {
COLOR: #ffffff;
BACKGROUND-COLOR: skyblue;
BORDER-RIGHT: rgb(0,0,0) 1px dashed;
BORDER-TOP: rgb(0,0,0) 1px dotted;
BORDER-LEFT: rgb(0,0,0) 1px dotted;
BORDER-BOTTOM: rgb(0,0,0) 1px dotted
}
li a {
display:block;
}
</style>
</head>
<body>
<div class="myform">
<form action="/" method="post">
Search Keyword: <input type="text" name="keyword" value="UIC"><input type="submit" value="Submit">
</form>
</div>
<ul>
search:result
</ul>
<p>The above is parts of result for searching.</p>
</body>
</html>
'''


def openUrlInDefaultBrowser(url):
    """Open *url* with the platform's default browser.

    Falls back to printing the URL when no opener is available on a
    non-Windows, non-macOS platform.
    """
    if sys.platform == 'win32':
        os.startfile(url)
    elif sys.platform == 'darwin':
        subprocess.Popen(['open', url])
    else:
        try:
            subprocess.Popen(['xdg-open', url])
        except OSError:
            print('Please open a browser on: ' + url)


def port_from_argv(argv):
    """Return the listening port selected by the command line *argv*.

    The second argument wins when present, otherwise the first one;
    with no arguments DEFAULT_PORT is used.

    Raises:
        ValueError: if the selected argument is not an integer.
    """
    if len(argv) > 2:
        return int(argv[2])
    if len(argv) > 1:
        return int(argv[1])
    return DEFAULT_PORT


def render_page(keyword=None, results=''):
    """Return the HTML page with the placeholders filled in.

    Args:
        keyword: text shown in the search box; the template default ('UIC')
            is kept when it is empty or None. It is HTML-escaped here.
        results: ready-made ``<li>`` markup placed inside the result list.
    """
    # Split once instead of chaining str.replace so that neither the keyword
    # nor the result markup can be mistaken for a placeholder.
    head, tail = htmlContent.split('search:result', 1)
    if keyword:
        head = head.replace('UIC', html_escape(keyword, True))
    return head + results + tail


def render_results(pairs):
    """Turn ``(href, title)`` pairs into a string of ``<li><a>`` items.

    Links are resolved against SEARCH_SITE and all text is HTML-escaped; a
    link without any text is labelled with its own address.
    """
    items = []
    for href, title in pairs:
        target = urljoin(SEARCH_SITE, href.strip())
        label = title.strip() or target
        items.append('<li><a href="%s">%s</a></li>'
                     % (html_escape(target, True), html_escape(label, True)))
    return ''.join(items)


def parse_results(markup):
    """Extract ``(href, title)`` pairs from a search-result page."""
    parser = MyHTMLParser()
    parser.feed(markup)
    parser.close()
    return list(zip(parser.urls, parser.titles))


def fetch_results(keyword):
    """Query the upstream site for *keyword* and return ``(href, title)`` pairs.

    Raises:
        IOError: when the upstream site cannot be reached or times out.
    """
    # The keyword comes from the user: it must be URL-encoded before it is
    # placed into the query string.
    response = urlopen(SEARCH_URL % quote_plus(keyword), timeout=SEARCH_TIMEOUT)
    try:
        payload = response.read()
    finally:
        response.close()
    if not isinstance(payload, str):
        # Python 3 returns bytes; the parser needs text.
        payload = payload.decode('utf-8', 'replace')
    return parse_results(payload)


class ServerHandler(SimpleHTTPRequestHandler):
    """Serve the search form (GET) and the search results (POST)."""

    def _send_html(self, page):
        """Send *page* as a complete ``200 OK`` HTML response."""
        body = page if isinstance(page, bytes) else page.encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _read_form(self):
        """Return the url-encoded POST body as a ``{name: [values]}`` dict."""
        try:
            length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            length = 0
        raw = self.rfile.read(length) if length > 0 else b''
        if not isinstance(raw, str):
            # Python 3 reads bytes; parse_qs should produce text.
            raw = raw.decode('utf-8', 'replace')
        return parse_qs(raw)

    def do_GET(self):
        """Serve the empty search form."""
        logging.warning("======= GET STARTED =======")
        logging.warning(self.headers)
        self._send_html(render_page())

    def do_POST(self):
        """Run the submitted search and serve the form with its results."""
        logging.warning("======= POST STARTED =======")
        logging.warning(self.headers)
        form = self._read_form()
        logging.warning("======= POST VALUES =======")
        for name in form:
            logging.warning("%s=%r", name, form[name])
        logging.warning("\n")

        values = form.get("keyword")
        keyword = values[0] if values else None
        if not keyword:
            self._send_html(render_page())
            return
        try:
            results = render_results(fetch_results(keyword))
        except IOError as exc:
            # urllib's URLError/HTTPError and socket timeouts are all IOError
            # subclasses; still answer the browser instead of dropping the
            # connection.
            logging.error("search for %r failed: %s", keyword, exc)
            results = "<li>Search failed, please try again later.</li>"
        self._send_html(render_page(keyword, results))


class MyHTMLParser(HTMLParser):
    """Collect the links found inside ``<dt class="result-title">`` elements.

    After feeding a page, ``urls[i]`` and ``titles[i]`` describe the same
    link: the lists always have equal length.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.recording = 0  # 1 while inside a result-title <dt>
        self.findHref = 0   # 1 while inside a recorded <a href=...>
        self.urls = []      # href of every recorded link
        self.titles = []    # visible text of every recorded link

    def handle_starttag(self, tag, attrs):
        if tag == 'dt':
            if any(name == 'class' and value == 'result-title'
                   for name, value in attrs):
                self.recording = 1
                self.findHref = 0
        elif tag == 'a' and self.recording == 1:
            for name, value in attrs:
                if name == 'href' and value is not None:
                    self.urls.append(value)
                    # Start the matching title right away so that links
                    # without text cannot shift later titles.
                    self.titles.append('')
                    self.findHref = 1
                    break

    def handle_endtag(self, tag):
        if tag == 'dt':
            self.recording = 0
            self.findHref = 0
        elif tag == 'a':
            self.findHref = 0

    def handle_data(self, data):
        # A link's text may arrive in several chunks (e.g. around nested
        # tags); join them into one title.
        if self.findHref:
            self.titles[-1] += data


Handler = ServerHandler

if __name__ == '__main__':
    PORT = port_from_argv(sys.argv)
    httpd = TCPServer(("", PORT), Handler)
    print("author:sunflowerbbs@gmail.com from UC Studio, Python http server version 1.0 (for testing purposes only)")
    openUrlInDefaultBrowser("http://%s:%d" % ("localhost", PORT))
    try:
        httpd.serve_forever()
    finally:
        httpd.server_close()