tornado之TCPServer性能测试
前言
在下面这个博客里,介绍了tornado的异步TCPServer和TCPClient。
http://yunjianfei.iteye.com/blog/2186646
那么,tornado的性能到底怎么样呢?让我们来做一个性能测试吧。
这里推荐关于C10K问题的一个文章:http://www.kegel.com/c10k.html
准备工作
在开始性能测试前,需要修改一些相关的系统参数。
一.修改MAX open files
首先运行下面命令,查看单个进程运行时,能够打开的最多文件描述符(即FD)
本人的运行结果为: 65535
这个数值有点小,如果一个进程打开10万个文件或者FD,就会出现以下错误:
那么,我们开始修改。
打开 /etc/security/limits.conf 并添加(如果已经存在,则修改数值)
* hard nofile 1000000
修改完毕后,重新开一个shell窗口,或者重新登录shell就会使设置生效。
再次运行ulimit -n 可以看到已经是1000000
二.解决TCP的TIME_WAIT问题
如果在短时间内,TCPServer收到大量的连接(比如2万以上),并且这些连接迅速关闭,就会在linux系统中,发现大量状态为TIME-WAIT的TCP连接,可以通过以下命令来查看服务器中TCP连接的状态。
服务器中如果出现了大量的TIME-WAIT,导致的直接后果就是,TIME-WAIT状态的TCP连接占用的资源(比如端口号)不能得到及时释放,如果端口号被耗尽,则无法建立新的连接。
在开始测试前,我们修改一些系统参数,来缓解TIME_WAIT问题。
执行以下几个命令:
sysctl -w net.ipv4.tcp_tw_recycle=1
sysctl -w net.ipv4.tcp_max_tw_buckets=5000
sysctl -w net.ipv4.tcp_syncookies=1
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
具体的我这里就不解释了,有兴趣的同学可以自行去查找资料。
测试代码
tcp_server.py
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- from tornado import ioloop, httpclient, gen from tornado.gen import Task from tornado.tcpserver import TCPServer import pdb, time, logging from tornado import stack_context from tornado.escape import native_str #Init logging def init_logging(): logger = logging.getLogger() logger.setLevel(logging.DEBUG) sh = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s -%(module)s:%(filename)s-L%(lineno)d-%(levelname)s: %(message)s') sh.setFormatter(formatter) logger.addHandler(sh) logging.info("Current log level is : %s", logging.getLevelName(logger.getEffectiveLevel())) class MyServer(TCPServer): def __init__(self, io_loop=None, **kwargs): TCPServer.__init__(self, io_loop=io_loop, **kwargs) def handle_stream(self, stream, address): TCPConnection(stream, address, io_loop=self.io_loop) class TCPConnection(object): count = 0 def __init__(self, stream, address, io_loop): self.io_loop = io_loop self.stream = stream self.address = address self.address_family = stream.socket.family self.EOF = b' END' self._clear_request_state() self._message_callback = stack_context.wrap(self._on_message) self.stream.set_close_callback(self._on_connection_close) self.stream.read_until(self.EOF, self._message_callback) def _on_timeout(self): #logging.info("Send message..") self.write("Hello client!" + self.EOF) TCPConnection.count += 1 #logging.info("Has treated : %s", TCPConnection.count) if TCPConnection.count == 0 or TCPConnection.count % 2000 == 0: logging.info("Has treated : %s", TCPConnection.count) def _on_message(self, data): try: timeout = 5 data = native_str(data.decode('latin1')) #logging.info("Received: %s", data) #self.io_loop.add_timeout(self.io_loop.time() + timeout, self._on_timeout) self._on_timeout() except Exception, ex: logging.error("Exception: %s", str(ex)) def _clear_request_state(self): """Clears the per-request state. """ self._write_callback = None self._close_callback = None def set_close_callback(self, callback): """Sets a callback that will be run when the connection is closed. """ self._close_callback = stack_context.wrap(callback) def _on_connection_close(self): if self._close_callback is not None: callback = self._close_callback self._close_callback = None callback() self._clear_request_state() def close(self): self.stream.close() # Remove this reference to self, which would otherwise cause a self._clear_request_state() def write(self, chunk, callback=None): """Writes a chunk of output to the stream.""" if not self.stream.closed(): self._write_callback = stack_context.wrap(callback) self.stream.write(chunk, self._on_write_complete) def _on_write_complete(self): if self._write_callback is not None: callback = self._write_callback self._write_callback = None callback() def main(): init_logging() server = MyServer() server.listen(8001) ioloop.IOLoop.instance().start() if __name__ == "__main__": try: main() except Exception, ex: print "Ocurred Exception: %s" % str(ex) quit()
tcp_client.py
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- from tornado import ioloop, httpclient, gen from tornado.gen import Task import pdb, time, logging import tornado.ioloop import tornado.iostream import socket from tornado import stack_context #Init logging def init_logging(): logger = logging.getLogger() logger.setLevel(logging.DEBUG) sh = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s -%(module)s:%(filename)s-L%(lineno)d-%(levelname)s: %(message)s') sh.setFormatter(formatter) logger.addHandler(sh) logging.info("Current log level is : %s", logging.getLevelName(logger.getEffectiveLevel())) class TCPClient(object): test_start = False max_connected = 20000 test_num = 20000 test_remain = test_num test_count = 0 total_count = 0 shutdown = False def __init__(self, host, port, io_loop=None): self.host = host self.port = port self.io_loop = io_loop self.shutdown = False self.stream = None self.sock_fd = None self.EOF = b' END' def get_stream(self): self.sock_fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0) self.stream = tornado.iostream.IOStream(self.sock_fd) self.stream.set_close_callback(self.on_close) def connect(self): self.get_stream() self.stream.connect((self.host, self.port), self.send_message) def on_receive(self, data): #logging.info("Received: %s", data) self.stream.close() self.stream = None self.sock_fd = None TCPClient.test_count += 1 try_testing(self.io_loop) def on_close(self): TCPClient.total_count += 1 if TCPClient.total_count % 2000 == 0: logging.info("Has treat: %s", TCPClient.total_count) #logging.info("----count: %s %s %s", TCPClient.total_count, TCPClient.shutdown, TCPClient.test_num) if TCPClient.shutdown and TCPClient.total_count == TCPClient.test_num: logging.info("shutdown") self.io_loop.stop() def send_message(self): #logging.info("Send message....") self.stream.write(b"Hello Server!" + self.EOF) self.stream.read_until(self.EOF, self.on_receive) #logging.info("After send....") def set_shutdown(self): TCPClient.shutdown = True def _handle_exception(typ, value, tb): logging.info("%s %s %s", typ, value, tb) return True def try_testing(io_loop): #logging.info("test_count: %s, test_remain: %s", TCPClient.test_count, TCPClient.test_remain) if TCPClient.test_start and TCPClient.test_count >= TCPClient.max_connected and TCPClient.test_remain > 0: TCPClient.test_start = False if not TCPClient.test_start: TCPClient.test_count = 0 TCPClient.test_start = True gen = None if TCPClient.test_remain >= TCPClient.max_connected: gen = start_test(io_loop, TCPClient.max_connected) elif TCPClient.test_remain > 0: gen = start_test(io_loop, TCPClient.test_remain) if gen: with stack_context.ExceptionStackContext(_handle_exception): c = gen.next() while True: c = gen.send(c) def start_test(io_loop, count): TCPClient.test_remain = TCPClient.test_remain - count logging.info("Will start %s testing! Remain: %s", count, TCPClient.test_remain) for i in range(count): c = yield TCPClient("127.0.0.1", 8001, io_loop) if i == (count-1) and TCPClient.test_remain <= 0: c.set_shutdown() c.connect() def main(): init_logging() io_loop = tornado.ioloop.IOLoop.instance() try_testing(io_loop) logging.info("**********************start ioloop******************") io_loop.start() if __name__ == "__main__": try: main() except Exception, ex: print "Ocurred Exception: %s" % str(ex) quit()
上面的client代码中,可以修改test_num这个数值,来改变测试case的数目,上面设置成了20000
测试结果
tcp_server:
2015-02-27 17:01:30,544 -server:server.py-L55-INFO: Has treated : 2000
2015-02-27 17:01:30,807 -server:server.py-L55-INFO: Has treated : 4000
2015-02-27 17:01:31,242 -server:server.py-L55-INFO: Has treated : 6000
2015-02-27 17:01:31,510 -server:server.py-L55-INFO: Has treated : 8000
2015-02-27 17:01:31,738 -server:server.py-L55-INFO: Has treated : 10000
2015-02-27 17:01:32,220 -server:server.py-L55-INFO: Has treated : 12000
2015-02-27 17:01:32,452 -server:server.py-L55-INFO: Has treated : 14000
2015-02-27 17:01:32,743 -server:server.py-L55-INFO: Has treated : 16000
2015-02-27 17:01:33,146 -server:server.py-L55-INFO: Has treated : 18000
2015-02-27 17:01:38,342 -server:server.py-L55-INFO: Has treated : 20000
可以看到,总体来说,server的性能还是不错的,总共8s时间,处理了2万个请求+响应。
上面的测试结果中有个地方让我非常疑惑,最后2000个请求,中间居然耗费了5s,我在tornado的ioloop.py中加了一些debug语句跟踪后,发现是在33s的时候,卡在了ioloop的下面这一行
也就是说,epoll在等待IO事件到来,有点想不明白为什么在这里等了5s。经过多次测试,这种情况是完全可以复现的,还有待进一步调查。