php与python实现的线程池多线程爬虫功能示例
程序员文章站
2024-03-01 21:20:22
本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以用于抓取内容,并且能够提升抓取性能,这里我们来看php与pyt...
本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以用于抓取内容,并且能够提升抓取性能。这里我们来看php与python线程池多线程爬虫的例子,代码如下:
php例子
<?php

/**
 * Worker (pthreads "worker" model) that owns a single cURL handle.
 *
 * Tasks submitted to the pool obtain the handle through getConnection().
 */
class Connect extends Worker
{
    /**
     * Note that the link is stored statically, which for pthreads means
     * thread local.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Return the cURL handle, creating and configuring it on first use.
     *
     * The CURLOPT_* constants must be upper case: PHP constants are
     * case-sensitive, so the lower-case spellings are undefined.
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }

        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    /**
     * Close the cURL handle if one was ever created.
     */
    public function closeConnection()
    {
        if (self::$ch) {
            curl_close(self::$ch);
            self::$ch = null;
        }
    }
}

/**
 * One unit of work: fetch a single URL using the worker's cURL handle.
 */
class Query extends Threaded
{
    protected $url;
    protected $result;

    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch $this->url, report the outcome and keep the response body.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print "ok" for an HTTP 200 response, otherwise the cURL error text.
     *
     * The id shown is the second dot-separated part of the URL; when the URL
     * contains no dot the whole URL is used instead of reading a missing index.
     */
    function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);
        $id = isset($parts[1]) ? $parts[1] : $url;
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "ok");
        }
    }

    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }

    public function getResult()
    {
        return $this->result;
    }
}

/**
 * Submit one Query per URL to a pool of Connect workers.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // URLs to fetch (url => display name)
    $check_urls = array('http://xxx.com' => "xx网",);

    $pool = new Pool(10, "Connect", array()); // pool of size 10
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();

python 多线程

from threading import Thread


def handle(sid):
    # Do the crawling / data processing for this id inside this function.
    pass


class MyThread(Thread):
    """Thread that calls handle() with the id it was created with."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        handle(self.sid)


# Lower-case alias kept so the name used in the original listing still works.
mythread = MyThread

threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
python 线程池爬虫:
from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

# Address of the server being crawled; only links to this host are followed.
HOST = 'localhost'
PORT = 3000

# Paths discovered so far. Shared by all fetcher threads; guarded by `lock`.
seen_urls = set(['/'])
lock = Lock()

_HREF_RE = re.compile(r'''(?i)href=["']?([^\s"'<>]+)''')
_HEADER_END = b'\r\n\r\n'


class Fetcher(Thread):
    """Daemon thread that fetches paths from a shared queue and enqueues new links."""

    def __init__(self, tasks):
        """Store the task queue and start the thread immediately."""
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Fetch queued paths forever, queueing every link not seen before."""
        while True:
            url = self.tasks.get()
            try:
                print(url)
                response = self._fetch(url)
                links = self.parse_links(url, response)
                with lock:
                    for link in links.difference(seen_urls):
                        self.tasks.put(link)
                    seen_urls.update(links)
            except (OSError, UnicodeError) as exc:
                # A failed fetch must not kill the worker thread.
                print('error: {}: {}'.format(url, exc))
            finally:
                # Always mark the task done, otherwise Queue.join() never returns.
                self.tasks.task_done()

    def _fetch(self, url):
        """Send an HTTP/1.0 GET for *url* and return the raw response bytes."""
        request = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
        chunks = []
        with socket.socket() as sock:
            sock.connect((HOST, PORT))
            # sendall(): a bare send() may transmit only part of the request.
            sock.sendall(request.encode('ascii'))
            while True:
                chunk = sock.recv(4096)
                if not chunk:
                    break
                chunks.append(chunk)
        return b''.join(chunks)

    def parse_links(self, fetched_url, response):
        """Return the set of same-host paths linked from an HTML response.

        An empty response is reported as an error. Empty and non-HTML
        responses yield an empty set. Fragments and query strings are dropped.
        """
        if not response:
            print('error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()

        links = set()
        for url in set(_HREF_RE.findall(self.body(response))):
            try:
                normalized = urllib.parse.urljoin(fetched_url, url)
                parts = urllib.parse.urlparse(normalized)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it.
                continue
            if parts.scheme not in ('', 'http', 'https'):
                continue
            host = parts.hostname  # already lower-cased, port stripped
            if host and host != HOST:
                continue
            # An absolute link to the bare host has an empty path.
            links.add(parts.path or '/')
        return links

    def body(self, response):
        """Return the decoded body (text after the first blank line), '' if none."""
        _, _, raw_body = response.partition(_HEADER_END)
        return raw_body.decode('utf-8', errors='replace')

    def _is_html(self, response):
        """Return True when the Content-Type header starts with text/html."""
        head, _, _ = response.partition(_HEADER_END)
        headers = {}
        # First line is the status line; header names are case-insensitive.
        for line in head.decode('latin-1').split('\r\n')[1:]:
            name, sep, value = line.partition(':')
            if sep:
                headers[name.strip().lower()] = value.strip()
        return headers.get('content-type', '').lower().startswith('text/html')


class ThreadPool:
    """Fixed number of Fetcher threads consuming one shared task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Queue *url* for fetching."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()


# Lower-case aliases kept so the names used in the original listing still work.
fetcher = Fetcher
threadpool = ThreadPool


def main():
    """Crawl the site starting at "/" with four threads and print a summary."""
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    print('{} urls fetched in {:.1f} seconds'.format(len(seen_urls), time.time() - start))


if __name__ == '__main__':
    main()
对php相关内容感兴趣的读者可查看本站专题:《php curl用法总结》、《php数组(array)操作技巧大全》、《php排序算法总结》、《php常用遍历算法与技巧总结》、《php数据结构与算法教程》、《php程序设计算法总结》、《php数学运算技巧总结》、《php正则表达式用法总结》、《php运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》。
希望本文所述对大家php程序设计有所帮助。
上一篇: 详谈Spring框架之事务管理