页面抓取程序
程序员文章站
2022-04-16 17:38:55
...
跳至
_init_request($request); $this->_init_curl(); }

    // NOTE(review): the class header and the beginning of the constructor are
    // missing from this extract; the fragment above is reproduced verbatim.

    /**
     * Reset the crawl state: both counters and the collected article list.
     *
     * articleCount and article_list are initialised here as well, because
     * display(), fetch(), fetch_article() and _save_article_list() read them
     * even when no page has been fetched successfully.
     */
    protected function _init_var()
    {
        $this->pageCount = 0;
        $this->articleCount = 0;
        $this->article_list = array();
    }

    /**
     * Store the crawl configuration object.
     *
     * @param request $request Provides base_url, cache_path, fetch_item_query
     *                         and fetch_page_current.
     */
    protected function _init_request(request $request)
    {
        $this->request = $request;
        // Example configuration:
        // $this->request->cache_path = __DIR__ . '/sjm_cache/';
        // $this->request->fetch_item_query = '#J_posts_list .subject .title a';
        // $this->request->fetch_page_current = '.J_page_wrap .pages strong';
        // $this->request->base_url = 'http://bbs.sijiaomao.com/index.php?m=bbs&c=thread&fid=10&page=%d';
    }

    /**
     * Create the CurlMulti client and make sure the cache directories exist.
     *
     * Two directories are derived from the configured cache path:
     * "<cache_path>cache" (handed to CurlMulti as its cache dir) and
     * "<cache_path>data" (where the article index files are written).
     */
    protected function _init_curl()
    {
        $this->curl = new CurlMulti();

        $this->cacheDir = $this->request->cache_path . 'cache';
        $this->cacheDataDir = $this->request->cache_path . 'data';

        foreach (array($this->cacheDir, $this->cacheDataDir) as $dir) {
            if (! is_dir($dir)) {
                // The mode must be an octal literal: decimal 777 is octal 01411
                // and yields unintended permission bits.
                mkdir($dir, 0777, true);
            }
        }

        $this->curl->cache = array(
            'dir' => $this->cacheDir,
            'on' => true,
            'expire' => 3600 * 24 // presumably seconds (one day) - confirm against CurlMulti
        );
        $this->curl->maxThread = 10;
        $this->curl->opt[CURLOPT_CONNECTTIMEOUT] = 10;
    }

    /**
     * Crawl the list pages starting at page 1, then persist the article index.
     */
    public function fetch_list()
    {
        $this->_add_fetch_list_url();
        $this->curl->start();
        $this->_save_article_list();
    }

    /**
     * Queue every collected article URL and run the downloads.
     */
    public function fetch_article()
    {
        foreach ($this->article_list as $article) {
            $this->curl->add(
                array('url' => $article['href']),
                array($this, '_success_article')
            );
        }
        $this->curl->start();
    }

    /**
     * Print the crawl summary produced by fetch().
     */
    public function display()
    {
        echo $this->fetch();
    }

    /**
     * Build the crawl summary.
     *
     * @return string Total fetched pages (list pages + articles), number of
     *                list pages, number of collected articles and the path of
     *                the generated list.php.
     */
    public function fetch()
    {
        return sprintf(
            "\n共抓取%d个页面\n文章列表%d篇\n相关文章%d篇\n文章目录存放在%s\n",
            $this->pageCount + $this->articleCount,
            $this->pageCount,
            count($this->article_list),
            $this->cacheDataDir . '/list.php'
        );
    }

    /**
     * Queue one list page for download.
     *
     * @param int $page Page number substituted into the base_url template.
     */
    public function _add_fetch_list_url($page = 1)
    {
        $this->curl->add(
            array(
                'url' => sprintf($this->request->base_url, $page),
                'args' => array('page' => $page)
            ),
            array($this, '_success_list')
        );
    }

    /**
     * Write the collected articles to data/list.php (a PHP file returning the
     * array) and data/list.txt (one "title / link" line per article).
     *
     * Both writes are always attempted.
     *
     * @return int|false Bytes written to list.txt, or false if either write failed.
     */
    protected function _save_article_list()
    {
        $phpBytes = file_put_contents(
            $this->cacheDataDir . '/list.php',
            sprintf("<?php\n return\t%s;", var_export($this->article_list, true))
        );

        // Relevance-based sorting (disabled):
        /*uasort($this->article_list, function ($a, $b){
            preg_match_all('#([a-zA-Z]+)#is', $a['title'], $match);
            $a_title = strtoupper(implode("", $match[0]));
            preg_match_all('#([a-zA-Z]+)#is', $b['title'], $match);
            $b_title = strtoupper(implode("", $match[0]));
            return $a_title > $b_title;
        });*/

        // file_put_contents() accepts an array and writes its elements joined together.
        $lines = array_map(function ($article) {
            return sprintf(
                "标题:%s\t超链接:%s \n",
                str_replace(" ", "", $article['title']),
                $article['href']
            );
        }, $this->article_list);
        $txtBytes = file_put_contents($this->cacheDataDir . '/list.txt', $lines);

        // Surface a failure of the first write instead of silently discarding it.
        return $phpBytes === false ? false : $txtBytes;
    }

    /**
     * Download callback for an article page: only counts the page.
     *
     * @param mixed $r     Download result (unused).
     * @param mixed $param Callback arguments (unused).
     */
    public function _success_article($r, $param)
    {
        ++$this->articleCount;
    }

    /**
     * Download callback for a list page: collect the article links and queue
     * the following list page when the pager shows one.
     *
     * @param array $r     Download result; its 'content' entry is parsed as HTML.
     * @param array $param Presumably the 'args' array given to add(); its
     *                     'page' entry is the page number just fetched.
     */
    public function _success_list($r, $param)
    {
        ++$this->pageCount;

        $html = phpQuery::newDocumentHTML($r['content']);
        foreach ($html[$this->request->fetch_item_query] as $node) {
            $link = pq($node);
            $title = $link->attr('title');
            $item = array(
                // Fall back to the link text when there is no title attribute.
                "title" => $title ? $title : $link->text(),
                "href" => real_url($link->attr('href'), $this->request->base_url)
            );
            // Keyed by URL hash, so a link seen on several pages is stored once.
            $this->article_list[md5($item['href'])] = $item;
        }

        // A non-empty sibling after the "current page" element means another page exists.
        $page_current = $html[$this->request->fetch_page_current];
        if ($page_current->next()->text()) {
            $this->_add_fetch_list_url($param['page'] + 1);
        }

        phpQuery::unloadDocuments();
    }
}

/**
 * Crawl configuration populated from the POSTed form fields.
 */
class request
{
    /* URL template of the list pages (filled in with the page number via sprintf) */
    public $base_url;
    /* Cache directory path */
    public $cache_path;
    /* CSS selector of the article link elements */
    public $fetch_item_query;
    /* CSS selector of the pager's current-page element */
    public $fetch_page_current;

    static $instance;

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return request
     */
    static public function getInstance()
    {
        if (empty(self::$instance)) {
            self::$instance = new self;
        }
        return self::$instance;
    }

    private function __construct()
    {
        $this->_init_base();
    }

    /**
     * Load the configuration from $_POST.
     *
     * cache_path is user input that becomes a filesystem path below __DIR__,
     * so empty, "." and ".." segments are dropped to keep it inside __DIR__.
     * Missing fields become null without raising undefined-index warnings.
     */
    function _init_base()
    {
        $relative = self::_clean_relative_path(self::_post_value('cache_path'));
        $this->cache_path = __DIR__ . '/' . ($relative === '' ? '' : $relative . '/');
        $this->fetch_item_query = self::_post_value('fetch_item_query');
        $this->fetch_page_current = self::_post_value('fetch_page_current');
        $this->base_url = self::_post_value('url');
    }

    /**
     * Read a POST field, or null when it is absent.
     */
    private static function _post_value($key)
    {
        return isset($_POST[$key]) ? $_POST[$key] : null;
    }

    /**
     * Normalise a user-supplied relative path: split on "/" and "\", drop
     * empty, "." and ".." segments, and re-join with "/".
     */
    private static function _clean_relative_path($path)
    {
        if (! is_string($path)) {
            return '';
        }
        $segments = array();
        foreach (preg_split('#[/\\\\]+#', $path, -1, PREG_SPLIT_NO_EMPTY) as $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $segments[] = $segment;
            }
        }
        return implode('/', $segments);
    }

    /**
     * Assemble a request URL and POST payload from the submitted form.
     *
     * NOTE(review): the computed $url and $post_data are never returned or
     * stored, so this method currently has no observable effect - confirm the
     * intended use before relying on it.
     */
    function request()
    {
        // NOTE(review): the original read an undefined $auth here; it is made
        // explicitly empty, which yields the same URL without the warning.
        $auth = '';
        $base = isset($_POST['url']) ? $_POST['url'] : '';

        if (strstr($base, '?')) {
            $url = sprintf("%s&auth=%s", $base, $auth);
        } else {
            $url = sprintf("%s?auth=%s", $base, $auth);
        }

        // Group the submitted parameters by method ("get" / "post").
        $param = array();
        if (isset($_POST['param']) && is_array($_POST['param'])) {
            foreach ($_POST['param'] as $item) {
                if (!empty($item['method']) && !empty($item['name'])) {
                    $param[$item['method']][$item['name']] = isset($item['value']) ? $item['value'] : null;
                }
            }
        }

        if (!empty($param['get'])) {
            foreach ($param['get'] as $name => $value) {
                $url = sprintf("%s&%s=%s", $url, $name, $value);
            }
        }

        $post_data = null;
        if (!empty($param['post'])) {
            $post_data = $param['post'];
        }
    }
}
?>fetch_list(); //$myCurl->fetch_article(); myDebug::set_end(); } else { $_POST['url'] = 'http://www.oschina.net/code/tag/php?show=time&lang=&catalog=&p=%d'; $_POST['cache_path'] = 'oschina'; $_POST['fetch_item_query'] = '.code_list ul li .code_title > a'; $_POST['fetch_page_current'] = '.pager li.current'; } ?>页面爬虫