利用 cURL 和正则表达式实现的一个 PHP 蜘蛛抓取器

// Shared table of reusable regex pieces.
// At file scope `global` is a no-op; it only has an effect when this file is
// included from inside a function, where it binds $std to the global variable.
global $std;

$std = array(

// Character-class fragment for URL characters. It has no delimiters, so it is
// presumably meant to be embedded in a larger pattern rather than used alone.
'url' => '[0-9a-zA-Z\.\:\-\/%_#;&]+',

// NOTE(review): only the inline flags (case-insensitive, dot-all) are left, so
// this matches the empty string. The pattern body looks to have been lost
// (probably an HTML <img> expression) - restore from the upstream source.
'img' => '/(?is)/',
);

复制代码

/**

* test.php

*

* @author xzfred
* @copyright 2009 fengone.com

* @created 2010-12-07 .

* @version $Id: php.php 3 2008-10-10 07:49:21Z fred $

* SVNPath $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $

*/

/*

include_once "std.php";

include_once "lady_163_com.php";

*/

include_once $GLOBALS['g_dir_core'] . "get.php";

//================================================================================

include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';

// Detail-page test: download one article, run it through the parser built from
// $site, print the first 'tag1' field between separator lines, then dump
// the full parse result.
$banner = "\n\n\n ===================\n";
$detailUrl = "http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml";

$obj = new FcHtmlParse($site);
$detailSource = file_get_contents($detailUrl);
$c = $obj->parse($detailSource);

echo $banner, $c['field']['tag1'][0], $banner;

var_dump($c);

// exit() ends the script here, so the list test below never runs while this
// call is in place.
exit();

// List-page test: same flow, using the parser built from $site_list.
$listUrl = "http://tuku.ent.china.com/fun/html/3569_1.html";

$obj = new FcHtmlParse($site_list);
$listSource = file_get_contents($listUrl);
$c = $obj->parse($listSource);

var_dump($c);

exit();

/*

$obj = new FcHtmlGet($site);

$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');

var_dump($c);

$obj = new FcHtmlGet($site);

$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');

var_dump($c);

$obj = new FcHtmlParse($site);

$img_obj = new FcHtmlImgUpload($site);

$data = file_get_contents("e:/b.html");

$c = $obj->parse($data);

$ic = $img_obj->upload($c['tag']['tag1'][0]);

var_dump($ic);

$data = file_get_contents("e:/a.html");

$c = $obj->parse($data);

$ic = $img_obj->upload($c['tag']['tag1'][0]);

var_dump($ic);

*/

//var_dump($c['tag']['tag1']);

复制代码

include_once $GLOBALS['g_dir_core'] . 'host/std.php';

// Extraction rules for article (detail) pages on tuku.ent.china.com.
// NOTE(review): several patterns below look like they lost their HTML
// delimiters when this code was copied (empty bodies, `[^]*?`, `$图$`).
// Treat them as placeholders and restore them from the upstream source.
// How each key is consumed is not visible here; the meanings noted below are
// inferred from key names and existing comments - confirm against the parser.
$site = array(

'aname' => '中华网娱乐图库',

'domain' => 'tuku.ent.china.com',

// The value is descriptive placeholder text ("directory name, used to match
// body text that differs by directory"), not a real directory name.
'dirname' => '目录名称，用于匹配基于目录不同的正文',

'gettype' => 'default',

// Fetch the main document

// NOTE(review): a lazy capture with nothing after it only ever matches the
// empty string; the surrounding markup was probably stripped.
'creg' => '/(?si)(.*?)/',
'code' => 'utf-8',

// The value is descriptive placeholder text ("regex for fetching
// subdirectories"), not an actual pattern.
'sub' => '获取子目录正则',

'content' => 'tag1',

'img_upload'=> array('tag1' => ''),

// Next page

'reg_next' => '/(?is)下一页/',
// NOTE(review): flags only, empty body - the pattern content appears lost.
'key0' => '/(?is)/',

'key0_ap' => array(array(',', '|'), ' '),

'tag0' => '/(?is)title="([^"]*?)"/',

// Pair of (patterns, replacements): five patterns, each paired with an empty
// replacement. Presumably applied to the 'tag0' match after extraction - confirm.
'tag0_arp' => array(

array(

// NOTE(review): `$` is an end-of-subject anchor, so `$图$` looks unable to
// match anything; literal bracket characters may have been intended.
'/(?is)$图$/',

'/(?is)\"/',

'/(?is)独家：/',

'/(?is)独家:/',

'/(?is)(《|》)/',

),

array(

'', '', '', '', '',

)

),

// NOTE(review): same empty-capture problem as 'creg'.
'tag1' => '/(?is)(.*?)/',
// Pair of (patterns, replacements), presumably applied before the 'tag1'
// extraction - confirm.
// NOTE(review): the replacement refers to $1 and $3 but the pattern has no
// capture groups, so the original pattern was evidently longer.
'tag1_brp' => array(

array(

'/(?is)\s*\s*/'

),

array(

'

$1

$3
'

)

),

// Pair of (patterns, replacements), presumably applied after the 'tag1'
// extraction - confirm.
// NOTE(review): in the first pattern `[^]*?)/` opens a character class that is
// never closed, so PCRE should reject it as written.
'tag1_arp' => array(

array(

'/(?is)
([^]*?)/',

'/(?is)\<br\/\>/',

),

array(

'
$1

',

'',

)

),

'strip' => array('tag1' => ''),

// Screen name (author)

// NOTE(review): unterminated character class `[^]*?)/` - see 'tag1_arp'.
'tag3' => '/(?is)([^]*?)/',
'tag4' => '/(?is)(中华网)/'

);

// Maps output field names (left) to the keys of $site whose patterns supply
// them (right). Not referenced elsewhere in the visible code - presumably read
// by the crawler core; confirm.
$map = array(

'tag' => 'key0',

'title' => 'tag0',

'content' => 'tag1',

'author' => 'tag3',

'source' => 'tag4'

);

// Extraction rules for list (index) pages on tuku.ent.china.com.
// NOTE(review): as with the detail-page rules, several patterns look like
// they lost their HTML delimiters when this code was copied; restore them
// from the upstream source before relying on them.
$site_list = array(

'aname' => '中华网娱乐图库',

'domain' => 'tuku.ent.china.com',

'gettype' => 'default',

// NOTE(review): a lazy capture between bare newlines; the enclosing markup
// was probably stripped.
'creg' => '/(?si)
(.*?)
/',
'code' => 'utf-8',

'reg_next' => '/(?si)

下一页/',
// Link

// NOTE(review): `[^]*?/` opens a character class that is never closed, so PCRE
// should reject this pattern as written.
'tag0' => '/(?is)
.*?[^]*?/',
// Pair of (patterns, replacements): rewrites ".htm" to ".xml", presumably on
// the link before it is used - confirm.
// NOTE(review): the pattern is unanchored, so a link ending in ".html" would
// become ".xmll"; fine only if list links end in ".htm".
'tag0_brp' => array(

array(

'/(?is)\.htm/',

),

array(

'.xml'

)

),

// Title

// NOTE(review): unterminated character class `[^]*?)/` - see 'tag0'.
'tag1' => '/(?is)
.*?([^]*?)/',
// Pair of (patterns, replacements): five patterns, each paired with an empty
// replacement. Presumably applied to the 'tag1' match after extraction - confirm.
'tag1_arp' => array(

array(

// NOTE(review): `$` is an end-of-subject anchor, so `$图$` looks unable to
// match anything; literal bracket characters may have been intended.
'/(?is)$图$/',

'/(?is)\"/',

'/(?is)独家：/',

'/(?is)独家:/',

'/(?is)(《|》)/',

),

array(

'', '', '', '', '',

)

),

);

// Maps list-page output field names (left) to the keys of $site_list whose
// patterns supply them (right). Not referenced elsewhere in the visible code -
// presumably read by the crawler core; confirm.
$list_map = array(

'url' => 'tag0',

'title' => 'tag1',

);

// Empty rule set; by its name presumably the rules for sub-list pages, none
// defined for this host - confirm.
$site_list_sub = array();

复制代码

相关标签：利用curl，正则表达式做的一个php蜘蛛抓取器

上一篇：关于Warning: mysql_fetch_array(): supplied argument is not a valid MySQL result问题

下一篇：你如何理解 HTML5 的 section？会在什么场景使用？为什么这些场景使用 section 而不是 div？