欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  php教程

CURL+DOM采集小样

程序员文章站 2022-06-17 16:29:45
...
个人感觉效率非常高的采集方式,但DOMDocument好像无法采集到带标签的内容,采集纯文本非常快.
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.
//获取店铺栏目
/**
 * Collect a shop's category links and store them in the goods_category table.
 *
 * The page is read from the local snapshot Offline/shops.htm and the shop and
 * mall ids are fixed to 1; $shop_url and $continue are accepted but are not
 * used by this implementation.
 *
 * @param string $shop_url Shop URL (currently unused).
 * @param int    $continue Currently unused.
 * @return bool false when the snapshot cannot be read, when categories for the
 *              shop already exist, or when the rule matches no nodes;
 *              true otherwise (the result of addAll() is not checked).
 */
public function getShopCate($shop_url="",$continue=0){
    $source=file_get_contents('Offline/shops.htm');
    if($source===false){
        return false; // snapshot missing or unreadable
    }
    $shops_id=1;
    $mall_id=1;
    $cate=M('goods_category')->where(array('shops_id'=>$shops_id))->find();
    if(!empty($cate)){
        return false; // categories for this shop were already collected
    }

    import('@.Tao.TaoHttp','','.php');
    $Http= new \TaoHttp();
    $shop_html=$Http->encoding($source);

    // Look up the query rule for category links of this mall, then run it against the page.
    $shop_category_rule=D('CollectGoods')->getRule($mall_id,'shop_category');
    import('@.Tao.Dom.Query','','.php');
    $Dom= new \Zend_Dom_Query($shop_html);
    $shop_category=$Dom->query($shop_category_rule);
    if(count($shop_category)==0){
        return false; // no category nodes matched
    }

    $result=array();
    foreach ($shop_category as $key => $value) {
        $result[$key]['url']=$value->getAttribute('href');
        $result[$key]['name']=trim($value->nodeValue);
    }
    // The first matched node is discarded.
    // NOTE(review): presumably a non-category link (e.g. "all goods") - confirm.
    unset($result[0]);

    $data=array();
    $time=time();
    foreach ($result as $value) {
        $data[]=array(
            'shops_id'=> $shops_id,
            'cate_name'=> $value['name'],
            'cate_url'=> $value['url'],
            'collect_time'=>$time,
        );
    }
    M('goods_category')->addAll($data);
    return true; // categories collected
}
/**
 * Collect the goods listed on one shop category page and store them in the goods table.
 *
 * Example request:
 * http://localhost/TaoGoods/index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
 *
 * @param int $cate_id Id of the goods_category row whose page is collected.
 * @return bool|null false when $cate_id is 0 or the category row is not found;
 *                   otherwise no value is returned and the outcome is reported
 *                   through $this->success() / $this->error().
 */
public function getShopGoods($cate_id=0){
    if($cate_id==0){return false;}
    $goods_time=M('goods')->where(array('cate_id'=>$cate_id))->getField('collect_time');

    // Goods for this category already exist: refuse to collect again.
    // NOTE(review): both branches report an error; this relies on $this->error()
    // ending the request, otherwise the second call also runs - confirm.
    // The window is 86400 * $this->day seconds while the message text says 15 days.
    if($goods_time){
        if($goods_time + 86400*$this->day > time()){
            $this->error('15天内请勿重复采集',U('index'));
        }
        $this->error('采集店铺栏目下货品已存在',U('index'));
    }

    $cate_data=M('goods_category')->find($cate_id);
    if(empty($cate_data)){
        return false; // unknown category: there is no URL to fetch
    }
    $shops_id=$cate_data['shops_id'];
    $cate_id=$cate_data['id'];
    $mall_id=$cate_data['mall_id'];

    // Fetch the category page and normalise its encoding.
    import('@.Tao.TaoHttp','','.php');
    $Http= new \TaoHttp();
    $source=$Http->get($cate_data['cate_url']);
    $shop_html=$Http->encoding($source);

    // One query per collected field, using the rule set of this mall.
    $cate_rule=D('CollectGoods')->getRule($mall_id);
    import('@.Tao.Dom.Query','','.php');
    $Dom= new \Zend_Dom_Query($shop_html);
    $cate_imgs=$Dom->query($cate_rule['shop_category_goods_img']);
    $cate_names=$Dom->query($cate_rule['shop_category_goods_name']);
    $cate_sales=$Dom->query($cate_rule['shop_category_goods_sale']);
    $cate_cprices=$Dom->query($cate_rule['shop_category_goods_cprice']);
    // Collection of 'shop_category_goods_sprice' is disabled.

    // The name matches drive the loop; the other result sets are read at the same index.
    // NOTE(review): assumes every query returns at least as many nodes as the name query - confirm.
    $num=count($cate_names);

    $time=time();
    $result=array();
    for ($i=0; $i<$num; $i++) {
        $result[$i]['goods_thumb']=$cate_imgs->bykey($i)->getAttribute('src');
        $result[$i]['goods_name']=$cate_names->bykey($i)->nodeValue;
        $result[$i]['goods_url']=$cate_names->bykey($i)->getAttribute('href');
        $result[$i]['goods_cprice']=$cate_cprices->bykey($i)->nodeValue;
        $result[$i]['goods_sale']=$cate_sales->bykey($i)->nodeValue;
        $result[$i]['mall_id']=$mall_id;
        $result[$i]['shops_id']=$shops_id;
        $result[$i]['cate_id']=$cate_id;
        $result[$i]['collect_time']=$time;
    }

    if(M('goods')->addAll($result)){
        $this->success('采集店铺栏目下货品成功',U('index'));
    }
}
/**
 * Normalise fetched page data to UTF-8 (converting GBK input).
 *
 * Input that is already valid UTF-8 (plain ASCII included) is returned as is.
 * Input detected as CP936 (the name mb_detect_encoding() reports for GBK) is
 * converted to UTF-8 and prefixed with a UTF-8 charset <meta> tag so that the
 * DOM parser reads the converted markup as UTF-8. Anything else is returned
 * unchanged.
 *
 * @param string $source Raw page data.
 * @return string The page data, converted to UTF-8 when it was detected as GBK.
 */
public function encoding($source){
    // GBK is listed first for detection and many valid UTF-8 byte sequences are
    // also valid GBK, so UTF-8 input is accepted here before detection is tried.
    if(mb_check_encoding($source, 'UTF-8')){
        return $source;
    }
    $encode = mb_detect_encoding($source, array("GBK","UTF-8","GB2312","BIG5"));
    if($encode=='CP936'){
        $converted=iconv("GBK", "UTF-8//IGNORE", $source);
        if($converted===false){
            return $source; // conversion failed: hand back the data untouched
        }
        // $meta lets the DOM parser determine the encoding of the converted markup.
        $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';
        $source=$meta.$converted;
    }
    return $source;
}
测试:只有这两个按钮能用,其他的都不能用
CURL+DOM采集小样
CURL+DOM采集小样
测试的话 可以将goods表清空 点击采集货品
sql文件在压缩包里

BY:悠悠山雨

CURL+DOM采集小样 Taogoods.zip ( 2.2 MB 下载:72 次 )

AD:真正免费,域名+虚机+企业邮箱=0元