CURL+DOM采集小样
程序员文章站
2022-06-17 16:29:45
...
个人感觉这是一种效率非常高的采集方式:采集纯文本非常快,但DOMDocument好像无法采集到带标签的内容。
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.
测试的话,可以先将goods表清空,再点击“采集货品”。
sql文件在压缩包里
BY:悠悠山雨
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.
/**
 * Collect the category links of a shop and store them in `goods_category`.
 *
 * The shop page is read from the offline sample 'Offline/shops.htm', and the
 * shop and mall ids are hard-coded to 1. $shop_url and $continue are accepted
 * but not used by this implementation.
 *
 * @param string $shop_url Shop URL (currently unused).
 * @param int    $continue Currently unused.
 * @return bool True when the categories were inserted; false when categories
 *              for the shop already exist, the page cannot be read, the rule
 *              matches nothing usable, or the insert fails.
 */
public function getShopCate($shop_url="",$continue=0){
    $shops_id=1;
    $mall_id=1;

    // Categories for this shop were collected before: nothing to do.
    $cate=M('goods_category')->where(array('shops_id'=>$shops_id))->find();
    if(!empty($cate)){
        return false;
    }

    $source=file_get_contents('Offline/shops.htm');
    if($source===false){
        // The sample page could not be read.
        return false;
    }

    import('@.Tao.TaoHttp','','.php');
    $Http= new \TaoHttp();
    $shop_html=$Http->encoding($source);

    $shop_category_rule=D('CollectGoods')->getRule($mall_id,'shop_category');
    import('@.Tao.Dom.Query','','.php');
    $Dom= new \Zend_Dom_Query($shop_html);
    $shop_category=$Dom->query($shop_category_rule);
    if(count($shop_category)==0){
        // The rule matched no category link.
        return false;
    }

    $time=time();
    $data=array();
    foreach ($shop_category as $key => $value) {
        $data[$key]=array(
            'shops_id'=> $shops_id,
            'cate_name'=> trim($value->nodeValue),
            'cate_url'=> $value->getAttribute('href'),
            'collect_time'=>$time,
        );
    }
    // The first matched link is deliberately dropped, as in the original code
    // (presumably it is not a real category — confirm against the rule).
    unset($data[0]);
    $data=array_values($data);
    if(empty($data)){
        return false;
    }

    // Report the real outcome of the batch insert instead of always true.
    return M('goods_category')->addAll($data)!==false;
}
/**
 * Collect the goods listed on one shop category page and store them in `goods`.
 *
 * Example:
 * http://localhost/TaoGoods/index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
 *
 * @param int $cate_id Primary key of the `goods_category` row to collect.
 * @return false|null False when $cate_id is 0 or the category row does not
 *                    exist; otherwise nothing is returned and the outcome is
 *                    reported through $this->error() / $this->success().
 */
public function getShopGoods($cate_id=0){
    if($cate_id==0){return false;}

    // Refuse to collect again when goods of this category are already stored.
    // NOTE(review): relies on $this->error() ending the request — confirm.
    $goods_time=M('goods')->where(array('cate_id'=>$cate_id))->getField('collect_time');
    if($goods_time){
        if($goods_time + 86400*$this->day > time()){
            $this->error('15天内请勿重复采集',U('index'));
        }
        $this->error('采集店铺栏目下货品已存在',U('index'));
    }

    $cate_data=M('goods_category')->find($cate_id);
    if(empty($cate_data)){
        // Unknown category id: there is no URL to fetch.
        return false;
    }
    $shops_id=$cate_data['shops_id'];
    $cate_id=$cate_data['id'];
    $mall_id=$cate_data['mall_id'];

    import('@.Tao.TaoHttp','','.php');
    $Http= new \TaoHttp();
    $source=$Http->get($cate_data['cate_url']);
    $shop_html=$Http->encoding($source);

    $cate_rule=D('CollectGoods')->getRule($mall_id);
    import('@.Tao.Dom.Query','','.php');
    $Dom= new \Zend_Dom_Query($shop_html);
    $cate_imgs=$Dom->query($cate_rule['shop_category_goods_img']);
    $cate_names=$Dom->query($cate_rule['shop_category_goods_name']);
    $cate_sales=$Dom->query($cate_rule['shop_category_goods_sale']);
    $cate_cprices=$Dom->query($cate_rule['shop_category_goods_cprice']);

    // The name matches drive the loop; the other result sets are read at the
    // same index.
    $num=count($cate_names);
    $time=time();
    $result=array();
    for ($i=0; $i<$num; $i++) {
        $name_node=$cate_names->bykey($i);
        $result[$i]=array(
            'goods_thumb'=>$cate_imgs->bykey($i)->getAttribute('src'),
            'goods_name'=>$name_node->nodeValue,
            'goods_url'=>$name_node->getAttribute('href'),
            'goods_cprice'=>$cate_cprices->bykey($i)->nodeValue,
            'goods_sale'=>$cate_sales->bykey($i)->nodeValue,
            'mall_id'=>$mall_id,
            'shops_id'=>$shops_id,
            'cate_id'=>$cate_id,
            'collect_time'=>$time,
        );
    }

    if(M('goods')->addAll($result)){
        $this->success('采集店铺栏目下货品成功',U('index'));
    }
}
/**
 * Normalise downloaded page data to UTF-8 (GBK / GB2312 input is converted).
 *
 * Input that is already valid UTF-8 is returned untouched. Input detected as
 * GBK or GB2312 is converted with iconv and prefixed with a UTF-8 charset
 * <meta> tag so a DOM parser reads the converted markup as UTF-8. Any other
 * input, or input iconv cannot convert, is returned unchanged.
 *
 * @param string $source Raw page content.
 * @return string The page content, as UTF-8 where conversion was possible.
 */
public function encoding($source){
    // UTF-8 byte sequences are frequently also valid GBK, and GBK is listed
    // first for detection, so check UTF-8 explicitly to avoid re-encoding it.
    if(mb_check_encoding($source,'UTF-8')){
        return $source;
    }

    $encode = mb_detect_encoding($source, array("GBK","UTF-8","GB2312","BIG5"));
    // mbstring reports GBK as 'CP936' and GB2312 as 'EUC-CN'; GBK is a
    // superset of GB2312, so both are converted from GBK.
    if($encode==='CP936' || $encode==='EUC-CN'){
        $converted=iconv("GBK", "UTF-8//IGNORE", $source);
        if($converted===false){
            // Conversion failed: hand back the original bytes.
            return $source;
        }
        // $meta lets the DOM parser determine the encoding.
        // NOTE(review): the published listing had an empty string here; the
        // tag looks to have been stripped from the article — confirm.
        $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';
        $source=$meta.$converted;
    }
    return $source;
}
测试:只有这两个按钮能用,其他的都不能用。测试的话,可以先将goods表清空,再点击“采集货品”。
sql文件在压缩包里
BY:悠悠山雨
Taogoods.zip ( 2.2 MB 下载:72 次 )
AD:真正免费,域名+虚机+企业邮箱=0元