欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  后端开发

基于Snoopy的PHP近似完美获取网站编码

程序员文章站 2022-04-21 12:35:08
...
基于Snoopy的PHP近似完美获取网站编码
用于 PHP 爬虫,获取网站编码的准确率为 99.9%,还有部分网站的编码不能获取,求大牛完善。
代码来源: 站云网 www.siteyun.com
使用前需要先到网上下载 Snoopy.class.php。
调用方法($go 为本类的实例,原文中的实例化代码似已丢失): echo $go->getCharset();

[code]url=$url; } //打开网站 private function open($url) { if($this->request!==null) { if($this->request->status==200) { return true; } else { return false; } } else { $this->request=new Snoopy(); $this->request->fetch($url); if($this->request->status==200) { $this->request->results=strtolower($this->request->results); $charset=$this->getCharset(); if($charset!="utf-8") { if($charset=="windows-1252") { $this->request->results=$this->uni_decode($this->request->results); } else { $this->request->results=mb_convert_encoding($this->request->results,"UTF-8",$charset); } } return true; } else { return false; } } } //获取网站title,keywords,description public function getWebinfo() { $info=array( 'title'=>'', 'keywords'=>'', 'desc'=>'', 'ip'=>'' ); if(!$this->open($this->url)){return $info;exit;} // print_r($this->request->results);exit; preg_match('/([^>]*)/si', $this->request->results, $titlematch ); if (isset($titlematch) && is_array($titlematch) && count($titlematch) > 0) { $info['title'] = strip_tags($titlematch[1]); } preg_match_all('/"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $this->request->results, $match); $ft=0; foreach($match[1] as $mt) { if($mt=="keywords" || $mt=="description") { $ft=1; } } if($ft==0) { preg_match_all('/"]*)"?[\s]*name="?' . 
'([^>"]*)"?[\s]*[\/]?[\s]*>/si', $this->request->results, $match); if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[2]; $values = $match[1]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0, $limiti=count($names); $i htmlentities($originals[$i]), 'value' => $values[$i] ); } } } } else { if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[1]; $values = $match[2]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0, $limiti=count($names); $i htmlentities($originals[$i]), 'value' => $values[$i] ); } } } } $result = array ( 'metaTags' => $metaTags ); if(isset($result['metaTags']['keywords']['value'])) { $info['keywords']=$result['metaTags']['keywords']['value']; } else { $info['keywords']=""; } if(isset($result['metaTags']['description']['value'])) { $info['desc']=$result['metaTags']['description']['value']; } else { $info['desc']=""; } $domain=preg_replace('/http\:\/\//si', '', $this->url); $ip=@gethostbyname($domain); $ip_arr=explode(".", $ip); if(count($ip_arr)==4) { $info['ip']=$ip; } return $info; } public function t($string,$o) { for($i=0;$i<strlen if continue break return strtolower function uni_decode json_decode create_function iconv public getcharset>open($this->url)){return false;exit;} //首先从html获取编码 preg_match("/<meta.>request->results,$temp) ? strtolower($temp[1]):""; if($temp[1]!="") { if(in_array($temp[1], $this->charset_arr)) { if($temp[1]=="gb2312") { $tmp_charset=$this->t($this->request->results,$temp[1]); if($tmp_charset==$temp[1]) { return $temp[1]; } } else { return $temp[1]; } } } if(!empty($this->request->headers)) { //从header中获取编码 $hstr=strtolower(implode("</meta.></strlen>