本帖最后由 qq_25293153 于 2015-01-13 15:42:18 编辑。
使用 file_get_contents 采集一个页面的数据,获取到的数据是乱码。已经用 mb_detect_encoding 检测过编码,检测结果是 UTF-8,我的页面编码也是 UTF-8,但还是显示乱码,不知道为什么。
// Fetch a remote page and echo it as UTF-8, converting from the detected
// source encoding when that encoding is something other than UTF-8.
$url="xxx";
$opts = array(
    'http'=>array(
        'user_agent' => "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
    )
);
$context = stream_context_create($opts);
$neirong = file_get_contents($url, false, $context);
if ($neirong === false) {
    // file_get_contents() returns false on failure; stop here instead of
    // handing a boolean to the mbstring functions below.
    die('Failed to fetch ' . $url);
}
header("content-Type: text/html; charset=Utf-8");
// ob_end_flush() raises a notice when no output buffer is active, so only
// flush when there is one.
if (ob_get_level() > 0) {
    ob_end_flush();
}
// Strict mode: only report an encoding in which the whole payload is valid.
$encode = mb_detect_encoding($neirong, array("ASCII","UTF-8","GB2312","GBK","BIG5"), true);
echo $encode."\n";
// mb_detect_encoding() returns false when no candidate matches; in that case
// the payload is emitted unchanged rather than converted from an unknown encoding.
if ($encode !== false && $encode !== "UTF-8") {
    $neirong = mb_convert_encoding($neirong, "UTF-8", $encode);
}
echo $neirong;
$encode 输出:utf-8
$neirong 输出是乱码
我的页面编码是utf-8
------解决思路----------------------
你在輸出的 HTML 中加入聲明 UTF-8 編碼的 meta 標籤(例如 <meta charset="utf-8">)。
它的源數據是做了些轉換的,我那個程序已經把它轉換回來了。
我把採集的部分也寫出來,直接運行就可以了。
//http://www.ziyouge.com/conbdhekbefiab
//http://www.ziyouge.com/zy/4/4980/1333249.html
// Fetch the page with cURL, sending a browser-like User-Agent and a Referer.
$url = 'http://www.ziyouge.com/conbdhekbefiab';
$headers = array();
$headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36';
// Flatten the name => value map into the "Name:value" lines cURL expects.
$headerArr = array();
foreach ($headers as $n => $v) {
    $headerArr[] = $n .':' . $v;
}
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HTTPHEADER, $headerArr); // custom request headers
curl_setopt($ch, CURLOPT_REFERER, 'http://www.ziyouge.com/'); // spoofed referrer
$content = curl_exec($ch);
$error = curl_error($ch);
curl_close($ch);
// Check for failure before touching the body: curl_exec() returns false on error.
if ($content === false || $error !== '') {
    die($error);
}
// Drop the first three bytes of the response.
// NOTE(review): presumably a UTF-8 byte-order mark (EF BB BF) - confirm against the live response.
$content = substr($content, 3);
// Decode the fetched payload.
// Walk $content byte by byte: every complete 3-byte UTF-8 sequence (lead byte
// 0xE0-0xEF) is passed through change(); every other byte is copied verbatim.
// Byte-wise strlen()/substr() are used on purpose: the offsets here are byte
// offsets, and the mb_* functions would count characters whenever the mbstring
// internal encoding is UTF-8.
$result = '';
$str_length = strlen($content);
$i = 0;
while ($i < $str_length) {
    $ascnum = ord($content[$i]);
    // Require two more bytes so a truncated sequence at the end is copied
    // through instead of being handed to change() as invalid UTF-8.
    if ($ascnum >= 0xE0 && $ascnum <= 0xEF && $i + 2 < $str_length) {
        $result .= change(substr($content, $i, 3));
        $i += 3;
    } else {
        $result .= $content[$i];
        $i += 1;
    }
}
// NOTE(review): this empty echo looks like the remains of an HTML tag lost
// when the post was extracted - confirm what was meant to be printed here.
echo '';
echo $result;
// Decode a single obfuscated character.
/**
 * Map one obfuscated UTF-8 character back to its real character.
 *
 * Each 16-bit code unit of the input is passed through decrypt() and the
 * resulting "%uXXXX" escape string is turned back into UTF-8 by unescape().
 * Characters in the ignore list are returned untouched.
 *
 * @param string $str A single UTF-8 encoded character.
 * @return string The decoded character, or $str unchanged when it is ignored,
 *                empty, or cannot be converted.
 */
function change($str){
    // NOTE(review): the loop that calls this only passes 3-byte characters, so the
    // ASCII entries below can never match; they may originally have been full-width
    // punctuation - confirm against the original script.
    $ignore = array('“','”','!','…',':',',',',');
    if (in_array($str, $ignore, true)) {
        return $str;
    }
    // Explicit big-endian output makes the hex digits independent of the
    // platform's default byte order for plain "UCS-2".
    $utf16 = iconv('UTF-8', 'UCS-2BE', $str);
    if ($utf16 === false || $utf16 === '') {
        // Invalid or empty input: hand it back rather than decoding garbage.
        return $str;
    }
    $unistr = '';
    foreach (str_split($utf16, 2) as $unit) {
        // bin2hex() of one 2-byte unit is already the zero-padded 4-digit hex code.
        $unistr .= '%u' . decrypt(bin2hex($unit));
    }
    return unescape($unistr);
}
// Decrypt one code unit.
/**
 * Subtract 100 (decimal) from a hexadecimal value and return the result as
 * lowercase hex, left-padded with zeros to at least four digits.
 *
 * @param string $d Hexadecimal digits.
 * @return string Hexadecimal digits of the shifted value.
 */
function decrypt($d){
    $shifted = hexdec($d) - 100;
    return str_pad(dechex($shifted), 4, '0', STR_PAD_LEFT);
}
// Convert escape sequences to UTF-8 text.
/**
 * Decode JavaScript-style escape sequences into UTF-8.
 *
 * "%uXXXX" sequences are read as 16-bit code points and encoded as one, two
 * or three UTF-8 bytes; "%XX" sequences are decoded with urldecode(); every
 * other byte is copied unchanged.
 *
 * @param string $str Text containing "%uXXXX" and/or "%XX" escapes.
 * @return string The decoded UTF-8 string.
 */
function unescape($str) {
    $ret = '';
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // The bounds check keeps a trailing "%" from reading past the end.
        if ($str[$i] === '%' && $i + 1 < $len && $str[$i + 1] === 'u') {
            $val = hexdec(substr($str, $i + 2, 4));
            if ($val < 0x80) {
                // Single-byte range, including U+007F.
                $ret .= chr($val);
            } elseif ($val < 0x800) {
                $ret .= chr(0xc0 | ($val >> 6))
                    . chr(0x80 | ($val & 0x3f));
            } else {
                $ret .= chr(0xe0 | ($val >> 12))
                    . chr(0x80 | (($val >> 6) & 0x3f))
                    . chr(0x80 | ($val & 0x3f));
            }
            // Skip the rest of "%uXXXX"; the loop's own $i++ consumes the sixth byte.
            $i += 5;
        } elseif ($str[$i] === '%') {
            $ret .= urldecode(substr($str, $i, 3));
            $i += 2;
        } else {
            $ret .= $str[$i];
        }
    }
    return $ret;
}
?>
声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。
相关文章
相关视频
网友评论
文明上网理性发言,请遵守 新闻评论服务协议
我要评论