完善的汉字转拼音php转换类
程序员文章站
2022-04-02 18:40:38
...
办法是利用矩阵:在双字节编码(如 GBK)下,每个汉字由两个字节组成,一个高位字节,一个低位字节,取值范围分别是 129-254 和 64-254
每个汉字拼音最长为8个字符,由此组成二维矩阵进行查询,弊端是无法解决多音字问题
/**
 * Converts double-byte encoded Chinese text to pinyin using a fixed-width
 * lookup table stored in a file.
 *
 * Table layout, as implied by the offset formula used below: one row per lead
 * byte starting at 129, each row holding 191 entries (trail bytes 64..254) of
 * 8 bytes each, followed by 2 extra bytes per row. Polyphonic characters
 * cannot be distinguished because there is exactly one entry per character.
 *
 * NOTE(review): the table file and the input are presumably GBK encoded -
 * confirm against the py.qdb file that ships with this class.
 */
class pinyin
{
    /* Truthy: load the whole table into memory once. Falsy: seek in the file for every lookup. */
    public $ismemorycache = 1;

    /* Truthy: emit only the first byte of each pinyin entry. */
    public $isfrist = 1;

    /* Path of the pinyin table file. */
    public $path = "py.qdb";

    /* In-memory copy of the table (memory mode only). */
    public $memorycache;

    /* Open file handle on the table (file mode only). */
    public $handle;

    /* Error messages collected so far; stays unset until the first error. */
    public $errormsgbox;

    /* Result of the most recent conversion. */
    public $result;

    /* Cache of already resolved entries, indexed as [lead byte][trail byte]. */
    public $array = array();

    /* Maps tone-marked vowels to their plain ASCII replacement. */
    public $n_t = array(
        "ā" => "a", "á" => "a", "ǎ" => "a", "à" => "a", "ɑ" => "a",
        "ō" => "o", "ó" => "o", "ǒ" => "o", "ò" => "o",
        "ē" => "e", "é" => "e", "ě" => "e", "è" => "e", "ê" => "e",
        "ī" => "i", "í" => "i", "ǐ" => "i", "ì" => "i",
        "ū" => "u", "ú" => "u", "ǔ" => "u", "ù" => "u",
        "ǖ" => "v", "ǘ" => "v", "ǚ" => "v", "ǜ" => "v", "ü" => "v"
    );

    /**
     * Conversion entry point.
     *
     * @param string|array $str        Text to convert, or an array of such texts.
     * @param mixed        $istonemark Truthy keeps tone marks; falsy replaces them via $n_t.
     * @param string       $suffix     Appended after every converted character (default: empty string).
     * @return string|array The converted text; an array (same keys) for array input.
     */
    public function chinesetopinyin($str, $istonemark = 0, $suffix = "")
    {
        $this->py($str, $istonemark, $suffix);
        return $this->result;
    }

    /**
     * Returns the result of the most recent conversion.
     */
    public function get()
    {
        return $this->result;
    }

    /**
     * Converts $str and stores the outcome in $this->result.
     *
     * @param string|array $str Text to convert, or an array of such texts.
     * @param mixed        $n   Truthy keeps tone marks; falsy replaces them via $n_t.
     * @param string       $s   Appended after every converted character.
     * @return string|array|false Converted text, an array of converted texts for
     *                            array input, or false when the table cannot be loaded.
     */
    public function py($str, $n = 0, $s = "")
    {
        // Arrays must be handled before any string function touches $str.
        if (is_array($str)) {
            $converted = array();
            foreach ($str as $key => $val) {
                $converted[$key] = $this->py($val, $n, $s);
            }
            $this->result = $converted;
            return $converted;
        }

        // Reset first so an empty or failed conversion never exposes a stale result.
        $this->result = "";
        $str = (string) $str;
        $strlength = strlen($str);
        if ($strlength == 0) {
            return "";
        }
        if (!$this->loadtable()) {
            return false;
        }

        $out = "";
        for ($i = 0; $i < $strlength; $i++) {
            $ord1 = ord($str[$i]);
            $ord2 = ($i + 1 < $strlength) ? ord($str[$i + 1]) : 0;
            // Only byte pairs that address a cell of the table are looked up;
            // anything else (ASCII, a truncated pair, an out-of-range trail
            // byte) is copied through unchanged.
            if ($ord1 > 128 && $ord1 < 255 && $ord2 >= 64 && $ord2 < 255) {
                $i++; // the trail byte has been consumed
                $strtrlen = $this->isfrist ? 1 : 8;
                $out .= (string) substr($this->lookup($ord1, $ord2), 0, $strtrlen) . $s;
            } else {
                $out .= $str[$i];
            }
        }

        if (!$n) {
            $out = strtr($out, $this->n_t);
        }
        $this->result = $out;
        return $out;
    }

    /**
     * Makes sure the table is available for the current mode (memory or file).
     *
     * @return bool False when the table file is missing or unreadable; an
     *              error message is recorded in that case.
     */
    private function loadtable()
    {
        if ($this->ismemorycache) {
            if ($this->memorycache) {
                return true;
            }
        } elseif (is_resource($this->handle)) {
            return true;
        }

        if (!file_exists($this->path)) {
            $this->addoneerrormsg(1, "拼音文件路径不存在");
            return false;
        }

        if ($this->ismemorycache) {
            $data = file_get_contents($this->path);
            if ($data === false) {
                $this->addoneerrormsg(2, "拼音文件无法读取");
                return false;
            }
            $this->memorycache = $data;
            return true;
        }

        $handle = fopen($this->path, "r");
        if ($handle === false) {
            $this->addoneerrormsg(2, "拼音文件无法读取");
            return false;
        }
        $this->handle = $handle;
        return true;
    }

    /**
     * Returns the trimmed table entry for one lead/trail byte pair, caching it
     * in $this->array.
     */
    private function lookup($ord1, $ord2)
    {
        if (!isset($this->array[$ord1][$ord2])) {
            // Row width: 191 entries of 8 bytes plus 2 extra bytes per row.
            $leng = ($ord1 - 129) * ((254 - 63) * 8 + 2) + ($ord2 - 64) * 8;
            if ($this->ismemorycache) {
                $raw = substr($this->memorycache, $leng, 8);
            } else {
                // fread() returns the full 8 bytes; fgets($h, 8) would stop
                // after 7 bytes or at a newline.
                $raw = fseek($this->handle, $leng) === 0 ? fread($this->handle, 8) : "";
            }
            $this->array[$ord1][$ord2] = trim((string) $raw);
        }
        return $this->array[$ord1][$ord2];
    }

    /**
     * Records one error message.
     *
     * @param int    $no     Error number.
     * @param string $reason Human readable reason.
     */
    public function addoneerrormsg($no, $reason)
    {
        $this->errormsgbox[] = "error:" . $no . "," . $reason;
    }

    /**
     * Echoes every recorded error message, each followed by two CRLF pairs.
     */
    public function showerrormsg()
    {
        if (!is_array($this->errormsgbox)) {
            return;
        }
        foreach ($this->errormsgbox as $val) {
            echo $val . "\r\n\r\n";
        }
    }

    /**
     * Closes the table file if it is open and prints any recorded errors.
     */
    public function __destruct()
    {
        if (is_resource($this->handle)) {
            fclose($this->handle);
        }
        if (is_array($this->errormsgbox)) {
            $this->showerrormsg();
        }
    }
}
之前遇见过这个难题,发现流传的代码都不怎么完善,汉字库总共有20k+的汉字,大多数的是拿几百个常用汉字打算糊弄过去,在火星文流传的今天,是不行的。
还有种读取词典然后转换的,每行一个汉字|拼音,这种弊端非常大,速度慢,耗费巨大内存,仅仅explode一下读入数组,再循环一次,就能耗费上百m的内存,如果一个单页面耗费上百m,负载稍微大点只能泪奔了。
永久地址:
转载随意~请带上教程地址吧^^
上一篇: linux中deb安装包如何安装
下一篇: 显卡在主机的什么位置