欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页  >  后端开发

权重计算,稍加修改亦可用于分词,词频统计,全文和spam检测等

程序员文章站 2022-06-12 14:50:48
...
效率非常可观,不过你要是改成其他用途,那效率我就不保证了
  1. /* vim: set expandtab tabstop=4 shiftwidth=4: */
  2. // +------------------------------------------------------------------------
  3. // Name : 权重计算
  4. // Description: 稍加修改,亦可用于分词,词频统计,全文检索和垃圾检测
  5. // Date : 2013/12/16 08:51
  6. // Authors : latel
  7. // +------------------------------------------------------------------------
  8. //
  9. /*外部调用示例*/
  10. /*
  11. $aItems = array(
  12. 'chinaisbig',
  13. 'whichisnot',
  14. 'totalyrightforme',
  15. );
  16. $aTable = array(
  17. 'china,is|small',
  18. 'china,big|me',
  19. 'china,is|big,which|not,me',
  20. 'totaly|right,for,me',
  21. );
  22. $oWeight = new weight;
  23. $oWeight->newItems($aItems);
  24. $oWeight->newTable($aTable); $aResult = $oWeight->getShow();
  25. */
  /**
   * Keyword weighting backed by a byte-level trie.
   *
   * A "table" is a list of lines; each line is a set of words separated by
   * ',' or '|'. Every word is inserted into a trie and tagged with the key of
   * the table line it came from. An "item" is a string that is scanned for
   * those words; for every item the class reports which table line(s)
   * collected the most word hits.
   *
   * Matching works on bytes (str_split / string offsets), not on multibyte
   * characters, and it accepts the SHORTEST word found at each position: as
   * soon as a node that terminates a word is reached, the hit is recorded and
   * scanning restarts from the root at the next byte.
   */
  class weight {
      /**
       * Trie nodes. Index 0 is the root. Each node maps a single byte to the
       * index of its child node; the reserved key 'acc' holds the list of
       * table keys whose words end at that node.
       */
      protected $aDict = array(array());

      /** Items (strings) to be scored. */
      protected $aItems = array();

      /** Rule that produced the cached $aShow, or null when nothing is cached. */
      protected $sLastRule;

      /** Per-item hit counts (table key => number of hits) from the last genShow(). */
      protected $aMatchs = array();

      /** Cached result of the last genShow(). */
      protected $aShow = array();

      /**
       * Drop every cached result so the next getShow() recomputes.
       */
      private function init() {
          // Reset to empty values instead of unset(): the properties must keep
          // existing so later reads never touch an undefined property.
          $this->aShow = array();
          $this->aMatchs = array();
          $this->sLastRule = null;
      }

      /**
       * Replace the items to be scored.
       *
       * @param mixed $mItems A single item or an array of items.
       */
      public function newItems($mItems) {
          $this->aItems = is_array($mItems) ? $mItems : array($mItems);
          $this->init();
      }

      /**
       * Replace the lookup table and rebuild the trie from it.
       *
       * The previous dictionary is discarded first; otherwise words from an
       * earlier table would keep matching under keys that collide with the
       * keys of the new table.
       *
       * @param array $aTable Table lines; words are separated by ',' or '|'.
       */
      public function newTable(array $aTable) {
          $this->aDict = array(array());
          foreach ($aTable as $mTableKey => $sTableLine) {
              // '|' and ',' are equivalent separators.
              $aWords = explode(',', str_replace('|', ',', $sTableLine));
              foreach ($aWords as $sWord) {
                  $this->genDict($sWord, $mTableKey);
              }
          }
          $this->init();
      }

      /**
       * Return the result for the given rule, using the cache when possible.
       *
       * @param string $sRule Selection rule; only 'max' is implemented.
       * @return array Item key => list of table keys selected by the rule.
       *               Empty array when there are no items or no dictionary.
       */
      public function getShow($sRule = 'max') {
          // The root node having no children means no word was ever inserted.
          if (empty($this->aItems) || empty($this->aDict[0])) {
              return array();
          }
          if (empty($this->aShow) || $sRule !== $this->sLastRule) {
              return $this->genShow($sRule);
          }
          return $this->aShow;
      }

      /**
       * Compute (and cache) the result for every item.
       *
       * @param string $sRule Selection rule; only 'max' is implemented. Any
       *                      other rule yields no entries.
       * @return array Item key => list of table keys with the highest hit
       *               count for that item (an empty list when nothing matched).
       */
      public function genShow($sRule) {
          $aShow = array();
          $aMatchs = array();
          foreach ($this->aItems as $mItemKey => $sItem) {
              $aCounts = array_count_values($this->matchWord($sItem));
              $aMatchs[] = $aCounts;
              switch ($sRule) {
                  case 'max':
                      // max() must not be called on an empty array: it warns
                      // on PHP 7 and throws ValueError on PHP 8.
                      $aShow[$mItemKey] = empty($aCounts)
                          ? array()
                          : array_keys($aCounts, max($aCounts), true);
                      break;
              }
          }
          $this->aShow = $aShow;
          $this->aMatchs = $aMatchs;
          // Remember the rule so getShow() can actually reuse the cache.
          $this->sLastRule = $sRule;
          return $aShow;
      }

      /**
       * Insert one word into the trie and tag its final node with a table key.
       *
       * @param mixed $mWord Word to insert; cast to string. Empty words are ignored.
       * @param mixed $iKey  Table key recorded on the word's final node.
       */
      private function genDict($mWord, $iKey = '') {
          $sWord = (string) $mWord;
          if ($sWord === '') {
              // Produced by inputs such as 'a,,b'; there is nothing to index.
              return;
          }
          $iCur = 0; // current node, starting at the root
          foreach (str_split($sWord) as $sChar) {
              if (!isset($this->aDict[$iCur][$sChar])) {
                  // Nodes are only ever appended, so count() is the next free index.
                  $iNext = count($this->aDict);
                  $this->aDict[$iNext] = array();
                  $this->aDict[$iCur][$sChar] = $iNext;
              }
              $iCur = $this->aDict[$iCur][$sChar];
          }
          $this->aDict[$iCur]['acc'][] = $iKey;
      }

      /**
       * Scan a string and collect the table keys of every word found in it.
       *
       * @param string $sLine Text to scan.
       * @return array Table keys, one entry per (word hit, table line) pair,
       *               in the order the words were found.
       */
      public function matchWord($sLine) {
          // Sentinel byte: guarantees a final mismatch, so a partial match that
          // runs into the end of the string still backtracks and retries from
          // the following start positions.
          $sLine = (string) $sLine . "\0";
          $iLen = strlen($sLine);
          $aReturn = array();
          $iCur = 0;    // current trie node
          $iStart = 0;  // where the current match attempt began
          $iOffset = 0; // byte being examined
          while ($iOffset < $iLen) {
              $sChar = $sLine[$iOffset];
              if (isset($this->aDict[$iCur][$sChar])) {
                  $iCur = $this->aDict[$iCur][$sChar];
                  if (isset($this->aDict[$iCur]['acc'])) {
                      // Word complete: record its table keys and start a new
                      // attempt right after it.
                      $aReturn = array_merge($aReturn, $this->aDict[$iCur]['acc']);
                      $iStart = $iOffset + 1;
                      $iCur = 0;
                  }
                  ++$iOffset;
              } else {
                  // Mismatch: retry from the byte after the failed attempt's start.
                  $iCur = 0;
                  ++$iStart;
                  $iOffset = $iStart;
              }
          }
          return $aReturn;
      }
  }
  116. ?>
复制代码