java使用Nagao算法实现新词发现、热门词的挖掘

程序员文章站 2024-03-05 09:04:12

采用nagao算法统计各个子字符串的频次，然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息（内部凝聚度）。名词解释： nagao算法：...

采用nagao算法统计各个子字符串的频次，然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息（内部凝聚度）。

名词解释：

nagao算法：一种快速的统计文本里所有子字符串频次的算法。详细算法可见
词频：该字符串在文档中出现的次数。出现次数越多越重要。
左右邻个数：文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多，说明字符串成词概率越高。
左右熵：文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标，有一定区别。
交互信息：每次将某字符串分成两部分，左半部分字符串和右半部分字符串，计算其同时出现的概率除于其各自独立出现的概率，最后取所有的划分里面概率最小值。这个值越大，说明字符串内部凝聚度越高，越可能成词。

算法具体流程：

1. 将输入文件逐行读入，按照非汉字（[^\u4e00-\u9fa5]+）以及停词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”，
分成一个个字符串，代码如下：
string[] phrases = line.split("[^\u4e00-\u9fa5]+|["+stopwords+"]");
停用词可以修改。
2. 获取所有切分后的字符串的左子串和右子串，分别加入左、右ptable
3. 对ptable排序，并计算ltable。ltable记录的是，排序后的ptable中，下一个子串同上一个子串具有相同字符的数量
4. 遍历ptable和ltable，即可得到所有子字符串的词频、左右邻
5. 根据所有子字符串的词频、左右邻结果，输出字符串的词频、左右邻个数、左右熵、交互信息

1. nagaoalgorithm.java

package com.algo.word;
 
import java.io.bufferedreader;
import java.io.bufferedwriter;
import java.io.filenotfoundexception;
import java.io.filereader;
import java.io.filewriter;
import java.io.ioexception;
import java.util.arraylist;
import java.util.arrays;
import java.util.collections;
import java.util.hashmap;
import java.util.hashset;
import java.util.list;
import java.util.map;
import java.util.set;
 
public class nagaoalgorithm {
   
  private int n;
   
  private list<string> leftptable;
  private int[] leftltable;
  private list<string> rightptable;
  private int[] rightltable;
  private double wordnumber;
   
  private map<string, tfneighbor> wordtfneighbor;
   
  private final static string stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";
   
  private nagaoalgorithm(){
    //default n = 5
    n = 5;
    leftptable = new arraylist<string>();
    rightptable = new arraylist<string>();
    wordtfneighbor = new hashmap<string, tfneighbor>();
  }
  //reverse phrase
  private string reverse(string phrase) {
    stringbuilder reversephrase = new stringbuilder();
    for (int i = phrase.length() - 1; i >= 0; i--)
      reversephrase.append(phrase.charat(i));
    return reversephrase.tostring();
  }
  //co-prefix length of s1 and s2
  private int coprefixlength(string s1, string s2){
    int coprefixlength = 0;
    for(int i = 0; i < math.min(s1.length(), s2.length()); i++){
      if(s1.charat(i) == s2.charat(i))  coprefixlength++;
      else break;
    }
    return coprefixlength;
  }
  //add substring of line to ptable
  private void addtoptable(string line){
    //split line according to consecutive none chinese character
    string[] phrases = line.split("[^\u4e00-\u9fa5]+|["+stopwords+"]");
    for(string phrase : phrases){
      for(int i = 0; i < phrase.length(); i++)
        rightptable.add(phrase.substring(i));
      string reversephrase = reverse(phrase);
      for(int i = 0; i < reversephrase.length(); i++)
        leftptable.add(reversephrase.substring(i));
      wordnumber += phrase.length();
    }
  }
   
  //count ltable
  private void countltable(){
    collections.sort(rightptable);
    rightltable = new int[rightptable.size()];
    for(int i = 1; i < rightptable.size(); i++)
      rightltable[i] = coprefixlength(rightptable.get(i-1), rightptable.get(i));
     
    collections.sort(leftptable);
    leftltable = new int[leftptable.size()];
    for(int i = 1; i < leftptable.size(); i++)
      leftltable[i] = coprefixlength(leftptable.get(i-1), leftptable.get(i));
     
    system.out.println("info: [nagao algorithm step 2]: having sorted ptable and counted left and right ltable");
  }
  //according to ptable and ltable, count statistical result: tf, neighbor distribution
  private void counttfneighbor(){
    //get tf and right neighbor
    for(int pindex = 0; pindex < rightptable.size(); pindex++){
      string phrase = rightptable.get(pindex);
      for(int length = 1 + rightltable[pindex]; length <= n && length <= phrase.length(); length++){
        string word = phrase.substring(0, length);
        tfneighbor tfneighbor = new tfneighbor();
        tfneighbor.incrementtf();
        if(phrase.length() > length)
          tfneighbor.addtorightneighbor(phrase.charat(length));
        for(int lindex = pindex+1; lindex < rightltable.length; lindex++){
          if(rightltable[lindex] >= length){
            tfneighbor.incrementtf();
            string cophrase = rightptable.get(lindex);
            if(cophrase.length() > length)
              tfneighbor.addtorightneighbor(cophrase.charat(length));
          }
          else break;
        }
        wordtfneighbor.put(word, tfneighbor);
      }
    }
    //get left neighbor
    for(int pindex = 0; pindex < leftptable.size(); pindex++){
      string phrase = leftptable.get(pindex);
      for(int length = 1 + leftltable[pindex]; length <= n && length <= phrase.length(); length++){
        string word = reverse(phrase.substring(0, length));
        tfneighbor tfneighbor = wordtfneighbor.get(word);
        if(phrase.length() > length)
          tfneighbor.addtoleftneighbor(phrase.charat(length));
        for(int lindex = pindex + 1; lindex < leftltable.length; lindex++){
          if(leftltable[lindex] >= length){
            string cophrase = leftptable.get(lindex);
            if(cophrase.length() > length)
              tfneighbor.addtoleftneighbor(cophrase.charat(length));
          }
          else break;
        }
      }
    }
    system.out.println("info: [nagao algorithm step 3]: having counted tf and neighbor");
  }
  //according to wordtfneighbor, count mi of word
  private double countmi(string word){
    if(word.length() <= 1)  return 0;
    double coprobability = wordtfneighbor.get(word).gettf()/wordnumber;
    list<double> mi = new arraylist<double>(word.length());
    for(int pos = 1; pos < word.length(); pos++){
      string leftpart = word.substring(0, pos);
      string rightpart = word.substring(pos);
      double leftprobability = wordtfneighbor.get(leftpart).gettf()/wordnumber;
      double rightprobability = wordtfneighbor.get(rightpart).gettf()/wordnumber;
      mi.add(coprobability/(leftprobability*rightprobability));
    }
    return collections.min(mi);
  }
  //save tf, (left and right) neighbor number, neighbor entropy, mutual information
  private void savetfneighborinfomi(string out, string stoplist, string[] threshold){
    try {
      //read stop words file
      set<string> stopwords = new hashset<string>();
      bufferedreader br = new bufferedreader(new filereader(stoplist));
      string line;
      while((line = br.readline()) != null){
        if(line.length() > 1)
          stopwords.add(line);
      }
      br.close();
      //output words tf, neighbor info, mi
      bufferedwriter bw = new bufferedwriter(new filewriter(out));
      for(map.entry<string, tfneighbor> entry : wordtfneighbor.entryset()){
        if( entry.getkey().length() <= 1 || stopwords.contains(entry.getkey()) ) continue;
        tfneighbor tfneighbor = entry.getvalue();
         
         
        int tf, leftneighbornumber, rightneighbornumber;
        double mi;
        tf = tfneighbor.gettf();
        leftneighbornumber = tfneighbor.getleftneighbornumber();
        rightneighbornumber = tfneighbor.getrightneighbornumber();
        mi = countmi(entry.getkey());
        if(tf > integer.parseint(threshold[0]) && leftneighbornumber > integer.parseint(threshold[1]) && 
            rightneighbornumber > integer.parseint(threshold[2]) && mi > integer.parseint(threshold[3]) ){
          stringbuilder sb = new stringbuilder();
          sb.append(entry.getkey());
          sb.append(",").append(tf);
          sb.append(",").append(leftneighbornumber);
          sb.append(",").append(rightneighbornumber);
          sb.append(",").append(tfneighbor.getleftneighborentropy());
          sb.append(",").append(tfneighbor.getrightneighborentropy());
          sb.append(",").append(mi).append("\n");
          bw.write(sb.tostring());
        }
      }
      bw.close();
    } catch (ioexception e) {
      throw new runtimeexception(e);
    }
    system.out.println("info: [nagao algorithm step 4]: having saved to file");
  }
  //apply nagao algorithm to input file
  public static void applynagao(string[] inputs, string out, string stoplist){
    nagaoalgorithm nagao = new nagaoalgorithm();
    //step 1: add phrases to ptable
    string line;
    for(string in : inputs){
      try {
        bufferedreader br = new bufferedreader(new filereader(in));
        while((line = br.readline()) != null){
          nagao.addtoptable(line);
        }
        br.close();
      } catch (ioexception e) {
        throw new runtimeexception();
      }
    }
    system.out.println("info: [nagao algorithm step 1]: having added all left and right substrings to ptable");
    //step 2: sort ptable and count ltable
    nagao.countltable();
    //step3: count tf and neighbor
    nagao.counttfneighbor();
    //step4: save tf neighborinfo and mi
    nagao.savetfneighborinfomi(out, stoplist, "20,3,3,5".split(","));
  }
  public static void applynagao(string[] inputs, string out, string stoplist, int n, string filter){
    nagaoalgorithm nagao = new nagaoalgorithm();
    nagao.setn(n);
    string[] threshold = filter.split(",");
    if(threshold.length != 4){
      system.out.println("error: filter must have 4 numbers, seperated with ',' ");
      return;
    }
    //step 1: add phrases to ptable
    string line;
    for(string in : inputs){
      try {
        bufferedreader br = new bufferedreader(new filereader(in));
        while((line = br.readline()) != null){
          nagao.addtoptable(line);
        }
        br.close();
      } catch (ioexception e) {
        throw new runtimeexception();
      }
    }
    system.out.println("info: [nagao algorithm step 1]: having added all left and right substrings to ptable");
    //step 2: sort ptable and count ltable
    nagao.countltable();
    //step3: count tf and neighbor
    nagao.counttfneighbor();
    //step4: save tf neighborinfo and mi
    nagao.savetfneighborinfomi(out, stoplist, threshold);
  }
  private void setn(int n){
    n = n;
  }
   
  public static void main(string[] args) {
    string[] ins = {"e://test//ganfen.txt"};
    applynagao(ins, "e://test//out.txt", "e://test//stoplist.txt");
  }
 
}

2. tfneighbor.java

package com.algo.word;
 
import java.util.hashmap;
import java.util.map;
 
public class tfneighbor {
 
  private int tf;
  private map<character, integer> leftneighbor;
  private map<character, integer> rightneighbor;
   
  tfneighbor(){
    leftneighbor = new hashmap<character, integer>();
    rightneighbor = new hashmap<character, integer>();
  }
  //add word to leftneighbor
  public void addtoleftneighbor(char word){
    //leftneighbor.put(word, 1 + leftneighbor.getordefault(word, 0));
    integer number = leftneighbor.get(word);
    leftneighbor.put(word, number == null? 1: 1+number);
  }
  //add word to rightneighbor
  public void addtorightneighbor(char word){
    //rightneighbor.put(word, 1 + rightneighbor.getordefault(word, 0));
    integer number = rightneighbor.get(word);
    rightneighbor.put(word, number == null? 1: 1+number);
  }
  //increment tf
  public void incrementtf(){
    tf++;
  }
  public int getleftneighbornumber(){
    return leftneighbor.size();
  }
  public int getrightneighbornumber(){
    return rightneighbor.size();
  }
  public double getleftneighborentropy(){
    double entropy = 0;
    int sum = 0;
    for(int number : leftneighbor.values()){
      entropy += number*math.log(number);
      sum += number;
    }
    if(sum == 0)  return 0;
    return math.log(sum) - entropy/sum;
  }
  public double getrightneighborentropy(){
    double entropy = 0;
    int sum = 0;
    for(int number : rightneighbor.values()){
      entropy += number*math.log(number);
      sum += number;
    }
    if(sum == 0)  return 0;
    return math.log(sum) - entropy/sum;
  }
  public int gettf(){
    return tf;
  }
}

3. main.java

package com.algo.word;
 
public class main {
 
  public static void main(string[] args) {
     
    //if 3 arguments, first argument is input files splitting with ','
    //second argument is output file
    //output 7 columns split with ',' , like below:
    //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
    //third argument is stop words list
    if(args.length == 3)
      nagaoalgorithm.applynagao(args[0].split(","), args[1], args[2]);
     
    //if 4 arguments, forth argument is the ngram parameter n
    //5th argument is threshold of output words, default is "20,3,3,5"
    //output tf > 20 && (left | right) neighbor number > 3 && mi > 5
    else if(args.length == 5)
      nagaoalgorithm.applynagao(args[0].split(","), args[1], args[2], integer.parseint(args[3]), args[4]);
     
     
  }
 
}

以上所述就是本文的全部内容了，希望大家能够喜欢。

上一篇： java实现短地址服务的方法（附代码）

下一篇： thinkPHP订单数字提醒功能的实现方法