java使用Nagao算法实现新词发现、热门词的挖掘
采用nagao算法统计各个子字符串的频次,然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息(内部凝聚度)。
名词解释:
nagao算法:一种快速的统计文本里所有子字符串频次的算法。详细算法可见
词频:该字符串在文档中出现的次数。出现次数越多越重要。
左右邻个数:文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多,说明字符串成词概率越高。
左右熵:文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标,有一定区别。
交互信息:每次将某字符串分成两部分,左半部分字符串和右半部分字符串,计算其同时出现的概率除于其各自独立出现的概率,最后取所有的划分里面概率最小值。这个值越大,说明字符串内部凝聚度越高,越可能成词。
算法具体流程:
1. 将输入文件逐行读入,按照非汉字([^\u4e00-\u9fa5]+)以及停词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”,
分成一个个字符串,代码如下:
string[] phrases = line.split("[^\u4e00-\u9fa5]+|["+stopwords+"]");
停用词可以修改。
2. 获取所有切分后的字符串的左子串和右子串,分别加入左、右ptable
3. 对ptable排序,并计算ltable。ltable记录的是,排序后的ptable中,下一个子串同上一个子串具有相同字符的数量
4. 遍历ptable和ltable,即可得到所有子字符串的词频、左右邻
5. 根据所有子字符串的词频、左右邻结果,输出字符串的词频、左右邻个数、左右熵、交互信息
1. nagaoalgorithm.java
package com.algo.word; import java.io.bufferedreader; import java.io.bufferedwriter; import java.io.filenotfoundexception; import java.io.filereader; import java.io.filewriter; import java.io.ioexception; import java.util.arraylist; import java.util.arrays; import java.util.collections; import java.util.hashmap; import java.util.hashset; import java.util.list; import java.util.map; import java.util.set; public class nagaoalgorithm { private int n; private list<string> leftptable; private int[] leftltable; private list<string> rightptable; private int[] rightltable; private double wordnumber; private map<string, tfneighbor> wordtfneighbor; private final static string stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说"; private nagaoalgorithm(){ //default n = 5 n = 5; leftptable = new arraylist<string>(); rightptable = new arraylist<string>(); wordtfneighbor = new hashmap<string, tfneighbor>(); } //reverse phrase private string reverse(string phrase) { stringbuilder reversephrase = new stringbuilder(); for (int i = phrase.length() - 1; i >= 0; i--) reversephrase.append(phrase.charat(i)); return reversephrase.tostring(); } //co-prefix length of s1 and s2 private int coprefixlength(string s1, string s2){ int coprefixlength = 0; for(int i = 0; i < math.min(s1.length(), s2.length()); i++){ if(s1.charat(i) == s2.charat(i)) coprefixlength++; else break; } return coprefixlength; } //add substring of line to ptable private void addtoptable(string line){ //split line according to consecutive none chinese character string[] phrases = line.split("[^\u4e00-\u9fa5]+|["+stopwords+"]"); for(string phrase : phrases){ for(int i = 0; i < phrase.length(); i++) rightptable.add(phrase.substring(i)); string reversephrase = reverse(phrase); for(int i = 0; i < reversephrase.length(); i++) leftptable.add(reversephrase.substring(i)); wordnumber += phrase.length(); } } //count ltable private void countltable(){ collections.sort(rightptable); rightltable = new int[rightptable.size()]; for(int i = 1; i < rightptable.size(); i++) rightltable[i] = coprefixlength(rightptable.get(i-1), rightptable.get(i)); collections.sort(leftptable); leftltable = new int[leftptable.size()]; for(int i = 1; i < leftptable.size(); i++) leftltable[i] = coprefixlength(leftptable.get(i-1), leftptable.get(i)); system.out.println("info: [nagao algorithm step 2]: having sorted ptable and counted left and right ltable"); } //according to ptable and ltable, count statistical result: tf, neighbor distribution private void counttfneighbor(){ //get tf and right neighbor for(int pindex = 0; pindex < rightptable.size(); pindex++){ string phrase = rightptable.get(pindex); for(int length = 1 + rightltable[pindex]; length <= n && length <= phrase.length(); length++){ string word = phrase.substring(0, length); tfneighbor tfneighbor = new tfneighbor(); tfneighbor.incrementtf(); if(phrase.length() > length) tfneighbor.addtorightneighbor(phrase.charat(length)); for(int lindex = pindex+1; lindex < rightltable.length; lindex++){ if(rightltable[lindex] >= length){ tfneighbor.incrementtf(); string cophrase = rightptable.get(lindex); if(cophrase.length() > length) tfneighbor.addtorightneighbor(cophrase.charat(length)); } else break; } wordtfneighbor.put(word, tfneighbor); } } //get left neighbor for(int pindex = 0; pindex < leftptable.size(); pindex++){ string phrase = leftptable.get(pindex); for(int length = 1 + leftltable[pindex]; length <= n && length <= phrase.length(); length++){ string word = reverse(phrase.substring(0, length)); tfneighbor tfneighbor = wordtfneighbor.get(word); if(phrase.length() > length) tfneighbor.addtoleftneighbor(phrase.charat(length)); for(int lindex = pindex + 1; lindex < leftltable.length; lindex++){ if(leftltable[lindex] >= length){ string cophrase = leftptable.get(lindex); if(cophrase.length() > length) tfneighbor.addtoleftneighbor(cophrase.charat(length)); } else break; } } } system.out.println("info: [nagao algorithm step 3]: having counted tf and neighbor"); } //according to wordtfneighbor, count mi of word private double countmi(string word){ if(word.length() <= 1) return 0; double coprobability = wordtfneighbor.get(word).gettf()/wordnumber; list<double> mi = new arraylist<double>(word.length()); for(int pos = 1; pos < word.length(); pos++){ string leftpart = word.substring(0, pos); string rightpart = word.substring(pos); double leftprobability = wordtfneighbor.get(leftpart).gettf()/wordnumber; double rightprobability = wordtfneighbor.get(rightpart).gettf()/wordnumber; mi.add(coprobability/(leftprobability*rightprobability)); } return collections.min(mi); } //save tf, (left and right) neighbor number, neighbor entropy, mutual information private void savetfneighborinfomi(string out, string stoplist, string[] threshold){ try { //read stop words file set<string> stopwords = new hashset<string>(); bufferedreader br = new bufferedreader(new filereader(stoplist)); string line; while((line = br.readline()) != null){ if(line.length() > 1) stopwords.add(line); } br.close(); //output words tf, neighbor info, mi bufferedwriter bw = new bufferedwriter(new filewriter(out)); for(map.entry<string, tfneighbor> entry : wordtfneighbor.entryset()){ if( entry.getkey().length() <= 1 || stopwords.contains(entry.getkey()) ) continue; tfneighbor tfneighbor = entry.getvalue(); int tf, leftneighbornumber, rightneighbornumber; double mi; tf = tfneighbor.gettf(); leftneighbornumber = tfneighbor.getleftneighbornumber(); rightneighbornumber = tfneighbor.getrightneighbornumber(); mi = countmi(entry.getkey()); if(tf > integer.parseint(threshold[0]) && leftneighbornumber > integer.parseint(threshold[1]) && rightneighbornumber > integer.parseint(threshold[2]) && mi > integer.parseint(threshold[3]) ){ stringbuilder sb = new stringbuilder(); sb.append(entry.getkey()); sb.append(",").append(tf); sb.append(",").append(leftneighbornumber); sb.append(",").append(rightneighbornumber); sb.append(",").append(tfneighbor.getleftneighborentropy()); sb.append(",").append(tfneighbor.getrightneighborentropy()); sb.append(",").append(mi).append("\n"); bw.write(sb.tostring()); } } bw.close(); } catch (ioexception e) { throw new runtimeexception(e); } system.out.println("info: [nagao algorithm step 4]: having saved to file"); } //apply nagao algorithm to input file public static void applynagao(string[] inputs, string out, string stoplist){ nagaoalgorithm nagao = new nagaoalgorithm(); //step 1: add phrases to ptable string line; for(string in : inputs){ try { bufferedreader br = new bufferedreader(new filereader(in)); while((line = br.readline()) != null){ nagao.addtoptable(line); } br.close(); } catch (ioexception e) { throw new runtimeexception(); } } system.out.println("info: [nagao algorithm step 1]: having added all left and right substrings to ptable"); //step 2: sort ptable and count ltable nagao.countltable(); //step3: count tf and neighbor nagao.counttfneighbor(); //step4: save tf neighborinfo and mi nagao.savetfneighborinfomi(out, stoplist, "20,3,3,5".split(",")); } public static void applynagao(string[] inputs, string out, string stoplist, int n, string filter){ nagaoalgorithm nagao = new nagaoalgorithm(); nagao.setn(n); string[] threshold = filter.split(","); if(threshold.length != 4){ system.out.println("error: filter must have 4 numbers, seperated with ',' "); return; } //step 1: add phrases to ptable string line; for(string in : inputs){ try { bufferedreader br = new bufferedreader(new filereader(in)); while((line = br.readline()) != null){ nagao.addtoptable(line); } br.close(); } catch (ioexception e) { throw new runtimeexception(); } } system.out.println("info: [nagao algorithm step 1]: having added all left and right substrings to ptable"); //step 2: sort ptable and count ltable nagao.countltable(); //step3: count tf and neighbor nagao.counttfneighbor(); //step4: save tf neighborinfo and mi nagao.savetfneighborinfomi(out, stoplist, threshold); } private void setn(int n){ n = n; } public static void main(string[] args) { string[] ins = {"e://test//ganfen.txt"}; applynagao(ins, "e://test//out.txt", "e://test//stoplist.txt"); } }
2. tfneighbor.java
package com.algo.word; import java.util.hashmap; import java.util.map; public class tfneighbor { private int tf; private map<character, integer> leftneighbor; private map<character, integer> rightneighbor; tfneighbor(){ leftneighbor = new hashmap<character, integer>(); rightneighbor = new hashmap<character, integer>(); } //add word to leftneighbor public void addtoleftneighbor(char word){ //leftneighbor.put(word, 1 + leftneighbor.getordefault(word, 0)); integer number = leftneighbor.get(word); leftneighbor.put(word, number == null? 1: 1+number); } //add word to rightneighbor public void addtorightneighbor(char word){ //rightneighbor.put(word, 1 + rightneighbor.getordefault(word, 0)); integer number = rightneighbor.get(word); rightneighbor.put(word, number == null? 1: 1+number); } //increment tf public void incrementtf(){ tf++; } public int getleftneighbornumber(){ return leftneighbor.size(); } public int getrightneighbornumber(){ return rightneighbor.size(); } public double getleftneighborentropy(){ double entropy = 0; int sum = 0; for(int number : leftneighbor.values()){ entropy += number*math.log(number); sum += number; } if(sum == 0) return 0; return math.log(sum) - entropy/sum; } public double getrightneighborentropy(){ double entropy = 0; int sum = 0; for(int number : rightneighbor.values()){ entropy += number*math.log(number); sum += number; } if(sum == 0) return 0; return math.log(sum) - entropy/sum; } public int gettf(){ return tf; } }
3. main.java
package com.algo.word; public class main { public static void main(string[] args) { //if 3 arguments, first argument is input files splitting with ',' //second argument is output file //output 7 columns split with ',' , like below: //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information //third argument is stop words list if(args.length == 3) nagaoalgorithm.applynagao(args[0].split(","), args[1], args[2]); //if 4 arguments, forth argument is the ngram parameter n //5th argument is threshold of output words, default is "20,3,3,5" //output tf > 20 && (left | right) neighbor number > 3 && mi > 5 else if(args.length == 5) nagaoalgorithm.applynagao(args[0].split(","), args[1], args[2], integer.parseint(args[3]), args[4]); } }
以上所述就是本文的全部内容了,希望大家能够喜欢。