搜索引擎之分词器学习
程序员文章站
2024-02-27 19:08:39
...
分词器实现代码:
package com.zd.tokenizer;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * Dictionary-based tokenizer using forward maximum matching.
 *
 * <p>The dictionary is a text file with one word per line. The words are loaded into a
 * character trie; {@link #participie(String)} then scans the input from left to right and,
 * at every position, emits the longest dictionary word that starts there. A character
 * that starts no dictionary word is emitted as a single-character token.
 *
 * <p>The trie is built once in the constructor and is only read afterwards.
 */
public class Tokenizer {

    /** Byte-order mark that some editors put at the start of a UTF-8 file. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /** One trie node: outgoing edges keyed by character, plus an end-of-word flag. */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        // A dedicated flag (instead of a sentinel key stored among the children) keeps
        // dictionary entries that contain an inner space from clashing with the marker.
        private boolean wordEnd;
    }

    /** Root of the dictionary trie; it never marks the end of a word itself. */
    private final TrieNode root = new TrieNode();

    /**
     * Creates a tokenizer backed by the given dictionary file.
     *
     * @param dictionaryFilePath path of a UTF-8 text file containing one word per line
     * @throws IOException if the file cannot be opened or read
     */
    public Tokenizer(String dictionaryFilePath) throws IOException {
        this.loadDictionary(dictionaryFilePath);
    }

    /**
     * Reads the dictionary file and inserts every non-blank line into the trie.
     *
     * @param dictionaryFilePath path of the dictionary file
     * @throws IOException if the file cannot be opened or read
     */
    private void loadDictionary(String dictionaryFilePath) throws IOException {
        // try-with-resources closes the file even when reading fails; the charset is
        // explicit so the result does not depend on the platform default encoding.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(dictionaryFilePath), StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // trim() does not remove a BOM, which would otherwise become part
                    // of the first word.
                    if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
                        line = line.substring(1);
                    }
                }
                String word = line.trim();
                if (word.isEmpty()) {
                    continue;
                }
                this.insert(word);
            }
        }
    }

    /** Adds one word to the trie, creating the missing nodes along its path. */
    private void insert(String word) {
        TrieNode node = this.root;
        for (int i = 0; i < word.length(); i++) {
            Character key = word.charAt(i);
            TrieNode next = node.children.get(key);
            if (next == null) {
                next = new TrieNode();
                node.children.put(key, next);
            }
            node = next;
        }
        node.wordEnd = true;
    }

    /**
     * Splits the text into tokens by forward maximum matching against the dictionary.
     *
     * <p>The text is trimmed first. At each position the longest dictionary word starting
     * there becomes a token; when no dictionary word starts at the position, the single
     * character at that position becomes a token.
     *
     * @param text the text to tokenize; may be {@code null}
     * @return the tokens in order of appearance, or {@code null} when {@code text} is
     *     {@code null} or empty after trimming
     */
    public List<String> participie(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.isEmpty()) {
            return null;
        }
        List<String> tokens = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int end = this.longestMatchEnd(text, start);
            if (end < 0) {
                // No dictionary word starts here: the character is a token on its own.
                end = start + 1;
            }
            tokens.add(text.substring(start, end));
            start = end;
        }
        return tokens;
    }

    /**
     * Finds the longest dictionary word that begins at {@code start}.
     *
     * @return the exclusive end index of that word, or {@code -1} if no dictionary word
     *     begins at {@code start}
     */
    private int longestMatchEnd(String text, int start) {
        TrieNode node = this.root;
        int lastEnd = -1;
        for (int i = start; i < text.length(); i++) {
            node = node.children.get(text.charAt(i));
            if (node == null) {
                break;
            }
            if (node.wordEnd) {
                // Remember the match but keep walking: a longer word may follow, and if
                // the longer path fails we fall back to this position.
                lastEnd = i + 1;
            }
        }
        return lastEnd;
    }

    /**
     * Demo entry point: tokenizes a sample sentence with {@code /dictionary.txt} from the
     * classpath and prints one token per line.
     *
     * @param args unused
     * @throws IOException if the dictionary is missing or cannot be read
     */
    public static void main(String[] args) throws IOException {
        URL dictionaryUrl = Tokenizer.class.getResource("/dictionary.txt");
        if (dictionaryUrl == null) {
            // getResource returns null for a missing resource; fail with a clear message
            // instead of a NullPointerException.
            throw new FileNotFoundException("/dictionary.txt was not found on the classpath");
        }
        Tokenizer tk = new Tokenizer(dictionaryUrl.getPath());
        List<String> tokens = tk.participie("想过离开,以这种方式存在,是因为那些旁白那些姿态那些伤害");
        for (String s : tokens) {
            System.out.println(s);
        }
    }
}