欢迎您访问程序员文章站。本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

搜索引擎之分词器学习

程序员文章站 2024-02-27 19:08:39
...

分词器实现代码:
package com.zd.tokenizer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

/**
 * Dictionary-based word segmenter.
 *
 * <p>The dictionary file is loaded into a character trie (one word per line). Text is then
 * segmented by forward maximum matching: at each position the longest dictionary word that
 * starts there is emitted; if no dictionary word starts there, the single character at that
 * position is emitted as its own token.
 */
public class Tokenizer {

    /** Root of the dictionary trie; every path from the root spells a prefix of some word. */
    private final Node dictionary;

    /**
     * Creates a tokenizer backed by the given dictionary file.
     *
     * @param dictionaryFilePath path of a text file containing one dictionary word per line
     * @throws IOException if the file cannot be opened or read
     */
    public Tokenizer(String dictionaryFilePath) throws IOException {
        dictionary = new Node();
        // Load the dictionary file into the trie.
        this.loadDictionary(dictionaryFilePath);
    }

    /**
     * Reads the dictionary file line by line and inserts every non-blank, trimmed line into
     * the trie as one word.
     *
     * @param dictionaryFilePath path of the dictionary file
     * @throws IOException if the file cannot be opened or read
     */
    private void loadDictionary(String dictionaryFilePath) throws IOException {
        // NOTE: the file is decoded with the platform default charset, as no charset is
        // passed to InputStreamReader.
        // try-with-resources guarantees the reader (and the underlying stream) is closed
        // even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(dictionaryFilePath)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }
                // Walk/extend the trie along the characters of this word.
                Node node = this.dictionary;
                for (int i = 0; i < line.length(); i++) {
                    char c = line.charAt(i);
                    Node next = node.children.get(c);
                    if (next == null) {
                        next = new Node();
                        node.children.put(c, next);
                    }
                    node = next;
                }
                // An explicit flag (rather than a sentinel key) marks the end of a word, so
                // words that themselves contain a space cannot collide with the marker.
                node.word = true;
            }
        }
    }

    /**
     * Splits the text into tokens using forward maximum matching against the dictionary.
     *
     * <p>The text is trimmed first. Characters that do not start any dictionary word
     * (including whitespace inside the text) are returned as single-character tokens.
     *
     * @param text the text to segment; may be {@code null}
     * @return the tokens in order of appearance, or {@code null} if {@code text} is
     *         {@code null} or blank after trimming
     */
    public List<String> participie(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.isEmpty()) {
            return null;
        }
        List<String> tokens = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int end = longestMatchEnd(text, start);
            if (end < 0) {
                // No dictionary word starts here: the character becomes a token by itself.
                end = start + 1;
            }
            tokens.add(text.substring(start, end));
            start = end;
        }
        return tokens;
    }

    /**
     * Finds the longest dictionary word that starts at {@code start}.
     *
     * <p>The scan keeps walking the trie as long as characters match, remembering the end of
     * the last complete word seen. This way a shorter word is still returned when a longer
     * prefix turns out not to be a word (for example dictionary {ab, abcd} and text "abcq"
     * yields "ab").
     *
     * @param text  the text being segmented
     * @param start index of the first character of the candidate word
     * @return the exclusive end index of the longest match, or {@code -1} if there is none
     */
    private int longestMatchEnd(String text, int start) {
        Node node = this.dictionary;
        int matchEnd = -1;
        for (int j = start; j < text.length(); j++) {
            node = node.children.get(text.charAt(j));
            if (node == null) {
                break;
            }
            if (node.word) {
                matchEnd = j + 1;
            }
        }
        return matchEnd;
    }

    /**
     * Demo entry point: loads {@code /dictionary.txt} from the classpath, segments a sample
     * sentence and prints one token per line.
     *
     * @param args ignored
     * @throws IOException if the dictionary resource is missing or cannot be read
     */
    public static void main(String[] args) throws IOException {
        java.net.URL resource = Tokenizer.class.getResource("/dictionary.txt");
        if (resource == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IOException("dictionary.txt not found on classpath");
        }
        Tokenizer tk = new Tokenizer(resource.getPath());
        List<String> tokens = tk.participie("想过离开,以这种方式存在,是因为那些旁白那些姿态那些伤害");
        for (String s : tokens) {
            System.out.println(s);
        }
    }

    /** One trie node: outgoing edges keyed by character plus an end-of-word flag. */
    private static final class Node {
        /** Child nodes keyed by the next character of the word. */
        private final Map<Character, Node> children = new HashMap<>();
        /** True when the path from the root to this node spells a complete dictionary word. */
        private boolean word;
    }
}
搜索引擎之分词器学习