一段没有空格的中英文分词的 n-gram 算法实现 博客分类: 商业智能和数据挖掘 算法 J# 数据挖掘 C C++
程序员文章站
2024-03-23 11:14:10
...
我刚写过一个 C# 的实现,用的 N-Gram 算法很简单,也能解决楼上朋友的问题:对第一个单词和往后数 8 个单词的排列组合取最大概率值,把排在第一位的单词作为分词的结果,然后分词窗口后移,继续下一步。是用堆栈做的,等下我给你找找,算法部分直接就可以在 Java 下面 Ctrl+C 了。我开发项目用 Java,做数据挖掘和商业算法研究用 C#。
c# 代码
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.Collections;
- using System.IO;
namespace HNOZ
{
    /// <summary>
    /// Dictionary-driven word segmenter for text written without spaces.
    /// A word list with one score per word is loaded from a CSV file; the
    /// segmenter then repeatedly looks ahead over a bounded window, picks the
    /// combination of dictionary words with the highest summed score, emits
    /// only the first word of that combination and slides the window forward.
    /// </summary>
    class Program
    {
        // The look-ahead considers at most PRE_LENGTH - 1 consecutive words.
        const int PRE_LENGTH = 8;

        // A single word is at most FL_LENGTH - 1 characters long.
        const int FL_LENGTH = 8;

        // Word -> score, filled by LoadDictionary.
        static readonly Dictionary<string, double> dict = new Dictionary<string, double>();

        /// <summary>
        /// Loads "11.csv" from the working directory, segments a built-in
        /// sample, then one line read from standard input, then the sample again.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The dictionary decides which language is recognised; the sample
            // below expects English entries such as "good" and "morning".
            string sentence = "goodmorningbetterhello";

            LoadDictionary("11.csv");

            Console.WriteLine(analyse(sentence, 0));

            // ReadLine returns null at end of input; analyse maps that to "".
            string e = Console.ReadLine();
            Console.WriteLine(analyse(e, 0));
            Console.WriteLine(analyse(sentence, 0));
        }

        /// <summary>
        /// Reads "word,score" lines from <paramref name="path"/> into the dictionary.
        /// </summary>
        /// <remarks>
        /// Lines without a comma, with an empty word, with the word "?" or with
        /// a score that is not a number are skipped. When a word occurs more
        /// than once the last score wins. Scores are parsed with the invariant
        /// culture so that "0.05" means the same on every machine.
        /// </remarks>
        /// <param name="path">Path of the CSV file to read.</param>
        static void LoadDictionary(string path)
        {
            // StreamReader(path) opens the file read-only and the using block
            // releases the handle even if reading fails half-way.
            using (StreamReader reader = new StreamReader(path))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int comma = line.IndexOf(',');
                    if (comma <= 0)
                    {
                        // Blank line, missing separator, or empty word.
                        continue;
                    }

                    string word = line.Substring(0, comma);
                    if (word == "?")
                    {
                        continue;
                    }

                    double score;
                    if (double.TryParse(
                            line.Substring(comma + 1),
                            System.Globalization.NumberStyles.Float,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out score))
                    {
                        dict[word] = score;
                    }
                }
            }
        }

        /// <summary>
        /// Segments <paramref name="sentence"/> from index <paramref name="start"/>
        /// to its end.
        /// </summary>
        /// <param name="sentence">Text to segment; null or empty yields "".</param>
        /// <param name="start">Index of the first character to segment.</param>
        /// <returns>The words in order, each followed by "/".</returns>
        static string analyse(string sentence, int start)
        {
            if (string.IsNullOrEmpty(sentence))
            {
                return "";
            }

            StringBuilder results = new StringBuilder();
            while (start < sentence.Length)
            {
                // Split always returns at least one character for non-empty
                // input, so this loop is guaranteed to terminate.
                string word = Split(sentence.Substring(start));
                results.Append(word).Append('/');
                start += word.Length;
            }

            return results.ToString();
        }

        /// <summary>
        /// Returns the first word of the best-scoring segmentation of the
        /// beginning of <paramref name="sentence"/>.
        /// </summary>
        /// <remarks>
        /// A segmentation is a run of up to PRE_LENGTH - 1 dictionary words of
        /// up to FL_LENGTH - 1 characters each; its score is the sum of the
        /// word scores. A run simply ends where the text ends or where no
        /// dictionary word matches. On equal scores the shorter first word is
        /// kept. If no dictionary word starts the text, a single character
        /// (or one surrogate pair) is returned as an unknown token.
        /// </remarks>
        /// <param name="sentence">Remaining text; null or empty yields "".</param>
        /// <returns>A non-empty prefix of the text, or "" for empty input.</returns>
        static string Split(string sentence)
        {
            if (string.IsNullOrEmpty(sentence))
            {
                return "";
            }

            int maxWordLength = Math.Min(FL_LENGTH - 1, sentence.Length);
            int maxWords = Math.Min(PRE_LENGTH - 1, sentence.Length);

            // No segmentation inside the window can reach past this index.
            int reach = Math.Min(sentence.Length, maxWords * maxWordLength);

            // tail[pos] = best score obtainable from pos with the words that
            // are still allowed. With zero words left every entry is 0.
            double[] tail = new double[reach + 1];
            for (int wordsLeft = 1; wordsLeft < maxWords; wordsLeft++)
            {
                double[] current = new double[reach + 1];
                for (int pos = 0; pos < reach; pos++)
                {
                    double score;
                    BestWordLength(sentence, pos, maxWordLength, tail, out score);
                    current[pos] = score;
                }

                tail = current;
            }

            double unused;
            int first = BestWordLength(sentence, 0, maxWordLength, tail, out unused);
            if (first == 0)
            {
                // Unknown text: step over one character so the caller advances,
                // keeping a surrogate pair together.
                first = char.IsSurrogatePair(sentence, 0) ? 2 : 1;
            }

            return sentence.Substring(0, first);
        }

        /// <summary>
        /// Finds the dictionary word starting at <paramref name="pos"/> whose
        /// score plus the best continuation after it is highest.
        /// </summary>
        /// <param name="text">Text being segmented.</param>
        /// <param name="pos">Start index of the candidate word.</param>
        /// <param name="maxWordLength">Longest word length to try.</param>
        /// <param name="tail">Best continuation score for every end index.</param>
        /// <param name="score">Best total found, or 0 when no word matches.</param>
        /// <returns>Length of the chosen word, or 0 when no word matches.</returns>
        static int BestWordLength(string text, int pos, int maxWordLength, double[] tail, out double score)
        {
            int bestLength = 0;
            score = 0;

            // pos + length < tail.Length keeps the word inside the window.
            for (int length = 1; length <= maxWordLength && pos + length < tail.Length; length++)
            {
                double wordScore;
                if (!dict.TryGetValue(text.Substring(pos, length), out wordScore))
                {
                    continue;
                }

                double candidate = wordScore + tail[pos + length];
                if (bestLength == 0 || candidate > score)
                {
                    bestLength = length;
                    score = candidate;
                }
            }

            return bestLength;
        }
    }
}