欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

一些分词中用到的公式-参考ictclas

程序员文章站 2022-06-05 16:19:51
...
// Formula for computing the smoothing coefficient

// Smoothing parameter
dSmoothingPara = 0.1
// Frequency of the current node; when the part of speech is known, the stored frequency is used directly
dCurFreqency
// A constant parameter (it appears below as the denominator of dTemp)
static int MAX_FREQUENCE = 2079997;
// Two linked Words frequency
// dTemp is the reciprocal of MAX_FREQUENCE; the cast makes the division floating-point
dTemp = (double) 1 / MAX_FREQUENCE;
// Frequency of the two-word pair (the original note is unsure whether this is a raw count or an association score)
nTwoWordsFreq = DictBinary.GetFrequency(sTwoWords, 3);


// Frequency of the current word
			if (pCur.p.nPOS >= 0) {
				// Not an unknown word: use the value already stored on the node
				dCurFreqency = pCur.p.value;
			} else {
				// Unknown word
				// The part of speech is unknown, so look the word up in the core dictionary with handle 2
				dCurFreqency = DictCore.GetFrequency(pCur.p.sWord, 2);
			}
			
			
			/**
		 * Returns the frequency stored for a word under a given handle (part of speech).
		 *
		 * <p>The word is first passed to {@code PreProcessing}; if that call returns
		 * {@code false} the lookup is abandoned. Otherwise the original table is searched
		 * first, and the modify table is consulted only when the original table has no match.
		 *
		 * @param sWord
		 *            the word to look up
		 * @param nHandle
		 *            the part-of-speech handle to match
		 * @return the recorded frequency, or 0 when {@code PreProcessing} fails or the
		 *         word/handle pair is found in neither table
		 */
		public int GetFrequency(char[] sWord, int nHandle) {
			char[] sWordFind = new char[WORD_MAXLENGTH - 2];
			Pint pnPos = new Pint();
			if (!PreProcessing(sWord, pnPos, sWordFind)) {
				return 0;
			}
			int nPos = pnPos.value;

			// Search the original table first.
			Pint pnIndex = new Pint();
			if (FindInOriginalTable(nPos, sWordFind, nHandle, pnIndex)) {
				return m_IndexTable[nPos].pWordItemHead[pnIndex.value].p.nFrequency;
			}

			// Not in the original table: fall back to the modify table.
			PPWORD_CHAIN ppFound = new PPWORD_CHAIN(new PWORD_CHAIN(
					new WORD_CHAIN()));
			if (FindInModifyTable(nPos, sWordFind, nHandle, ppFound)) {
				return ppFound.p.p.data.nFrequency;
			}
			return 0;
		}

// Negative natural log of a linear interpolation weighted by dSmoothingPara:
//   dSmoothingPara       * (1 + dCurFreqency) / (MAX_FREQUENCE + 80000)
// + (1 - dSmoothingPara) * ((1 - dTemp) * nTwoWordsFreq / (1 + dCurFreqency) + dTemp)
// The "1 +" terms keep the numerator and the divisor non-zero when dCurFreqency is 0,
// and the "+ dTemp" term keeps the second part positive when nTwoWordsFreq is 0.
dValue = -Math
						.log(dSmoothingPara * (1 + dCurFreqency) / (MAX_FREQUENCE + 80000)+ (1 - dSmoothingPara)* ((1 - dTemp) * nTwoWordsFreq/ (1 + dCurFreqency) + dTemp));