Understanding TF-IDF, with a Java Implementation Example
Preface
A while ago I went back through my old notes on TF-IDF, and I am publishing them here on the blog. Knowledge has to be revisited regularly, otherwise it goes rusty.
Understanding TF-IDF
TF-IDF (term frequency - inverse document frequency) is a weighting technique widely used in information retrieval and text mining. Its main idea is: if a word or phrase appears with a high frequency (TF) in one article but rarely appears in other articles, it is considered to discriminate well between categories and is therefore suitable for classification. TF-IDF is simply tf * idf, where tf is the term frequency and idf is the inverse document frequency.

TF is the frequency with which a term appears in a document d. The idea behind IDF is: the fewer documents contain the term t (i.e. the smaller n is), the larger the IDF becomes, which indicates that t discriminates well between categories.

IDF does have a weakness, though. Suppose m documents of some class c contain the term t, while the other classes contain t in k documents; then the total number of documents containing t is n = m + k. When m is large, n is large as well, so the IDF computed by the formula is small, suggesting that t is not very discriminative. In reality, if a term appears frequently within the documents of a single class, it characterizes that class well; such a term should receive a higher weight and be chosen as a feature word to distinguish that class from the others. This is the shortcoming of IDF.
TF formula:

$$\mathrm{tf}_{i,j} = \frac{n_{i,j}}{\sum_{k} n_{k,j}}$$

In the formula above, $n_{i,j}$ is the number of times the term $t_i$ occurs in document $d_j$, and the denominator is the total number of occurrences of all terms in document $d_j$.
IDF formula:

$$\mathrm{idf}_i = \log \frac{|D|}{|\{j : t_i \in d_j\}|}$$

$|D|$: the total number of documents in the corpus.

$|\{j : t_i \in d_j\}|$: the number of documents containing the term $t_i$ (i.e. the number of documents with $n_{i,j} \neq 0$). If the term does not appear in the corpus at all, this denominator becomes zero, so in practice $1 + |\{j : t_i \in d_j\}|$ is used.

Then:

$$\mathrm{tfidf}_{i,j} = \mathrm{tf}_{i,j} \times \mathrm{idf}_i$$
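To make the formulas concrete, here is a small worked example with made-up numbers (a base-10 logarithm is used for readability; the Java code below uses the natural logarithm, which changes only the scale, not the ranking). Suppose a 100-word document contains the word "cow" 3 times, so $\mathrm{tf} = 3/100 = 0.03$. If the corpus holds 10,000,000 documents and "cow" appears in 1,000 of them, then $\mathrm{idf} = \log_{10}(10{,}000{,}000 / 1{,}000) = 4$, giving $\mathrm{tfidf} = 0.03 \times 4 = 0.12$. A function word such as "the" may have an even higher TF, but since it occurs in nearly every document, its IDF, and therefore its TF-IDF weight, is close to 0.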
TF-IDF implementation (Java)
The external library ikanalyzer-2012.jar is used here for Chinese word segmentation.
The full code is as follows:
package tfidf;

import java.io.*;
import java.util.*;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class ReadFiles {

    private static ArrayList<String> fileList = new ArrayList<String>(); // the list of files

    // get the list of files in a directory, including its sub-directories
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The input is not a directory");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                for (int i = 0; i < flist.length; i++) {
                    File newfile = new File(filepath + "\\" + flist[i]);
                    if (!newfile.isDirectory()) {
                        fileList.add(newfile.getAbsolutePath());
                    } else if (newfile.isDirectory()) { // if it is a directory, recurse into readDirs
                        readDirs(filepath + "\\" + flist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e.getMessage());
        }
        return fileList;
    }

    // read a file into a String
    public static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuffer strSb = new StringBuffer(); // String is immutable, StringBuffer can be changed
        InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk"); // byte stream to character stream
        BufferedReader br = new BufferedReader(inStrR);
        String line = br.readLine();
        while (line != null) {
            strSb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return strSb.toString();
    }

    // word segmentation
    public static ArrayList<String> cutWords(String file) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        String text = ReadFiles.readFile(file);
        // The original post called analyzer.split(text); that helper is not part of the
        // IK Analyzer API, so the standard Lucene TokenStream interface is used here instead.
        // Details may vary slightly depending on the IK Analyzer / Lucene versions.
        IKAnalyzer analyzer = new IKAnalyzer();
        TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            words.add(term.toString());
        }
        ts.end();
        ts.close();
        return words;
    }

    // term frequency in a file: raw count for each word
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
                System.out.println(word);
            } else {
                resTF.put(word, resTF.get(word) + 1);
                System.out.println(word);
            }
        }
        return resTF;
    }

    // term frequency in a file: relative frequency of each word
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = ReadFiles.normalTF(cutwords);
        Iterator iter = intTF.entrySet().iterator(); // iterate over the counts from normalTF
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            resTF.put(entry.getKey().toString(), Float.parseFloat(entry.getValue().toString()) / wordLen);
            System.out.println(entry.getKey().toString() + " = " + Float.parseFloat(entry.getValue().toString()) / wordLen);
        }
        return resTF;
    }

    // raw term counts for all files
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String, Integer>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            HashMap<String, Integer> dict = new HashMap<String, Integer>();
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // get cut words for one file
            dict = ReadFiles.normalTF(cutwords);
            allNormalTF.put(file, dict);
        }
        return allNormalTF;
    }

    // tf for all files
    public static HashMap<String, HashMap<String, Float>> tfAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            HashMap<String, Float> dict = new HashMap<String, Float>();
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // get cut words for one file
            dict = ReadFiles.tf(cutwords);
            allTF.put(file, dict);
        }
        return allTF;
    }

    public static HashMap<String, Float> idf(HashMap<String, HashMap<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>(); // document frequency of each word
        int docNum = fileList.size();
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(fileList.get(i));
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        Iterator iter_dict = dict.entrySet().iterator();
        while (iter_dict.hasNext()) {
            Map.Entry entry = (Map.Entry) iter_dict.next();
            float value = (float) Math.log(docNum / Float.parseFloat(entry.getValue().toString()));
            resIdf.put(entry.getKey().toString(), value);
            System.out.println(entry.getKey().toString() + " = " + value);
        }
        return resIdf;
    }

    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf, HashMap<String, Float> idfs) {
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        int docNum = fileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = fileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                float value = Float.parseFloat(entry.getValue().toString()) * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for every file is:");
        disTfIdf(resTfIdf);
    }

    public static void disTfIdf(HashMap<String, HashMap<String, Float>> tfidf) {
        Iterator iter1 = tfidf.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry entrys = (Map.Entry) iter1.next();
            System.out.println("FileName: " + entrys.getKey().toString());
            System.out.print("{");
            HashMap<String, Float> temp = (HashMap<String, Float>) entrys.getValue();
            Iterator iter2 = temp.entrySet().iterator();
            while (iter2.hasNext()) {
                Map.Entry entry = (Map.Entry) iter2.next();
                System.out.print(entry.getKey().toString() + " = " + entry.getValue().toString() + ", ");
            }
            System.out.println("}");
        }
    }

    public static void main(String[] args) throws IOException {
        String file = "D:/testfiles";
        HashMap<String, HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }
}
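Note that the idf method above implements the plain $\log(|D|/\mathrm{df})$ form; the $1 + \mathrm{df}$ smoothing mentioned in the formula section is not applied, so a term that does not occur in any corpus document cannot be scored at all. A minimal sketch of a smoothed variant is given below. The method name idfSmoothed and its parameters are my own additions, not part of the original code; docFreq stands for the same per-word document counts that idf() builds internally in its dict map.

    // Hypothetical helper, not in the original code: smoothed IDF using the
    // 1 + df denominator described in the formula section, which avoids a zero
    // denominator when a term does not occur in any corpus document.
    public static HashMap<String, Float> idfSmoothed(HashMap<String, Integer> docFreq, int docNum) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        for (Map.Entry<String, Integer> entry : docFreq.entrySet()) {
            float value = (float) Math.log(docNum / (1.0f + entry.getValue()));
            resIdf.put(entry.getKey(), value);
        }
        return resIdf;
    }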
When run against a directory of test files, the program prints the raw term counts and relative frequencies for each file, then the IDF of every word, and finally the TF-IDF weights per file in the form FileName: {word1 = weight1, word2 = weight2, ...}.
Common problems
The Lucene jar was not added to the classpath.
The Lucene jar and the word-segmenter jar are of incompatible versions.
Both issues come down to getting the right jars on the classpath; a hypothetical command line is sketched below.
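As a purely illustrative example (the jar names and versions are placeholders; use whatever ships with your IK Analyzer build), compiling and running on Windows might look like:

javac -cp .;lib/IKAnalyzer2012.jar;lib/lucene-core-3.6.2.jar tfidf/ReadFiles.java
java -cp .;lib/IKAnalyzer2012.jar;lib/lucene-core-3.6.2.jar tfidf.ReadFiles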
Summary
That is the whole of this article on understanding TF-IDF and its Java implementation; I hope it is helpful.
If you spot anything lacking, feel free to point it out in the comments.