中科院分词系统NLPIR的JAVA代码(补充2)
程序员文章站
2022-06-14 19:39:05
...
在上篇博客基础上,添加以下两个功能:
- 统计词频功能,并以降序排列
- 显示运行进度
具体代码如下:
package code;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.sun.jna.Library;
import com.sun.jna.Native;
public class Nlpir {
// 定义接口CLibrary,继承自com.sun.jna.Library
public interface CLibrary extends Library {
// 定义并初始化接口的静态变量
CLibrary Instance = (CLibrary) Native.loadLibrary(
"D:\\LDA\\java\\NLPIR\\lib\\win64\\NLPIR", CLibrary.class);
public int NLPIR_Init(String sDataPath, int encoding,
String sLicenceCode);
public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged); //对字符串内容进行分词
public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
boolean bWeightOut); //从字符串中提取关键词
public String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
boolean bWeightOut); //从txt文件中提取关键词,本代码未用到
public int NLPIR_AddUserWord(String sWord);
public String NLPIR_GetLastErrorMsg();
public void NLPIR_Exit();
}
public static String transString(String aidString, String ori_encoding,
String new_encoding) {
try {
return new String(aidString.getBytes(ori_encoding), new_encoding);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return null;
}
public static void main(String[] args) throws Exception {
String path2 = "D:\\LDA\\java\\stopword.txt";
Set<String> set = getStopWord(path2); //调用下面getStopWord方法
Map<String, Integer> map = new HashMap<String,Integer>();
/*初始化*/
String argu = "D:\\LDA\\java\\NLPIR"; //Data的上一级目录
String system_charset = "UTF-8";
int charset_type = 1;
int init_flag = CLibrary.Instance.NLPIR_Init(argu, charset_type, "0");
String nativeBytes = null;
if (0 == init_flag) {
nativeBytes = CLibrary.Instance.NLPIR_GetLastErrorMsg();
System.err.println("初始化失败!fail reason is "+nativeBytes);
return;
}
/*添加用户词典功能*/
String userFilePath = "D:\\LDA\\java\\userdic.txt";
File userFile = new File(userFilePath);
InputStreamReader readerUser = new InputStreamReader(new FileInputStream(userFile), "gbk");
BufferedReader brUser = new BufferedReader(readerUser);
String lineUser="";
List<String> list = new ArrayList<String>();
while((lineUser = brUser.readLine()) != null){
String[] strings = lineUser.split("\n");
for(int i = 0; i < strings.length; i++){
list.add(strings[i]);
}
}
brUser.close();
//实现批量读取多个要分词的TXT文件
String sInput = "";
String path = "D:\\LDA\\java\\source\\test";
String outName = "";
File file = new File(path);
File[] files = file.listFiles();
String out = "";
String splitString = "";
String splitStringNo = "";
Arrays.sort(files);
double jindu = 1;
// System.out.println(files.length);查看多少个要处理的txt
for(int i = 0; i < files.length; i++){
if (files[i].isFile()){
out = "";
splitString= "";
splitStringNo = "";
outName = files[i].getName();
InputStreamReader reader = new InputStreamReader(new FileInputStream(files[i]), "gbk");
BufferedReader br = new BufferedReader(reader);
String line="";
sInput = "";
line = br.readLine();
while (line != null){
sInput += line;
line = br.readLine();
}
br.close();
/*停用词*/
Iterator<String> it = set.iterator();
int count = 0;
while(it.hasNext()){
String string = it.next();
if(sInput.contains(string)){
sInput = sInput.replace(string, "");
}
}
try {
/*分词后提取名词(保留词性标记或不保留)*/
nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
String[] stringSplit = nativeBytes.split(" ");
for(int m=0; m < stringSplit.length; m++){
if(stringSplit[m].contains("n")){
splitString += stringSplit[m];
int k = stringSplit[m].indexOf("/");
splitStringNo += stringSplit[m].substring(0, k);
splitStringNo += " ";
// splitString += " ";
}
}
countWord(splitStringNo, map);
/*保留词性标注*/
// File fpOne = new File("D:\\LDA\\java\\result\\split\\" + outName);
// PrintWriter pfpOne = new PrintWriter(fpOne);
// pfpOne.write(splitString);
// pfpOne.close();
File fptwo = new File("D:\\LDA\\java\\result\\testno\\" + outName);
PrintWriter pfpTwo = new PrintWriter(fptwo);
pfpTwo.write(splitStringNo);
pfpTwo.close();
// out += nativeBytes; //对应TXT的内容,未添加用户词典的分词结果
// out += "\r\n";
for(int k = 0; k < list.size(); k++){
CLibrary.Instance.NLPIR_AddUserWord(list.get(k));
}
/*屏蔽,使result结果中只出现一种结果,否则两者都出现在TXT中*/
out += nativeBytes; //对应TXT的内容,增加用户词典后分词结果
out += "\r\n";
int nCountKey = 0;
String nativeByte = CLibrary.Instance.NLPIR_GetKeyWords(sInput, 10,false);
File fp1 = new File("D:\\LDA\\java\\keyword\\test\\" + outName);
PrintWriter pfp1 = new PrintWriter(fp1);
pfp1.write(nativeByte); //对应TXT内容,关键词
pfp1.close();
//分词结果(添加和未添加用户词典的分词结果)
File fp = new File("D:\\LDA\\java\\result\\test\\" + outName);
PrintWriter pfp = new PrintWriter(fp);
pfp.write(out);
pfp.close();
} catch (Exception ex) {
// TODO Auto-generated catch block
ex.printStackTrace();
}
}
double jinDuOne = jindu/files.length;
jindu++;
System.out.println(getFourDecimal(jinDuOne)*100 + "%");
}
printTxt(map);
CLibrary.Instance.NLPIR_Exit();
System.out.println("运行完成!!!");
}
/*去掉停用词方法*/
public static Set<String> getStopWord(String path) throws Exception{
Set<String> set = new HashSet<String>();
File file = new File(path);
InputStreamReader reader = new InputStreamReader(new FileInputStream(file), "gbk");
BufferedReader br = new BufferedReader(reader);
String line = "";
line = br.readLine();
while(line != null){
if(!line.trim().isEmpty()){
set.add(line);
}
line = br.readLine();
}
br.close();
return set;
}
public static void countWord(String ss, Map<String, Integer> map){
String[] string = ss.split(" ");
for(int i=0; i < string.length; i++){
if(map.containsKey(string[i])){
int stringCount = map.get(string[i]);
stringCount += 1;
map.put(string[i], stringCount);
}else{
map.put(string[i], 1);
}
}
}
public static void printTxt(Map<String, Integer> map) throws Exception{
String string = "";
List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String,Integer>>(map.entrySet());
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){
@Override
public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
for(Map.Entry<String, Integer> mapping : list){
String stringOne = mapping.getKey() + "---" + mapping.getValue();
string += stringOne;
string += "\r\n";
}
// for(Map.Entry<String, Integer> entry : map.entrySet()){
// String stringOne = entry.getKey() + "---" + entry.getValue();
// string += stringOne;
// string += "\r\n";
// }
File fptwo = new File("D:\\LDA\\java\\result\\countword\\" + "count");
PrintWriter pfpTwo = new PrintWriter(fptwo);
pfpTwo.write(string);
pfpTwo.close();
}
//保留4位有效数字
public static double getFourDecimal(double num){
DecimalFormat dFormat = new DecimalFormat("#.0000");
String string = dFormat.format(num);
Double temp = Double.valueOf(string);
return temp;
}
}