用java实现大数据去重、词频统计、排序
程序员文章站
2024-02-24 11:11:46
...
概述
前提:数据源可以一次性全部读入内存(不会导致内存溢出)
使用HashMap做去重,使用TreeMap做词频统计,再将统计结果转成List并用Collections.sort按词频降序排序
源代码
KeyWordCount.java
import util.TimeUtil;
import java.io.*;
import java.util.*;
/**
 * Deduplicates search-keyword records, counts how often each keyword occurs,
 * and writes the counts to a file in descending order of frequency.
 */
public class KeyWordCount {
    /** Run summary built up by the processing steps; written at the top of the result file. */
    static String log = "";

    /**
     * Entry point: reads and deduplicates the data source, then counts, sorts
     * and writes the keyword frequencies.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileStr = "数据源路径1";
        long start_1 = System.currentTimeMillis();
        HashMap<String, String> map = Read2Map(fileStr);
        long end_1 = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "读取+分割+去重字符串耗时", end_1 - start_1);
        log = log + "读取+分割+去重字符串耗时:" + (end_1 - start_1) + "ms" + "\n";
        CountForMap(map);
    }

    /**
     * Reads the file line by line, splits each line on tabs and deduplicates
     * the records.
     *
     * <p>The map key is {@code column[1] + "\t" + keyword}, where the keyword is
     * column[2] with its first and last character removed and then trimmed
     * (presumably stripping surrounding brackets or quotes -- confirm against
     * the data format). Map values are always {@code null}; only the key set
     * is meaningful.
     *
     * <p>Lines that do not have at least three columns, or whose third column
     * is shorter than two characters, are skipped instead of aborting the run.
     * If an I/O error occurs, the stack trace is printed and the records read
     * so far are returned.
     *
     * <p>Side effect: resets {@link #log} to the summary of this read.
     *
     * @param fileStr path of the data source
     * @return map whose keys are the deduplicated records
     */
    public static HashMap<String, String> Read2Map(String fileStr) {
        HashMap<String, String> rdMap = new HashMap<>();
        File f = new File(fileStr);
        int i = 0;       // number of raw lines read
        int skipped = 0; // number of lines that could not be parsed
        try (
                // The input is decoded as GBK so that the Chinese text is read correctly.
                InputStreamReader isr = new InputStreamReader(new FileInputStream(f), "GBK");
                BufferedReader br = new BufferedReader(isr)
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                i++;
                String[] lineArr = line.split("\t");
                // Guard against short or malformed lines: indexing them would throw.
                if (lineArr.length < 3 || lineArr[2].length() < 2) {
                    skipped++;
                    continue;
                }
                String keyword = lineArr[2].substring(1, lineArr[2].length() - 1).trim();
                rdMap.put(lineArr[1] + "\t" + keyword, null);
            }
        } catch (IOException e) {
            // No placeholder entry is added here: it would distort the
            // deduplicated count and cannot be split into columns later.
            e.printStackTrace();
        }
        if (skipped > 0) {
            System.err.println("跳过格式不符的数据行数:" + skipped);
        }
        System.out.println("原数据条数:" + i);
        log = "原数据条数:" + i + "\n";
        System.out.println("去重后数据条数:" + rdMap.size());
        log = log + "去重后数据条数:" + rdMap.size() + "\n";
        return rdMap;
    }

    /**
     * Counts keyword frequencies over the deduplicated records, sorts them in
     * descending order of frequency and writes the result to
     * {@code src/testFile/input_<timestamp>.txt}.
     *
     * <p>The keyword is the part of each key after the first tab. Keys without
     * a tab are ignored. The output file starts with the accumulated
     * {@link #log}, followed by one {@code keyword:count} line per keyword.
     *
     * @param rdMap deduplicated records as produced by {@link #Read2Map(String)}
     */
    public static void CountForMap(HashMap<String, String> rdMap) {
        long start = System.currentTimeMillis();
        TreeMap<String, Integer> countMap = new TreeMap<>();
        for (String key : rdMap.keySet()) {
            // Limit -1 keeps a trailing empty field, so a key such as "x\t"
            // (empty keyword) still yields two parts.
            String[] parts = key.split("\t", -1);
            if (parts.length < 2) {
                continue;
            }
            String keyStr = parts[1];
            Integer num = countMap.get(keyStr);
            countMap.put(keyStr, num == null ? 1 : num + 1);
        }
        long end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串频度统计耗时", end - start);
        log = log + "字符串频度统计耗时:" + (end - start) + "ms" + "\n";
        System.out.println("统计数据条数:" + countMap.size());
        log = log + "统计数据条数:" + countMap.size() + "\n";

        start = System.currentTimeMillis();
        // Copy the entries into a list so they can be ordered by value.
        List<Map.Entry<String, Integer>> countList = new ArrayList<>(countMap.entrySet());
        // Collections.sort is stable and the list starts in TreeMap key order,
        // so keywords with equal counts stay sorted by keyword.
        Collections.sort(countList, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue()); // descending
            }
        });
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串频度排序耗时", end - start);
        log = log + "字符串频度排序耗时:" + (end - start) + "ms" + "\n";
        System.out.println("");

        start = System.currentTimeMillis();
        File f = new File("src/testFile/input_" + TimeUtil.getTime() + ".txt");
        // Create the output directory if needed; otherwise opening the file fails.
        File parent = f.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        try (
                BufferedWriter fr = new BufferedWriter(new FileWriter(f))
        ) {
            fr.write(log);
            fr.write("\n");
            for (Map.Entry<String, Integer> mapping : countList) {
                fr.write(mapping.getKey() + ":" + mapping.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "结果写入耗时:", end - start);
    }
}
TimeUtil.java
package util;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Helpers for producing and reformatting compact timestamps.
 */
public class TimeUtil {
    /**
     * Returns the current time.
     *
     * @return the current time formatted with the pattern {@code yyyyMMddHHmmss}
     *         (14 characters)
     */
    public static String getTime() {
        // A new formatter is created per call; SimpleDateFormat is not thread-safe.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
        return df.format(new Date());
    }

    /**
     * Converts a compact 14-character timestamp into a short, separated form.
     *
     * <p>The result uses a two-digit year and drops the seconds, e.g.
     * {@code "20240224111146"} becomes {@code "24/02/24 11:11"}. The input is
     * sliced by position only; its characters are not validated as digits.
     *
     * @param timeCode timestamp in {@code yyyyMMddHHmmss} form; may be {@code null}
     * @return the timestamp as {@code yy/MM/dd HH:mm}, or {@code "无数据"} when
     *         {@code timeCode} is {@code null} or not exactly 14 characters long
     */
    public static String timeC2S(String timeCode) {
        if (timeCode == null || timeCode.length() != 14) {
            return "无数据";
        }
        return timeCode.substring(2, 4) + "/" + timeCode.substring(4, 6) + "/" + timeCode.substring(6, 8)
                + " " + timeCode.substring(8, 10) + ":" + timeCode.substring(10, 12);
    }
}
测试
推荐阅读