用java实现大数据去重、词频统计、排序

程序员文章站 2024-02-24 11:11:46

...

概述

前提：数据源可以完整装入内存（不会导致内存溢出）

使用 HashMap 做去重，使用 TreeMap 做词频统计，再把统计结果转换成 List，用 Collections.sort 按词频降序排序

原代码

KeyWordCount.java

import util.TimeUtil;

import java.io.*;
import java.util.*;

/**
 * Deduplicates search-keyword records, counts how many distinct records mention each keyword,
 * and writes the keywords to a report file in descending order of that count.
 */
public class KeyWordCount {
    /**
     * Run report accumulated by {@link #Read2Map(String)}, {@link #main(String[])} and
     * {@link #CountForMap(HashMap)}; it is written at the top of the output file.
     */
    static String log = "";

    /**
     * Reads the data source, deduplicates it and produces the frequency report.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Placeholder literal: replace with the real path of the data source.
        String fileStr = "数据源路径1";

        long start_1 = System.currentTimeMillis();
        HashMap<String, String> map = Read2Map(fileStr);
        long end_1 = System.currentTimeMillis();
        System.out.printf("%s：%dms %n", "读取+分割+去重字符串耗时", end_1 - start_1);
        log = log + "读取+分割+去重字符串耗时：" + (end_1 - start_1) + "ms" + "\n";
        CountForMap(map);
    }

    /**
     * Reads a tab-separated, GBK-encoded file line by line and deduplicates its records.
     *
     * <p>Each record is reduced to {@code column[1] + "\t" + keyword}, where the keyword is
     * column[2] with its first and last character removed and then trimmed. Lines that have
     * fewer than three columns, or whose third column is shorter than two characters, cannot be
     * reduced that way; they are counted and skipped instead of aborting the whole run.
     *
     * <p>This method restarts {@link #log}: the previous content is replaced by the line counts.
     * If reading fails, the stack trace is printed and the records read so far are returned.
     *
     * @param fileStr path of the data source
     * @return map whose keys are the deduplicated records; every value is {@code null}
     */
    public static HashMap<String, String> Read2Map(String fileStr) {
        HashMap<String, String> rdMap = new HashMap<>();
        File f = new File(fileStr);
        int total = 0;
        int skipped = 0;
        // The stream is its own resource so it is closed even if creating the reader fails.
        try (
                FileInputStream fis = new FileInputStream(f);
                InputStreamReader isr = new InputStreamReader(fis, "GBK"); // source data is GBK text
                BufferedReader br = new BufferedReader(isr)
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                total++;
                // Note: split drops trailing empty columns, so "a\tb\t" yields only two columns.
                String[] columns = line.split("\t");
                if (columns.length < 3 || columns[2].length() < 2) {
                    skipped++;
                    continue;
                }
                String keyword = columns[2].substring(1, columns[2].length() - 1).trim();
                rdMap.put(columns[1] + "\t" + keyword, null);
            }
        } catch (IOException e) {
            // No sentinel entry is added here: a key without a keyword column cannot be counted.
            e.printStackTrace();
        }
        System.out.println("原数据条数：" + total);
        log = "原数据条数：" + total + "\n";
        if (skipped > 0) {
            System.out.println("跳过格式错误的行数：" + skipped);
            log = log + "跳过格式错误的行数：" + skipped + "\n";
        }
        System.out.println("去重后数据条数：" + rdMap.size());
        log = log + "去重后数据条数：" + rdMap.size() + "\n";

        return rdMap;
    }

    /**
     * Counts keyword frequencies, sorts them in descending order and writes the report to
     * {@code src/testFile/input_<timestamp>.txt}, preceded by the content of {@link #log}.
     *
     * @param rdMap deduplicated records as produced by {@link #Read2Map(String)}; only the keys
     *              are used, each expected to look like {@code "<column>\t<keyword>"}
     */
    public static void CountForMap(HashMap<String, String> rdMap) {
        long start = System.currentTimeMillis();
        TreeMap<String, Integer> countMap = countKeywords(rdMap);
        long end = System.currentTimeMillis();
        System.out.printf("%s：%dms %n", "字符串频度统计耗时", end - start);
        log = log + "字符串频度统计耗时：" + (end - start) + "ms" + "\n";
        System.out.println("统计数据条数：" + countMap.size());
        log = log + "统计数据条数：" + countMap.size() + "\n";

        start = System.currentTimeMillis();
        List<Map.Entry<String, Integer>> countList = sortByCountDescending(countMap);
        end = System.currentTimeMillis();
        System.out.printf("%s：%dms %n", "字符串频度排序耗时", end - start);
        log = log + "字符串频度排序耗时：" + (end - start) + "ms" + "\n";

        System.out.println("");

        start = System.currentTimeMillis();
        writeReport(new File("src/testFile/input_" + TimeUtil.getTime() + ".txt"), countList);
        end = System.currentTimeMillis();
        System.out.printf("%s：%dms %n", "结果写入耗时", end - start);
    }

    /** Counts, per keyword (second tab-separated field of each key), how many keys carry it. */
    private static TreeMap<String, Integer> countKeywords(HashMap<String, String> rdMap) {
        TreeMap<String, Integer> countMap = new TreeMap<>();
        int skipped = 0;
        for (String key : rdMap.keySet()) {
            String[] parts = (key == null) ? new String[0] : key.split("\t");
            if (parts.length < 2) {
                // Happens when the keyword is empty: split drops the trailing empty field.
                skipped++;
                continue;
            }
            Integer seen = countMap.get(parts[1]);
            countMap.put(parts[1], seen == null ? 1 : seen + 1);
        }
        if (skipped > 0) {
            System.err.println("跳过无关键词的记录数：" + skipped);
        }
        return countMap;
    }

    /** Returns the entries ordered by count, highest first; equal counts stay in key order. */
    private static List<Map.Entry<String, Integer>> sortByCountDescending(TreeMap<String, Integer> countMap) {
        List<Map.Entry<String, Integer>> countList = new ArrayList<>(countMap.entrySet());
        Collections.sort(countList, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        return countList;
    }

    /** Writes {@link #log}, a blank line and one {@code keyword:count} line per entry to target. */
    private static void writeReport(File target, List<Map.Entry<String, Integer>> countList) {
        File dir = target.getParentFile();
        if (dir != null) {
            // Result deliberately ignored: if the directory is still missing, opening the
            // writer below fails and that failure is reported.
            dir.mkdirs();
        }
        // FileWriter encodes with the platform default charset, as the original code did.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(target))) {
            out.write(log);
            out.write("\n");
            for (Map.Entry<String, Integer> entry : countList) {
                out.write(entry.getKey() + ":" + entry.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

TimeUtil.java

package util;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Helpers for producing and reformatting compact {@code yyyyMMddHHmmss} timestamps.
 */
public class TimeUtil {

    /** Number of characters in a {@code yyyyMMddHHmmss} timestamp. */
    private static final int TIME_CODE_LENGTH = 14;

    /** Placeholder returned when a timestamp cannot be reformatted. */
    private static final String NO_DATA = "无数据";

    /**
     * Returns the current time as a compact timestamp.
     *
     * @return the current time formatted as a 14-character {@code yyyyMMddHHmmss} string
     */
    public static String getTime() {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created on every call.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
        return df.format(new Date());
    }

    /**
     * Converts a compact timestamp into a short, human-readable form.
     *
     * <p>The result keeps only the two-digit year and drops the seconds, e.g.
     * {@code "20240224111146"} becomes {@code "24/02/24 11:11"}. The input is sliced by
     * position only; its characters are not validated as digits.
     *
     * @param timeCode 14-character {@code yyyyMMddHHmmss} timestamp; may be {@code null}
     * @return the timestamp as {@code yy/MM/dd HH:mm}, or {@code "无数据"} when
     *         {@code timeCode} is {@code null} or not exactly 14 characters long
     */
    public static String timeC2S(String timeCode) {
        if (timeCode == null || timeCode.length() != TIME_CODE_LENGTH) {
            return NO_DATA;
        }
        return timeCode.substring(2, 4) + "/" + timeCode.substring(4, 6) + "/" + timeCode.substring(6, 8)
                + " " + timeCode.substring(8, 10) + ":" + timeCode.substring(10, 12);
    }
}

测试

用java实现大数据去重、词频统计、排序

上一篇：命令三十二: wc

下一篇：文本查看及处理工具wc、cut、sort 、uniq、 diff、 patch