欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

用java实现大数据去重、词频统计、排序

程序员文章站 2024-02-24 11:11:46
...

概述

前提:数据源不会爆内存

使用HashMap做去重,使用TreeMap做词频统计,再通过Collections.sort配合比较器按词频降序排序

 

原代码

KeyWordCount.java

import util.TimeUtil;

import java.io.*;
import java.util.*;

/**
 * Deduplicates search-log records, counts how many distinct records mention each
 * keyword, and writes the keywords to a file in descending order of that count.
 *
 * <p>All records are held in memory, so the input must fit in the heap.
 */
public class KeyWordCount {
    /** Run statistics accumulated as text; written at the top of the result file. */
    static String log = "";

    public static void main(String[] args) {
        String fileStr = "数据源路径1";

        long start_1 = System.currentTimeMillis();
        HashMap<String, String> map = Read2Map(fileStr);
        long end_1 = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "读取+分割+去重字符串耗时", end_1 - start_1);
        log = log + "读取+分割+去重字符串耗时:" + (end_1 - start_1) + "ms" + "\n";
        CountForMap(map);
    }

    /**
     * Reads a GBK-encoded, tab-separated file and deduplicates its records.
     *
     * <p>Each line is split on tabs. The record key is the second column, a tab, and
     * the third column with its first and last characters removed and then trimmed.
     * Lines with fewer than three columns, or whose third column is shorter than two
     * characters, cannot produce a key and are skipped; the number of skipped lines
     * is reported on standard error.
     *
     * <p>If reading fails part-way, the stack trace is printed and the records read
     * so far are returned.
     *
     * @param fileStr path of the data source
     * @return a map whose key set holds the distinct records; every value is {@code null}
     */
    public static HashMap<String, String> Read2Map(String fileStr) {
        HashMap<String, String> rdMap = new HashMap<>();
        File f = new File(fileStr);
        int total = 0;
        int skipped = 0;
        try (
                // The source data is GBK; decoding it explicitly avoids mojibake.
                InputStreamReader isr = new InputStreamReader(new FileInputStream(f), "GBK");
                BufferedReader br = new BufferedReader(isr)
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                total++;
                String[] fields = line.split("\t");
                // Guard the column accesses and the substring below against short lines.
                if (fields.length < 3 || fields[2].length() < 2) {
                    skipped++;
                    continue;
                }
                String keyword = fields[2].substring(1, fields[2].length() - 1).trim();
                rdMap.put(fields[1] + "\t" + keyword, null);
            }
        } catch (IOException e) {
            // No sentinel entry is added here: a key without a tab is not a valid
            // record and would only distort the counts downstream.
            e.printStackTrace();
        }
        if (skipped > 0) {
            System.err.println("跳过格式错误的数据行数:" + skipped);
        }
        System.out.println("原数据条数:" + total);
        // Plain assignment on purpose: each read starts a fresh statistics log.
        log = "原数据条数:" + total + "\n";
        System.out.println("去重后数据条数:" + rdMap.size());
        log = log + "去重后数据条数:" + rdMap.size() + "\n";

        return rdMap;
    }

    /**
     * Counts keyword frequency over the deduplicated records, sorts the keywords by
     * descending frequency, and writes the statistics log followed by one
     * {@code keyword:count} line per keyword to a timestamped file under
     * {@code src/testFile}.
     *
     * <p>Keys that are {@code null} or contain no tab are ignored. An empty keyword
     * is counted like any other keyword.
     *
     * @param rdMap deduplicated records as produced by {@link #Read2Map(String)}
     */
    public static void CountForMap(HashMap<String, String> rdMap) {
        long start = System.currentTimeMillis();
        // A TreeMap keeps keywords in natural order, so after the stable sort below
        // keywords with equal counts stay in that order.
        TreeMap<String, Integer> countMap = new TreeMap<>();
        for (String key : rdMap.keySet()) {
            String keyStr = keywordOf(key);
            if (keyStr == null) {
                continue;
            }
            Integer num = countMap.get(keyStr);
            countMap.put(keyStr, num == null ? 1 : num + 1);
        }
        long end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串频度统计耗时", end - start);
        log = log + "字符串频度统计耗时:" + (end - start) + "ms" + "\n";
        System.out.println("统计数据条数:" + countMap.size());
        log = log + "统计数据条数:" + countMap.size() + "\n";

        start = System.currentTimeMillis();
        List<Map.Entry<String, Integer>> countList = new ArrayList<>(countMap.entrySet());
        Collections.sort(countList, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                // Reversed operands give descending order.
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串频度排序耗时", end - start);
        log = log + "字符串频度排序耗时:" + (end - start) + "ms" + "\n";

        System.out.println("");

        start = System.currentTimeMillis();
        File f = new File("src/testFile/input_" + TimeUtil.getTime() + ".txt");
        // Create the output directory if needed so the results are not lost.
        File dir = f.getParentFile();
        if (dir != null && !dir.exists()) {
            dir.mkdirs();
        }
        try (
                BufferedWriter out = new BufferedWriter(new FileWriter(f))
        ) {
            out.write(log);
            out.write("\n");
            for (Map.Entry<String, Integer> mapping : countList) {
                out.write(mapping.getKey() + ":" + mapping.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "结果写入耗时", end - start);
    }

    /**
     * Extracts the second tab-separated field of a record key.
     *
     * <p>Unlike {@code key.split("\t")[1]}, this also works when the field is empty
     * and is the last one, a case in which {@code split} drops it.
     *
     * @param key record key, possibly {@code null}
     * @return the second field, or {@code null} if the key is {@code null} or has no tab
     */
    private static String keywordOf(String key) {
        if (key == null) {
            return null;
        }
        int first = key.indexOf('\t');
        if (first < 0) {
            return null;
        }
        int second = key.indexOf('\t', first + 1);
        return second < 0 ? key.substring(first + 1) : key.substring(first + 1, second);
    }
}

TimeUtil.java

package util;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

public class TimeUtil {

    /**
     * Returns the current time as a compact timestamp.
     *
     * @return the current time formatted with the pattern {@code yyyyMMddHHmmss}
     */
    public static String getTime() {
        // SimpleDateFormat is not thread-safe, so a new instance is created per call.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
        return df.format(new Date());
    }

    /**
     * Converts a compact 14-character timestamp into a short display form.
     *
     * <p>The result uses a two-digit year and omits the seconds, e.g.
     * {@code "20240224111146"} becomes {@code "24/02/24 11:11"}. The characters are
     * rearranged by position only; they are not validated as a date.
     *
     * @param timeCode timestamp in {@code yyyyMMddHHmmss} form; may be {@code null}
     * @return the timestamp as {@code yy/MM/dd HH:mm}, or {@code "无数据"} when
     *         {@code timeCode} is {@code null} or not exactly 14 characters long
     */
    public static String timeC2S(String timeCode) {
        if (timeCode == null || timeCode.length() != 14) {
            return "无数据";
        }
        return timeCode.substring(2, 4) + "/" + timeCode.substring(4, 6) + "/" + timeCode.substring(6, 8)
                + " " + timeCode.substring(8, 10) + ":" + timeCode.substring(10, 12);
    }
}

测试

用java实现大数据去重、词频统计、排序