Hadoop入门(二十二)Mapreduce的求平均值程序
程序员文章站
2022-07-07 21:32:50
...
一、简介
求平均值是统计中最常用到的操作之一,本文使用Mapreduce在海量数据中统计数值的平均值。
二、例子
(1)实例描述
给出三个文件,每个文件中都存储了若干个数值,求所有数值的平均值。
样例输入:
1)file1:
1
2
3
7
9
-99
2
2)file2:
11
2
23
17
9
199
22
3)file3:
21
12
3
17
2
39
12
期望输出:
14.952380952380953
(2)问题分析
实现统计海量数据的平均值,不能将所有的数据加载到内存,只能采用类似外部排序的分批处理方式:先加载一部分数据,统计其总和与个数,接着加载另一部分继续统计,最后用总和除以总个数得到平均值。
(3)实现步骤
1)Map过程
首先使用默认的TextInputFormat类对输入文件进行处理,得到文本中每行的偏移量及其内容。Map过程首先解析输入的<key,value>对,得到数值,然后在mapper中累计单个分片的总和与个数。
2)Reduce过程
经过map方法处理后,Reduce过程获取每个mapper输出的总和与个数,累加得到全部数据的总和与总个数,最后相除得到平均值。
(4)关键代码
package com.mk.mapreduce;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
/**
 * MapReduce job that computes the arithmetic mean of the integers stored in the input files.
 *
 * <p>Every non-blank input line is parsed as one {@code int}. Each map task emits a single
 * {@code (partialSum, partialCount)} record, and the reducer folds all partial records into a
 * global sum and count, writing {@code sum / count} as the only output record.
 *
 * <p>The result is only a global average when exactly one reduce task runs, so {@link #main}
 * pins the number of reduce tasks to 1.
 */
public class AvgValue {

    /**
     * Accumulates the sum and the number of values seen by one map task and emits them as a
     * single {@code (sum, count)} pair when the task finishes.
     */
    public static class AvgValueMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        /** Sum of all values parsed by this map task so far. */
        private int sumValue = 0;
        /** Number of values parsed by this map task so far. */
        private int count = 0;

        /**
         * Parses one input line as an integer and adds it to the running totals.
         * Blank lines are reported on stdout and skipped.
         *
         * @param key     offset of the line (unused)
         * @param value   text of the line
         * @param context task context (nothing is written here; output happens in cleanup)
         * @throws NumberFormatException if a non-blank line is not a valid {@code int}
         * @throws ArithmeticException   if the partial sum no longer fits in an {@code int}
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (StringUtils.isBlank(line)) {
                System.out.println("空白行");
                return;
            }
            int v = Integer.parseInt(line.trim());
            // Fail the task on overflow instead of silently wrapping around and
            // emitting a corrupted partial sum.
            sumValue = Math.addExact(sumValue, v);
            count++;
        }

        /**
         * Emits the partial result of this map task: key = partial sum, value = partial count.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new IntWritable(sumValue), new IntWritable(count));
        }
    }

    /**
     * Combines the {@code (partialSum, partialCount)} records of all map tasks and writes the
     * overall average once, when the reduce task finishes.
     */
    public static class AvgValueReducer extends Reducer<IntWritable, IntWritable, DoubleWritable, NullWritable> {
        // long accumulators: the total over many map tasks can exceed the int range
        // even though every individual partial sum fits in an int.
        private long sumValue = 0;
        private long count = 0;

        /**
         * Adds the partial results that share the given partial sum to the global totals.
         *
         * @param key     a partial sum emitted by one or more map tasks
         * @param values  the partial counts of every map task that emitted this partial sum
         * @param context task context (nothing is written here; output happens in cleanup)
         */
        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int partialSum = key.get();
            for (IntWritable partialCount : values) {
                // Records are grouped by key, so map tasks that happened to produce the
                // same partial sum arrive together: the key must be counted once per
                // value, not once per reduce() call.
                sumValue += partialSum;
                count += partialCount.get();
            }
        }

        /**
         * Writes the average of all values. When no value was seen the accumulated sum
         * (i.e. 0) is written instead of dividing by zero.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            double avg = sumValue;
            if (count != 0) {
                avg = (double) sumValue / count;
            }
            context.write(new DoubleWritable(avg), NullWritable.get());
        }
    }

    /**
     * Configures and runs the job against the hard-coded HDFS cluster, replacing any previous
     * output directory, and prints the job name together with its success flag.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String uri = "hdfs://192.168.150.128:9000";
        String input = "/avgValue/input";
        String output = "/avgValue/output";

        Configuration conf = new Configuration();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            conf.set("mapreduce.app-submission.cross-platform", "true");
        }

        // Remove the output directory of a previous run; the job refuses to start if it exists.
        FileSystem fileSystem = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(output);
        fileSystem.delete(path, true);

        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "AvgValue");
        job.setJar("./out/artifacts/hadoop_test_jar/hadoop-test.jar");
        job.setJarByClass(AvgValue.class);
        job.setMapperClass(AvgValueMapper.class);
        job.setReducerClass(AvgValueReducer.class);
        // The reducer keeps the global totals in its own fields, so the average is only
        // correct when a single reduce task sees every partial result.
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPaths(job, uri + input);
        FileOutputFormat.setOutputPath(job, new Path(uri + output));

        boolean ret = job.waitForCompletion(true);
        System.out.println(job.getJobName() + "-----" + ret);
    }
}
上一篇: Hadoop入门(二十三)Mapreduce的求数量最大程序
下一篇: ie6 png透明