Map-Reduce应用
软硬件环境
名称 版本
系统 Ubuntu 18.04.4 LTS
内存 7.5GiB
处理器 Intel Core i7-8565U CPU @ 1.80GHz *8
图形 Intel UHD Graphics(Whiskey Lake 3*8 GT2)
GNOME 3.28.2
操作系统类型 64位
磁盘 251.0 GB
Hadoop 2.10.0
Eclipse Eclipse IDE for Java Developers 2019-06 (4.12.0)
步骤
①首先在eclipse中创建一个Java Project,并命名为WordCount,并引入hadoop相关的jar包。
创建完成之后界面如下:
然后在eclipse中创建user library,命名为hadoop2_jars,并根据下面的4个目录添加jar包。
为了编写一个MapReduce程序,一般需要向Java工程中添加以下JAR包:
(1)“/usr/local/hadoop/share/hadoop/common”目录下的hadoop-common-2.10.0.jar和hadoop-nfs-2.10.0.jar;
(2)“/usr/local/hadoop/share/hadoop/common/lib”目录下的所有JAR包;
(3)“/usr/local/hadoop/share/hadoop/mapreduce”目录下的所有JAR包,但是,不包括jdiff、lib、lib-examples和sources目录
(4)“/usr/local/hadoop/share/hadoop/mapreduce/lib”目录下的所有JAR包。
添加的所有jar包为:
1 /usr/local/hadoop/share/hadoop/common/hadoop-common-2.10.0.jar
2 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.10.0.jar
3 /usr/local/hadoop/share/hadoop/common/lib/commons-cli-1.2.jar
4 /usr/local/hadoop/share/hadoop/common/hadoop-nfs-2.10.0.jar
5 /usr/local/hadoop/share/hadoop/common/lib/activation-1.1.jar
6 /usr/local/hadoop/share/hadoop/common/lib/apacheds-i18n-2.0.0-M15.jar
7 /usr/local/hadoop/share/hadoop/common/lib/apacheds-kerberos-codec-2.0.0-M15.jar
8 /usr/local/hadoop/share/hadoop/common/lib/api-asn1-api-1.0.0-M20.jar
9 /usr/local/hadoop/share/hadoop/common/lib/api-util-1.0.0-M20.jar
10 /usr/local/hadoop/share/hadoop/common/lib/asm-3.2.jar
11 /usr/local/hadoop/share/hadoop/common/lib/avro-1.7.7.jar
12 /usr/local/hadoop/share/hadoop/common/lib/commons-beanutils-1.9.4.jar
13 /usr/local/hadoop/share/hadoop/common/lib/commons-codec-1.4.jar
14 /usr/local/hadoop/share/hadoop/common/lib/commons-collections-3.2.2.jar
15 /usr/local/hadoop/share/hadoop/common/lib/commons-compress-1.19.jar
16 /usr/local/hadoop/share/hadoop/common/lib/commons-configuration-1.6.jar
17 /usr/local/hadoop/share/hadoop/common/lib/commons-digester-1.8.jar
18 /usr/local/hadoop/share/hadoop/common/lib/commons-io-2.4.jar
19 /usr/local/hadoop/share/hadoop/common/lib/commons-lang3-3.4.jar
20 /usr/local/hadoop/share/hadoop/common/lib/commons-lang-2.6.jar
21 /usr/local/hadoop/share/hadoop/common/lib/commons-logging-1.1.3.jar
22 /usr/local/hadoop/share/hadoop/common/lib/commons-math3-3.1.1.jar
23 /usr/local/hadoop/share/hadoop/common/lib/commons-net-3.1.jar
24 /usr/local/hadoop/share/hadoop/common/lib/curator-client-2.7.1.jar
25 /usr/local/hadoop/share/hadoop/common/lib/curator-framework-2.7.1.jar
26 /usr/local/hadoop/share/hadoop/common/lib/curator-recipes-2.7.1.jar
27 /usr/local/hadoop/share/hadoop/common/lib/gson-2.2.4.jar
28 /usr/local/hadoop/share/hadoop/common/lib/guava-11.0.2.jar
29 /usr/local/hadoop/share/hadoop/common/lib/hadoop-annotations-2.10.0.jar
30 /usr/local/hadoop/share/hadoop/common/lib/hadoop-auth-2.10.0.jar
31 /usr/local/hadoop/share/hadoop/common/lib/hamcrest-core-1.3.jar
32 /usr/local/hadoop/share/hadoop/common/lib/htrace-core4-4.1.0-incubating.jar
33 /usr/local/hadoop/share/hadoop/common/lib/httpclient-4.5.2.jar
34 /usr/local/hadoop/share/hadoop/common/lib/httpcore-4.4.4.jar
35 /usr/local/hadoop/share/hadoop/common/lib/jackson-core-asl-1.9.13.jar
36 /usr/local/hadoop/share/hadoop/common/lib/jackson-jaxrs-1.9.13.jar
37 /usr/local/hadoop/share/hadoop/common/lib/jackson-mapper-asl-1.9.13.jar
38 /usr/local/hadoop/share/hadoop/common/lib/jackson-xc-1.9.13.jar
39 /usr/local/hadoop/share/hadoop/common/lib/java-xmlbuilder-0.4.jar
40 /usr/local/hadoop/share/hadoop/common/lib/jaxb-api-2.2.2.jar
41 /usr/local/hadoop/share/hadoop/common/lib/jaxb-impl-2.2.3-1.jar
42 /usr/local/hadoop/share/hadoop/common/lib/jcip-annotations-1.0-1.jar
43 /usr/local/hadoop/share/hadoop/common/lib/jersey-core-1.9.jar
44 /usr/local/hadoop/share/hadoop/common/lib/jersey-json-1.9.jar
45 /usr/local/hadoop/share/hadoop/common/lib/jersey-server-1.9.jar
46 /usr/local/hadoop/share/hadoop/common/lib/jets3t-0.9.0.jar
47 /usr/local/hadoop/share/hadoop/common/lib/jettison-1.1.jar
48 /usr/local/hadoop/share/hadoop/common/lib/jetty-6.1.26.jar
49 /usr/local/hadoop/share/hadoop/common/lib/jetty-sslengine-6.1.26.jar
50 /usr/local/hadoop/share/hadoop/common/lib/jetty-util-6.1.26.jar
51 /usr/local/hadoop/share/hadoop/common/lib/jsch-0.1.54.jar
52 /usr/local/hadoop/share/hadoop/common/lib/json-smart-1.3.1.jar
53 /usr/local/hadoop/share/hadoop/common/lib/jsp-api-2.1.jar
54 /usr/local/hadoop/share/hadoop/common/lib/jsr305-3.0.0.jar
55 /usr/local/hadoop/share/hadoop/common/lib/junit-4.11.jar
56 /usr/local/hadoop/share/hadoop/common/lib/log4j-1.2.17.jar
57 /usr/local/hadoop/share/hadoop/common/lib/mockito-all-1.8.5.jar
58 /usr/local/hadoop/share/hadoop/common/lib/netty-3.10.6.Final.jar
59 /usr/local/hadoop/share/hadoop/common/lib/nimbus-jose-jwt-4.41.1.jar
60 /usr/local/hadoop/share/hadoop/common/lib/paranamer-2.3.jar
61 /usr/local/hadoop/share/hadoop/common/lib/protobuf-java-2.5.0.jar
62 /usr/local/hadoop/share/hadoop/common/lib/servlet-api-2.5.jar
63 /usr/local/hadoop/share/hadoop/common/lib/slf4j-api-1.7.25.jar
64 /usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar
65 /usr/local/hadoop/share/hadoop/common/lib/snappy-java-1.0.5.jar
66 /usr/local/hadoop/share/hadoop/common/lib/stax2-api-3.1.4.jar
67 /usr/local/hadoop/share/hadoop/common/lib/stax-api-1.0-2.jar
68 /usr/local/hadoop/share/hadoop/common/lib/woodstox-core-5.0.3.jar
69 /usr/local/hadoop/share/hadoop/common/lib/xmlenc-0.52.jar
70 /usr/local/hadoop/share/hadoop/common/lib/zookeeper-3.4.9.jar
71 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-app-2.10.0.jar
72 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-common-2.10.0.jar
73 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-2.10.0.jar
74 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-plugins-2.10.0.jar
75 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.0.jar
76 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.0-tests.jar
77 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-shuffle-2.10.0.jar
78 /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.10.0.jar
79 /usr/local/hadoop/share/hadoop/mapreduce/lib/aopalliance-1.0.jar
80 /usr/local/hadoop/share/hadoop/mapreduce/lib/asm-3.2.jar
81 /usr/local/hadoop/share/hadoop/mapreduce/lib/avro-1.7.7.jar
82 /usr/local/hadoop/share/hadoop/mapreduce/lib/commons-compress-1.19.jar
83 /usr/local/hadoop/share/hadoop/mapreduce/lib/commons-io-2.4.jar
84 /usr/local/hadoop/share/hadoop/mapreduce/lib/guice-3.0.jar
85 /usr/local/hadoop/share/hadoop/mapreduce/lib/guice-servlet-3.0.jar
86 /usr/local/hadoop/share/hadoop/mapreduce/lib/hadoop-annotations-2.10.0.jar
87 /usr/local/hadoop/share/hadoop/mapreduce/lib/hamcrest-core-1.3.jar
88 /usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-core-asl-1.9.13.jar
89 /usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-mapper-asl-1.9.13.jar
90 /usr/local/hadoop/share/hadoop/mapreduce/lib/javax.inject-1.jar
91 /usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-core-1.9.jar
92 /usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-guice-1.9.jar
93 /usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-server-1.9.jar
94 /usr/local/hadoop/share/hadoop/mapreduce/lib/junit-4.11.jar
95 /usr/local/hadoop/share/hadoop/mapreduce/lib/leveldbjni-all-1.8.jar
96 /usr/local/hadoop/share/hadoop/mapreduce/lib/log4j-1.2.17.jar
97 /usr/local/hadoop/share/hadoop/mapreduce/lib/netty-3.10.6.Final.jar
98 /usr/local/hadoop/share/hadoop/mapreduce/lib/paranamer-2.3.jar
99 /usr/local/hadoop/share/hadoop/mapreduce/lib/protobuf-java-2.5.0.jar
100 /usr/local/hadoop/share/hadoop/mapreduce/lib/snappy-java-1.0.5.jar
②在eclipse的WordCount工程中创建java类:WordCount。并编写Map-Reduce相关的处理代码(见第六章的代码部分)。
效果如下图所示。
③使用eclipse打包,包的类型为Runnable Jar file。
在WordCount Project中右击,选择Export,然后选择Runnable JAR file
然后点击Next,会出现下图所示的界面
Finish之后,会发现/usr/local/hadoop/myapp目录下生成了WordCount.jar文件。
1 aaa@qq.com:myapp$ pwd
2 /usr/local/hadoop/myapp
3 aaa@qq.com:myapp$ ls
4 WordCount.jar
5 aaa@qq.com:myapp$
④使用hadoop运行生成的WordCount.jar程序。
首先切换到/usr/local/hadoop目录
aaa@qq.com:myapp$ cd ..
aaa@qq.com:hadoop$ pwd
/usr/local/hadoop
启动Hadoop,如果出现了下面的界面,说明启动成功了。
aaa@qq.com:hadoop$ jps
31137 Jps
20758 SecondaryNameNode
29095 org.eclipse.equinox.launcher_1.5.400.v20190515-0925.jar
20504 DataNode
20297 NameNode
可以看出,DataNode和NameNode都已经准备就绪。
通过hdfs dfs -ls命令查看当前目录下是否存在hdfs文件或者文件夹。
aaa@qq.com:hadoop$ hdfs dfs -ls
20/05/28 10:18:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x - acat supergroup 0 2020-05-28 00:18 input
drwxr-xr-x - acat supergroup 0 2020-05-28 00:19 output
可以看出,存在hdfs格式的input和output文件夹,所以现在将其删除。
aaa@qq.com:hadoop$ hdfs dfs -rm -r input output
20/05/28 10:18:15 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted input
Deleted output
然后再次使用hdfs dfs -ls命令进行查看,发现确实已经成功删除了。
aaa@qq.com:hadoop$ hdfs dfs -ls
20/05/28 10:18:21 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
创建hdfs格式的input文件夹
aaa@qq.com:hadoop$ hdfs dfs -mkdir input
20/05/28 10:26:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
aaa@qq.com:hadoop$ hdfs dfs -ls
20/05/28 10:26:31 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
drwxr-xr-x - acat supergroup 0 2020-05-28 10:26 input
在本地文件目录/usr/local/hadoop目录下创建两个文件:wordfile1.txt和wordfile2.txt
aaa@qq.com:hadoop$ vi wordfile1.txt
aaa@qq.com:hadoop$ vi wordfile2.txt
wordfile1.txt的文件内容是:
I love SPark
I love Hadoop
Hello world
It's sunney today
wordfile2.txt的文件内容是:
Hadoop is good
SPark is fast
Big Data is important
把wordfile1.txt和wordfile2.txt这两个文件复制到hdfs的input目录中。
aaa@qq.com:hadoop$ hdfs dfs -put wordfile* input
20/05/28 10:30:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
使用hadoop命令运行eclipse中导出的WordCount.jar文件(Map-Reduce程序)。
aaa@qq.com:hadoop$ hadoop jar myapp/WordCount.jar input output
20/05/28 10:33:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/05/28 10:33:19 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id
20/05/28 10:33:19 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
20/05/28 10:33:20 INFO input.FileInputFormat: Total input files to process : 2
20/05/28 10:33:20 INFO mapreduce.JobSubmitter: number of splits:2
20/05/28 10:33:20 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local220224193_0001
...此处省略若干行....
20/05/28 10:33:21 INFO mapred.LocalJobRunner: Finishing task: attempt_local220224193_0001_r_000000_0
20/05/28 10:33:21 INFO mapred.LocalJobRunner: reduce task executor complete.
20/05/28 10:33:21 INFO mapreduce.Job: Job job_local220224193_0001 running in uber mode : false
20/05/28 10:33:21 INFO mapreduce.Job: map 100% reduce 100%
20/05/28 10:33:21 INFO mapreduce.Job: Job job_local220224193_0001 completed successfully
20/05/28 10:33:21 INFO mapreduce.Job: Counters: 35
File System Counters
FILE: Number of bytes read=103852917
FILE: Number of bytes written=106112553
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=273
HDFS: Number of bytes written=112
HDFS: Number of read operations=22
HDFS: Number of large read operations=0
HDFS: Number of write operations=5
Map-Reduce Framework
Map input records=7
Map output records=21
Map output bytes=192
Map output materialized bytes=209
Input split bytes=232
Combine input records=21
Combine output records=17
Reduce input groups=15
Reduce shuffle bytes=209
Reduce input records=17
Reduce output records=15
Spilled Records=34
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=6
Total committed heap usage (bytes)=1172307968
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=108
File Output Format Counters
Bytes Written=112
通过hdfs dfs -ls命令查看,可以得知在hdfs中生成了output文件夹。
aaa@qq.com:hadoop$ hdfs dfs -ls
20/05/28 10:33:44 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x - acat supergroup 0 2020-05-28 10:30 input
drwxr-xr-x - acat supergroup 0 2020-05-28 10:33 output
⑤查看map-reduce运行结果
使用hdfs dfs -cat命令查看词频统计结果。
aaa@qq.com:hadoop$ hdfs dfs -cat output/*
20/05/28 10:34:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Big 1
Data 1
Hadoop 2
Hello 1
I 2
It's 1
SPark 2
fast 1
good 1
important 1
is 3
love 2
sunney 1
today 1
world 1
hadoop jar myapp/WordCount.jar input output部分运行结果。
运行结果展示
代码
1 import java.io.IOException;
2 import java.util.Iterator;
3 import java.util.StringTokenizer;
4 import org.apache.hadoop.conf.Configuration;
5 import org.apache.hadoop.fs.Path;
6 import org.apache.hadoop.io.IntWritable;
7 import org.apache.hadoop.io.Text;
8 import org.apache.hadoop.mapreduce.Job;
9 import org.apache.hadoop.mapreduce.Mapper;
10 import org.apache.hadoop.mapreduce.Reducer;
11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 import org.apache.hadoop.util.GenericOptionsParser;
14 public class WordCount {
15 public WordCount() {
16 }
17 public static void main(String[] args) throws Exception {
18 Configuration conf = new Configuration();
19 String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
20 if(otherArgs.length < 2) {
21 System.err.println("Usage: wordcount <in> [<in>...] <out>");
22 System.exit(2);
23 }
24 Job job = Job.getInstance(conf, "word count");
25 job.setJarByClass(WordCount.class);
26 job.setMapperClass(WordCount.TokenizerMapper.class);
27 job.setCombinerClass(WordCount.IntSumReducer.class);
28 job.setReducerClass(WordCount.IntSumReducer.class);
29 job.setOutputKeyClass(Text.class);
30 job.setOutputValueClass(IntWritable.class);
31 for(int i = 0; i < otherArgs.length - 1; ++i) {
32 FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
33 }
34 FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
35 System.exit(job.waitForCompletion(true)?0:1);
36 }
37 public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
38 private static final IntWritable one = new IntWritable(1);
39 private Text word = new Text();
40 public TokenizerMapper() {
41 }
42 public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
43 StringTokenizer itr = new StringTokenizer(value.toString());
44 while(itr.hasMoreTokens()) {
45 this.word.set(itr.nextToken());
46 context.write(this.word, one);
47 }
48 }
49 }
50 public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
51 private IntWritable result = new IntWritable();
52 public IntSumReducer() {
53 }
54 public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
55 int sum = 0;
56 IntWritable val;
57 for(Iterator i$ = values.iterator(); i$.hasNext(); sum += val.get()) {
58 val = (IntWritable)i$.next();
59 }
60 this.result.set(sum);
61 context.write(key, this.result);
62 }
63 }
64 }