MapReduce Basics
1. The WordCount program
1.1 WordCount source code
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: wordcount <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);  // combiner pre-aggregates map output locally
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // all arguments except the last are input paths; the last is the output path
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // split each input line into tokens and emit (word, 1) for every token
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // sum all counts received for a word and emit (word, total)
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
1.2 Run the program: Run As -> Java Application
1.3 Compile and package the program into a jar file
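If you are not exporting the jar from Eclipse, a minimal command-line sketch looks like the following (this assumes the Hadoop installation at /usr/local/hadoop used below, and produces /home/hadoop/wordcount.jar to match the path used in section 2.4):

javac -classpath "$(/usr/local/hadoop/bin/hadoop classpath)" WordCount.java
jar cf /home/hadoop/wordcount.jar WordCount*.class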
2. Running the program on Hadoop
2.1 Create the text files whose word frequencies will be counted
wordfile1.txt
spark hadoop
big data
wordfile2.txt
spark hadoop
big cloud
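A minimal sketch for creating these two files locally (assuming the /home/hadoop directory used in the upload step below):

printf 'spark hadoop\nbig data\n' > /home/hadoop/wordfile1.txt
printf 'spark hadoop\nbig cloud\n' > /home/hadoop/wordfile2.txt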
2.2 Start HDFS, create the input directory, and upload the text files
cd /usr/local/hadoop/
./sbin/start-dfs.sh
./bin/hadoop fs -mkdir input
./bin/hadoop fs -put /home/hadoop/wordfile1.txt input
./bin/hadoop fs -put /home/hadoop/wordfile2.txt input
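Note that input here is a relative HDFS path, so the directory is created under the current user's HDFS home directory (typically /user/hadoop). The same directory can also be listed with an absolute path:

./bin/hadoop fs -ls /user/hadoop/input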
2.3 Check the uploaded files:
hadoop@dblab-virtualbox:/usr/local/hadoop$ ./bin/hadoop fs -ls .
Found 2 items
drwxr-xr-x - hadoop supergroup 0 2019-02-11 15:40 input
-rw-r--r-- 1 hadoop supergroup 5 2019-02-10 20:22 test.txt
hadoop@dblab-virtualbox:/usr/local/hadoop$ ./bin/hadoop fs -ls ./input
Found 2 items
-rw-r--r-- 1 hadoop supergroup 27 2019-02-11 15:40 input/wordfile1.txt
-rw-r--r-- 1 hadoop supergroup 29 2019-02-11 15:40 input/wordfile2.txt
2.4 Run WordCount
./bin/hadoop jar /home/hadoop/wordcount.jar input output
A large amount of log output will be printed to the screen.
Then you can view the results:
hadoop@dblab-virtualbox:/usr/local/hadoop$ ./bin/hadoop fs -cat output/*
hadoop 2
spark 2
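To run the job again, delete the output directory first; otherwise Hadoop will refuse to start the job because the output path already exists:

./bin/hadoop fs -rm -r output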
---