Hadoop Word Count: A Detailed Example
Preparation
Environment
hadoop-2.8.5, JDK 1.8.0_91
For the cluster setup itself, see https://blog.csdn.net/zhi_zixing/article/details/100520916
Sample input wordCountText.txt:
zixing
zixing ziyue ziyang zicheng
ziyu
xingxing
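For reference, the finished job turns this sample into the output below: each line is one word and its count separated by a tab, and the keys come out sorted because the framework orders Text keys by their byte representation before the single reducer writes them.
xingxing	1
zicheng	1
zixing	2
ziyang	1
ziyu	1
ziyue	1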
IntelliJ IDEA needs no Hadoop plugin (Eclipse does), so IDEA is used here.
Maven configuration. Note that the Hadoop artifact versions should track the cluster's Hadoop version; the environment above runs 2.8.5, so adjust the version property below accordingly:
<properties>
    <!-- ideally keep this in sync with the cluster's Hadoop version (2.8.5 in the environment above) -->
    <hadoop.version>3.1.2</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-api</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-annotations</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
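As an aside, hadoop-client is an aggregator artifact that already pulls in the common, HDFS, and MapReduce client libraries transitively, so a leaner POM can usually get by with just that one entry; keep the full list above if the build needs the YARN artifacts pinned explicitly:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>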
log4j configuration (save it as src/main/resources/log4j.properties so it ends up on the classpath):
# set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.threshold=ALL
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
Code walkthrough
package ai.zixing;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class WordCountMap extends Mapper<Object, Text, Text, IntWritable> {
        // Writables travel through streams (serialization/deserialization),
        // which is why IntWritable/Text are used instead of plain int/String
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // the input is split into lines (delimited by \n or \r\n);
        // map() is invoked once per line, with value holding that line
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // StringTokenizer (a JDK utility class) splits the line on whitespace
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }
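
    // Example trace: for the sample line "zixing ziyue ziyang zicheng" this
    // mapper emits (zixing,1), (ziyue,1), (ziyang,1), (zicheng,1); the
    // framework then groups all pairs by key before they reach the reducer.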
    public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // values holds every count emitted for this key;
            // summing them gives the word's total number of occurrences
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
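
    // Example trace: with the sample input the reducer receives (zixing, [1, 1])
    // and writes (zixing, 2); every other word arrives as (word, [1]).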
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // initialize the Hadoop configuration
        Configuration conf = new Configuration();
        // pick up the remaining <input> <output> arguments after generic options
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // create the Job object (this is what throws IOException)
        Job job = Job.getInstance(conf, "zixingWordCount");
        job.setJarByClass(WordCount.class);
        // set the Mapper; this can also be specified via configuration files
        job.setMapperClass(WordCountMap.class);
        // set the Combiner; it is optional, and the Reducer can be reused here
        // because summing is associative and its input/output types match
        job.setCombinerClass(WordCountReduce.class);
        // set the Reducer; this can also be specified via configuration files
        job.setReducerClass(WordCountReduce.class);
        // set the job's output key type
        job.setOutputKeyClass(Text.class);
        // set the job's output value type
        job.setOutputValueClass(IntWritable.class);
        /* Alternatively, hard-code the paths instead of passing arguments:
        String args_1 = "hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt";
        String args_2 = "hdfs://192.168.225.101:9000/wordCount/output2";
        FileInputFormat.addInputPath(job, new Path(args_1));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args_2)); // output path
        */
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // submit the job to the cluster and wait for it to finish; waitForCompletion
        // is what throws ClassNotFoundException and InterruptedException
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
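To run outside the IDE, one option is to package the project and submit it with hadoop jar; the jar file name below is a placeholder for whatever mvn package actually produces:
mvn package
hadoop jar target/wordcount-1.0.jar ai.zixing.WordCount \
    hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt \
    hdfs://192.168.225.101:9000/wordCount/output2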
Note: the input and output locations can be passed in as program arguments (as above) or hard-coded as concrete paths:
// hard-coded paths
String args_1 = "hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt";
String args_2 = "hdfs://192.168.225.101:9000/wordCount/output2";
// input path
FileInputFormat.addInputPath(job, new Path(args_1));
// output path
FileOutputFormat.setOutputPath(job, new Path(args_2));
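One caveat with a fixed output path: FileOutputFormat fails the job if the output directory already exists. A minimal sketch for deleting it before each run, reusing conf and args_2 from the code above (add import org.apache.hadoop.fs.FileSystem;):
// remove a leftover output directory so the job can be re-run;
// FileSystem.get/exists/delete are standard Hadoop FileSystem calls
FileSystem fs = FileSystem.get(java.net.URI.create(args_2), conf);
Path out = new Path(args_2);
if (fs.exists(out)) {
    fs.delete(out, true); // true = delete recursively
}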
Run configuration
Configure the user the job runs as by adding this to the VM options (Hadoop also honors a HADOOP_USER_NAME environment variable):
-DHADOOP_USER_NAME=hadoop
If the sample data is passed in as arguments, add the two paths to Program arguments:
hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt
hdfs://192.168.225.101:9000/wordCount/output2
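Before the first run, the sample file must already be in HDFS, and the result can be inspected there afterwards. With the stock hdfs CLI (paths matching the arguments above; part-r-00000 is the default output file name of the single reducer):
hdfs dfs -mkdir -p /wordCount/input
hdfs dfs -put wordCountText.txt /wordCount/input/
# after the job finishes:
hdfs dfs -cat /wordCount/output2/part-r-00000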