
Hadoop Word Count: A Detailed Example


Preparation

Environment

hadoop-2.8.5, JDK 1.8.0_91
For detailed environment setup, see https://blog.csdn.net/zhi_zixing/article/details/100520916

Sample input, wordCountText.txt:
zixing
zixing ziyue ziyang zicheng
ziyu
xingxing
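
For reference, the expected output for this sample is one line per distinct word with its count, tab-separated and sorted by key (MapReduce sorts reducer input by key):

xingxing	1
zicheng	1
zixing	2
ziyang	1
ziyu	1
ziyue	1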

With IntelliJ IDEA, no Hadoop plugin needs to be installed; Eclipse requires one. This walkthrough uses IDEA.

Maven configuration
<dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-common</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-client</artifactId>
            <version>3.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-annotations</artifactId>
            <version>3.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.2</version>
        </dependency>
    </dependencies>
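
A note on the dependency list: hadoop-client is an aggregator artifact that already pulls in hadoop-common, the HDFS client, and the MapReduce/YARN client modules transitively, so for this example a leaner pom that declares only hadoop-client should usually suffice:

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.1.2</version>
    </dependency>
</dependencies>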
log4j configuration
# set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.threshold=ALL
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
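Save this as log4j.properties under src/main/resources so Maven places it on the classpath; with it, the MapReduce client's progress logs show up in the IDEA console.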
Code walkthrough
package ai.zixing;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;


public class WordCount {
    public static class WordCountMap extends Mapper<Object, Text, Text, IntWritable> {
        // Hadoop serializes keys and values as they move between tasks,
        // so the Writable wrapper types are used instead of plain int/String
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Called once per input line: key is the line's byte offset in the file,
        // value is the content of that line (lines are delimited by \n or \r\n)
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // StringTokenizer (java.util) splits the line into words on whitespace
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // Called once per key with every count emitted for it; sums them into the total
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the Hadoop configuration
        Configuration conf = new Configuration();

        // Parse the <input> and <output> arguments; GenericOptionsParser also
        // consumes generic Hadoop options (e.g. -D key=value) before returning the rest
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        // Create the Job object (throws IOException)
        Job job = Job.getInstance(conf, "zixingWordCount");
        job.setJarByClass(WordCount.class);

        // Set the Mapper class at runtime (can also be specified via configuration files)
        job.setMapperClass(WordCountMap.class);
        // Set the Combiner class; none runs by default. The Reducer is reused here,
        // which is safe because summing counts is associative and commutative
        job.setCombinerClass(WordCountReduce.class);
        // Set the Reducer class at runtime (can also be specified via configuration files)
        job.setReducerClass(WordCountReduce.class);

        // Set the type of the job's output key
        job.setOutputKeyClass(Text.class);
        // Set the type of the job's output value
        job.setOutputValueClass(IntWritable.class);

        /* Alternatively, hard-code the paths instead of reading them from args:
        String args_1 = "hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt";
        String args_2 = "hdfs://192.168.225.101:9000/wordCount/output2";
        FileInputFormat.addInputPath(job, new Path(args_1));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args_2)); // output path
        */
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        // Submit the job and wait for it to finish
        // (throws ClassNotFoundException, InterruptedException)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
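
To submit the job to the cluster from the command line, package the project and use hadoop jar. A minimal sketch, assuming the build produces target/wordcount-1.0.jar (the actual artifact name depends on your pom):

mvn clean package
hadoop jar target/wordcount-1.0.jar ai.zixing.WordCount \
    hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt \
    hdfs://192.168.225.101:9000/wordCount/output2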

Note: the sample data can be passed in as program arguments, or the paths can be hard-coded:

// Hard-coded paths
String args_1 = "hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt";
String args_2 = "hdfs://192.168.225.101:9000/wordCount/output2";
// Input path
FileInputFormat.addInputPath(job, new Path(args_1));
// Output path
FileOutputFormat.setOutputPath(job, new Path(args_2));
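
One caveat with a fixed output path: FileOutputFormat refuses to start the job if the output directory already exists, so delete it before re-running, for example:

hdfs dfs -rm -r /wordCount/output2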
Run configuration

Configure the execution user:

-DHADOOP_USER_NAME=hadoop
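
Add this to the VM options of the IDEA run configuration. In simple-auth (non-Kerberos) setups, Hadoop's UserGroupInformation reads HADOOP_USER_NAME (as a system property or environment variable) to decide which user the job is submitted as, which avoids permission errors when your local OS user differs from the HDFS user (here, hadoop).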

If you pass the sample data as program arguments instead, add the two paths under Program arguments in the IDEA run configuration:

hdfs://192.168.225.101:9000/wordCount/input/wordCountText.txt
hdfs://192.168.225.101:9000/wordCount/output2
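
After the job completes, each reducer writes its results to a file named like part-r-00000 under the output directory; inspect them with:

hdfs dfs -cat /wordCount/output2/part-r-*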