Hadoop_WordCount Word Count


  1. Create the hadoop02 project

  2. The pom file

<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.blu</groupId>
	<artifactId>hadoop02</artifactId>
	<version>0.0.1-SNAPSHOT</version>

	<dependencies>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>2.9.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.9.2</version>
		</dependency>
	
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>2.9.2</version>
		</dependency>
		
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>

		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.8</version>
			<scope>system</scope>
			<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
		</dependency>
	</dependencies>

</project>

  3. MyWordCountMapper
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


/**
 * Input data types
 * KEYIN: by default, the input KEY is the byte offset at which the current
 * 			line starts in the file. Its logical type is Long; Hadoop ships its own
 * 			serialization framework, so LongWritable is used instead of Long.
 * VALUEIN:   by default, the input VALUE is the content of one line; its logical
 * 			type is String, which is represented here by Text.
 * 
 * Output data types
 * KEYOUT: the KEY type returned once the map method finishes
 * VALUEOUT: the VALUE type returned once the map method finishes
 * 
 * Input data format:
 * good morning
 * good afternoon
 * 
 * Output data format:
 * good 1
 * morning 1
 * good 1
 * afternoon 1
 * 
 * @author BLU
 */
public class MyWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
	
	private Text text = new Text();
	private IntWritable iw = new IntWritable(1);
	
	
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
			throws IOException, InterruptedException {
		
		//get one line of input
		String content = value.toString(); //e.g. "good morning"
		String[] vals = content.split(" ");
		for(String v : vals) {
			text.set(v);
			context.write(text, iw);
		}
	}
}
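
The mapper can be sanity-checked without a cluster by driving it directly from a unit test. Below is a minimal sketch using Apache MRUnit's MapDriver; it assumes you add the org.apache.mrunit:mrunit:1.1.0 test dependency (classifier hadoop2) to the pom, which is not part of the original project:

package com.blu.mywordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class MyWordCountMapperTest {

	@Test
	public void mapEmitsOnePairPerWord() throws Exception {
		//one input line in; one (word, 1) pair out per token, in order
		MapDriver.newMapDriver(new MyWordCountMapper())
			.withInput(new LongWritable(0), new Text("good morning"))
			.withOutput(new Text("good"), new IntWritable(1))
			.withOutput(new Text("morning"), new IntWritable(1))
			.runTest();
	}
}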

  4. MyWordCountReducer
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Input types: the mapper's output types
 * KEYIN, VALUEIN
 * Output types: the job's final output types
 * KEYOUT, VALUEOUT
 * 
 * Final output format:
 * good 2
 * morning 1
 * afternoon 1
 * 
 * @author BLU
 *
 */


public class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
	
	/**
	 * Data arriving from the mapper (sorted lexicographically by key):
	 * afternoon 1
	 * good 1
	 * good 1
	 * morning 1
	 */
	
	
	IntWritable iwsum = new IntWritable();
	
	@Override
	protected void reduce(Text text, Iterable<IntWritable> value,
			Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
		
		int sum = 0;
		for(IntWritable iw : value) {
			sum +=iw.get();
		}
		iwsum.set(sum);
		//data to emit:
//		afternoon 1
//		good 2
//		morning 1
		context.write(text, iwsum);
		
	}
	
}
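
The reducer can be checked the same way with MRUnit's ReduceDriver, feeding it the grouped values the shuffle would deliver (same assumed mrunit test dependency as above):

package com.blu.mywordcount;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MyWordCountReducerTest {

	@Test
	public void reduceSumsCountsPerKey() throws Exception {
		//two (good, 1) values grouped under one key should sum to (good, 2)
		ReduceDriver.newReduceDriver(new MyWordCountReducer())
			.withInput(new Text("good"), Arrays.asList(new IntWritable(1), new IntWritable(1)))
			.withOutput(new Text("good"), new IntWritable(2))
			.runTest();
	}
}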

  5. MyWordCount
package com.blu.mywordcount;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {
	
	public static void main(String[] args) {
		
		try {
			Configuration conf = new Configuration();
			Job job = Job.getInstance(conf);
			
			//set the class Hadoop uses to locate the job jar
			job.setJarByClass(MyWordCount.class);
			
			//set the mapper and reducer classes
			job.setMapperClass(MyWordCountMapper.class);
			job.setReducerClass(MyWordCountReducer.class);
			
			//the mapper's output types and the job's final output types
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(IntWritable.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			
			//set the input and output file paths
			// hadoop jar example.jar wordcount /input/a.txt /output
			FileInputFormat.addInputPath(job, new Path(args[0]));
			FileOutputFormat.setOutputPath(job, new Path(args[1]));
			
			//run the job
			boolean flag = job.waitForCompletion(true);
			//exit code 0 means normal termination
			//exit code 1 means abnormal termination
			System.exit(flag ? 0 : 1);
			
			
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}
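
Two optional refinements, sketched below rather than part of the original driver: FileOutputFormat refuses to write into an existing directory, so deleting a stale output path before submitting lets the job be rerun; and because summation is associative, the reducer can also be registered as a combiner to pre-aggregate on the map side. Both use standard Hadoop APIs:

//Optional snippet inside main(), after conf and job are created (a sketch, not in the original code):
Path output = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);           //requires: import org.apache.hadoop.fs.FileSystem;
if (fs.exists(output)) {
	fs.delete(output, true);                    //remove a stale output directory so the job can rerun
}
job.setCombinerClass(MyWordCountReducer.class); //map-side pre-aggregation; safe because sum is associative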

  6. Create a log4j.properties file in the resources directory
### settings ###
log4j.rootLogger = debug,stdout,D,E

### print log output to the console ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

### log DEBUG and above to E://logs/log.log ###
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = E://logs/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG 
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n

### log ERROR and above to E://logs/error.log ###
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File =E://logs/error.log 
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR 
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n

  7. Create testdata.txt under D:\data with the following content:
good morning
good afternoon
good evening

  8. Run MyWordCount's main method with program arguments: the input file path,
    and an output path that does not exist yet, e.g. D:\data\testdata.txt D:\data\output

  9. Result
    An output directory is created under D:\data, and four files are generated inside it.
    Open the part-r-00000 file with Notepad:

afternoon	1
evening	1
good	3
morning	1

Second way to run

  1. Build the jar
    Right-click the project >> Run As >> Maven build…
    Enter package as the goal in the Goals field
  2. The jar hadoop02-0.0.1-SNAPSHOT.jar is generated in the target directory under the project path
  3. Upload the jar to the virtual machine, and upload a testdata.txt file with the following content to HDFS
good morning
good afternoon
good evening
  4. Run the following command:
hadoop jar hadoop02-0.0.1-SNAPSHOT.jar com.blu.mywordcount.MyWordCount /testdata.txt /output
hadoop jar (path to the jar to run) (fully qualified name of the class containing main) (input file path, on HDFS) (output path, on HDFS; must be a path that does not exist yet)
  5. The output directory and its files are created in HDFS
  6. Run the following command to download the files into the virtual machine's output directory
hdfs dfs -get /output/* /output
  7. Run the following command to view the part-r-00000 file:
cat output/part-r-00000
  8. Result:
afternoon	1
evening	1
good	3
morning	1