Hadoop_WordCount Word Count
- Create the hadoop02 project
- pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.blu</groupId>
  <artifactId>hadoop02</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
  </dependencies>
</project>
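A quick way to confirm that these dependencies resolve is to print the Hadoop version found on the classpath. The VersionCheck class below is a hypothetical helper, not part of the project itself:

import org.apache.hadoop.util.VersionInfo;

public class VersionCheck {
    public static void main(String[] args) {
        // Prints 2.9.2 if the Hadoop artifacts declared above are on the classpath
        System.out.println("Hadoop version: " + VersionInfo.getVersion());
    }
}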
- MyWordCountMapper
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Input types:
 * KEYIN: by default, the input key is the byte offset at which the current line starts
 *        in the input file. The plain Java type would be Long, but Hadoop ships its own
 *        serialization framework, so LongWritable is used instead of Long.
 * VALUEIN: by default, the input value is the content of one line. The plain Java type
 *          would be String, so Text is used here.
 *
 * Output types:
 * KEYOUT: the key type the map method emits.
 * VALUEOUT: the value type the map method emits.
 *
 * Input format:
 * good morning
 * good afternoon
 *
 * Output format:
 * good 1
 * morning 1
 * good 1
 * afternoon 1
 *
 * @author BLU
 */
public class MyWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text text = new Text();
    private IntWritable iw = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Get one line of input, e.g. "good morning"
        String content = value.toString();
        // Split the line into words and emit a (word, 1) pair for each one
        String[] vals = content.split(" ");
        for (String v : vals) {
            text.set(v);
            context.write(text, iw);
        }
    }
}
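Note that split(" ") assumes words are separated by exactly one space; split("\\s+") is a common hardening against tabs and repeated spaces. The mapper logic can also be verified without a cluster. The test below is a sketch assuming the Apache MRUnit library (org.apache.mrunit:mrunit:1.1.0, classifier hadoop2) is added as a test dependency; the project is retired but still works against Hadoop 2.x:

package com.blu.mywordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class MyWordCountMapperTest {
    @Test
    public void emitsOnePairPerWord() throws Exception {
        // Feed one line to the mapper and assert the (word, 1) pairs it emits
        MapDriver.newMapDriver(new MyWordCountMapper())
                .withInput(new LongWritable(0), new Text("good morning"))
                .withOutput(new Text("good"), new IntWritable(1))
                .withOutput(new Text("morning"), new IntWritable(1))
                .runTest();
    }
}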
- MyWordCountReducer
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Input types: the mapper's output types (KEYIN, VALUEIN).
 * Output types: the job's final output types (KEYOUT, VALUEOUT).
 *
 * Final output format:
 * good 2
 * morning 1
 * afternoon 1
 *
 * @author BLU
 */
public class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Data arriving from the mapper (sorted by key in dictionary order):
     * afternoon 1
     * good 1
     * good 1
     * morning 1
     */
    private IntWritable iwsum = new IntWritable();

    @Override
    protected void reduce(Text text, Iterable<IntWritable> value,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Sum the counts for one word
        int sum = 0;
        for (IntWritable iw : value) {
            sum += iw.get();
        }
        iwsum.set(sum);
        // Emitted output:
        // afternoon 1
        // good 2
        // morning 1
        context.write(text, iwsum);
    }
}
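Because the reduce logic is a plain integer sum, which is associative and commutative, the same class can also serve as a combiner, pre-aggregating map output locally and shrinking the shuffle. This is an optional one-line addition to the driver shown next:

job.setCombinerClass(MyWordCountReducer.class);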
- MyWordCount
package com.blu.mywordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Set the class whose jar should be shipped to the cluster
            job.setJarByClass(MyWordCount.class);
            // Set the Mapper and Reducer classes
            job.setMapperClass(MyWordCountMapper.class);
            job.setReducerClass(MyWordCountReducer.class);
            // Set the mapper's output types and the job's final output types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Set the input and output paths, passed on the command line, e.g.:
            // hadoop jar example.jar wordcount /input/a.txt /output
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Run the job and wait for it to finish
            boolean flag = job.waitForCompletion(true);
            // Exit with 0 on success, 1 on failure
            System.exit(flag ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
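FileOutputFormat fails the job if the output path already exists. An optional guard, a sketch using the standard FileSystem API (add an import for org.apache.hadoop.fs.FileSystem), can delete a stale output directory in main before the job is submitted:

FileSystem fs = FileSystem.get(conf);
Path out = new Path(args[1]);
if (fs.exists(out)) {
    fs.delete(out, true); // true = delete the directory recursively
}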
- Create a log4j.properties file in the resources directory
### Root logger ###
log4j.rootLogger = debug,stdout,D,E
### Log to the console ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
### Log DEBUG and above to E://logs/log.log ###
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = E://logs/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
### Log ERROR and above to E://logs/error.log ###
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File = E://logs/error.log
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
- Create testdata.txt under D:\data with the following content:
good morning
good afternoon
good evening
- Run MyWordCount's main method with two program arguments: the input file (D:\data\testdata.txt) and an output path that does not yet exist (D:\data\output)
- Result
An output directory is generated under D:\data containing four files (typically part-r-00000, _SUCCESS, and their .crc checksums). Opening part-r-00000 in a text editor shows:
afternoon 1
evening 1
good 3
morning 1
Second way to run the job
- Build the jar
Right-click the project >> Run As >> Maven build…
Enter package in the Goals field
- The jar hadoop02-0.0.1-SNAPSHOT.jar is generated in the project's target directory
- Upload the jar to the virtual machine, and upload a testdata.txt file with the following content to HDFS:
good morning
good afternoon
good evening
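For example, assuming testdata.txt is in the current directory on the virtual machine, it can be uploaded to the HDFS root with:

hdfs dfs -put testdata.txt /testdata.txt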
- Run the following command:
hadoop jar hadoop02-0.0.1-SNAPSHOT.jar com.blu.mywordcount.MyWordCount /testdata.txt /output
hadoop jar (path to the jar to run) (fully qualified name of the class containing main) (input path on HDFS) (output path on HDFS; must point to a directory that does not yet exist)
- An /output directory and several files under it are generated on HDFS
- Run the following command to download the files into the VM's local output directory:
hdfs dfs -get /output/* /output
- Run the following command to view the part-r-00000 file:
cat output/part-r-00000
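Alternatively, the result can be read straight from HDFS without downloading it first:

hdfs dfs -cat /output/part-r-00000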
- Result:
afternoon 1
evening 1
good 3
morning 1