mapreduce求共同好友案例示例
案例需求:
求出两两之间的共同好友
一:案例分析图解
------------------------------------------------------------第一阶段----------------------------------------------------------------------------
1.导入pom.xml依赖。特别留意打包插件(maven-shade-plugin),它可以把依赖的jar包一并打进最终的jar,保证程序可以在hadoop集群上运行
<!-- Hadoop 2.7.5 libraries used to compile and submit the MapReduce jobs below. -->
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.5</version>
    </dependency>
    <!--
        JUnit is pinned to a fixed version: the "RELEASE" meta-version is deprecated in Maven 3
        and makes the build non-reproducible. It is only needed for tests, so it is kept out of
        the shaded job jar via the test scope.
    -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
</dependencies>
<!-- Build configuration: compile for Java 8 and produce a shaded (fat) jar at package time. -->
<build>
<plugins>
<!-- Compiler plugin: Java 1.8 source/target level, sources read as UTF-8. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<!-- Optional: <verbose>true</verbose> prints detailed compiler output. -->
</configuration>
</plugin>
<!-- Shade plugin: bundles the project's dependencies into the jar so it can be submitted to the cluster. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<!-- Run the shade goal as part of "mvn package". -->
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- Drops dependency classes the project does not reference, to shrink the jar. -->
<!-- NOTE(review): classes that are only loaded reflectively may be removed too; confirm the job still runs. -->
<minimizeJar>true</minimizeJar>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
2.map阶段:把k1 v1 --> k2 v2
/**
 * Stage-1 mapper: inverts each {@code user:friend1,friend2,...} input line.
 *
 * <p>For every friend listed after the colon it emits {@code (friend, user)}, so the reducer
 * receives, per friend, every user whose line lists that friend.
 */
public class MyMapper extends Mapper<LongWritable,Text,Text,Text> {
    // Reused for every record: context.write serializes the contents immediately,
    // so there is no need to allocate two new Text objects per input line.
    private final Text friendKey = new Text();
    private final Text userValue = new Text();

    /**
     * Splits one input line and writes one {@code (friend, user)} pair per listed friend.
     *
     * @param key     byte offset of the line in the input (unused)
     * @param value   one input line, expected as {@code user:friend1,friend2,...}
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] parts = value.toString().split(":");
        // Blank lines and lines without a friend list have no second part; skip them
        // instead of failing the whole task with ArrayIndexOutOfBoundsException.
        if (parts.length < 2) {
            return;
        }

        // The user is the same for every pair produced from this line, so set it once.
        userValue.set(parts[0]);

        for (String friend : parts[1].split(",")) {
            // Ignore empty entries produced by doubled commas such as "A:B,,C".
            if (friend.isEmpty()) {
                continue;
            }
            friendKey.set(friend);
            context.write(friendKey, userValue);
        }
    }
}
3.reduce阶段:把k2 v2 --> k3 v3
/**
 * Stage-1 reducer: for one friend (the key), concatenates every user received as a value
 * and emits the concatenation as the output key with the friend as the output value.
 */
public class MyReducer extends Reducer<Text,Text,Text,Text> {
    /**
     * Joins all values into a single string, each value followed by {@code "-"}
     * (so the result ends with a trailing {@code "-"}), and writes {@code (joined, key)}.
     *
     * @param key     the friend shared by all users in {@code values}
     * @param values  the users emitted by the mapper for this friend
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joinedUsers = new StringBuilder();
        values.forEach(user -> joinedUsers.append(user.toString()).append("-"));

        Text outputKey = new Text(joinedUsers.toString());
        context.write(outputKey, key);
    }
}
4.主方法,执行程序入口类
/**
 * Driver for stage 1 of the common-friends job: wires {@link MyMapper} and {@link MyReducer}
 * into a job that reads text input and writes text output.
 */
public class JonMain extends Configured implements Tool {
    /** Input location used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://node01:8020/input";
    /** Output location used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://node01:8020/output01";

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args optional overrides: {@code args[0]} is the input path and {@code args[1]}
     *             the output path; any that are missing fall back to the built-in defaults
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured, submitted or monitored
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Path inputPath = new Path(args != null && args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path outputPath = new Path(args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        Job job = Job.getInstance(conf, "friend");
        // Tells Hadoop which jar contains the job classes; without it, tasks running on the
        // cluster may fail to load the mapper and reducer.
        job.setJarByClass(JonMain.class);

        // Step 1: input format and input path.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, inputPath);

        // Step 2: map phase.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Steps 3-6 (partition, sort, combine, group) use the framework defaults.

        // Step 7: reduce phase.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Step 8: output format and output path.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);

        // Remove a leftover output directory so that re-running the job does not fail.
        // The file system is resolved from the output path itself, and the delete is
        // explicitly recursive (the one-argument delete is deprecated).
        FileSystem fileSystem = outputPath.getFileSystem(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        // Block until the job finishes, printing progress.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Program entry point: runs the job through {@link ToolRunner} and exits with its status.
     *
     * @param args command-line arguments, forwarded to {@link #run(String[])}
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int exitCode = ToolRunner.run(configuration, new JonMain(), args);
        System.exit(exitCode);
    }
}
--------------------------------------------------------------第二阶段----------------------------------------------------------------------------
1.map阶段
/**
 * Stage-2 mapper: turns each stage-1 line {@code user1-user2-...<TAB>friend} into one record
 * per pair of users.
 *
 * <p>The users are sorted first so that a pair is always written in the same order
 * (for example always {@code A-B}, never {@code B-A}); each pair is emitted as
 * {@code (userX-userY, friend)}.
 */
public class MyMapper extends Mapper<LongWritable,Text,Text,Text> {
    // Reused for every record: context.write serializes the contents immediately,
    // so there is no need to allocate two new Text objects per input line.
    private final Text pairKey = new Text();
    private final Text friendValue = new Text();

    /**
     * Emits {@code (userX-userY, friend)} for every unordered pair of users on the line.
     *
     * @param key     byte offset of the line in the input (unused)
     * @param value   one stage-1 output line: "-"-separated users, a tab, then the friend
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        // Lines without a tab-separated friend cannot be processed; skip them instead of
        // failing the whole task with ArrayIndexOutOfBoundsException.
        if (fields.length < 2) {
            return;
        }

        String[] users = fields[0].split("-");
        // Sort once, before pairing. The order does not change between iterations, so
        // re-sorting inside the loop was redundant work.
        Arrays.sort(users);

        friendValue.set(fields[1]);

        for (int i = 0; i < users.length - 1; i++) {
            for (int j = i + 1; j < users.length; j++) {
                pairKey.set(users[i] + "-" + users[j]);
                context.write(pairKey, friendValue);
            }
        }
    }
}
2.reduce阶段
/**
 * Stage-2 reducer: for one user pair (the key), concatenates every friend received as a value
 * and emits the pair together with that concatenation.
 */
public class MyReducer extends Reducer<Text,Text,Text,Text> {
    /**
     * Joins all values into a single string, each value followed by a tab character
     * (so the result ends with a trailing tab), and writes {@code (key, joined)}.
     *
     * @param key     the user pair emitted by the mapper
     * @param values  the friends emitted for this pair
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joinedFriends = new StringBuilder();
        values.forEach(friend -> joinedFriends.append(friend.toString()).append("\t"));

        Text outputValue = new Text(joinedFriends.toString());
        context.write(key, outputValue);
    }
}
3.主方法
/**
 * Driver for stage 2 of the common-friends job: reads the stage-1 output and wires
 * {@link MyMapper} and {@link MyReducer} into a job that writes the final text output.
 */
public class JonMain extends Configured implements Tool {
    /**
     * Input location used when no path is given on the command line.
     * NOTE(review): this names a single part file; if stage 1 is ever run with more than one
     * reducer, pass the whole stage-1 output directory as {@code args[0]} instead.
     */
    private static final String DEFAULT_INPUT = "hdfs://node01:8020/output01/part-r-00000";
    /** Output location used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://node01:8020/output02";

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args optional overrides: {@code args[0]} is the input path and {@code args[1]}
     *             the output path; any that are missing fall back to the built-in defaults
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured, submitted or monitored
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Path inputPath = new Path(args != null && args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path outputPath = new Path(args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        Job job = Job.getInstance(conf, "friend");
        // Tells Hadoop which jar contains the job classes; without it, tasks running on the
        // cluster may fail to load the mapper and reducer.
        job.setJarByClass(JonMain.class);

        // Step 1: input format and input path.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, inputPath);

        // Step 2: map phase.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Steps 3-6 (partition, sort, combine, group) use the framework defaults.

        // Step 7: reduce phase.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Step 8: output format and output path.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);

        // Remove a leftover output directory so that re-running the job does not fail.
        // The file system is resolved from the output path itself, and the delete is
        // explicitly recursive (the one-argument delete is deprecated).
        FileSystem fileSystem = outputPath.getFileSystem(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        // Block until the job finishes, printing progress.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Program entry point: runs the job through {@link ToolRunner} and exits with its status.
     *
     * @param args command-line arguments, forwarded to {@link #run(String[])}
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int exitCode = ToolRunner.run(configuration, new JonMain(), args);
        System.exit(exitCode);
    }
}
---------------------------------------------------------------结果对比示例---------------------------------------------------------------------------
1.第一阶段运行结果(每行:前面是以"-"拼接的用户列表,后面是这些用户共同拥有的那个好友)
I-K-B-G-F-H-O-C-D- A
A-F-C-J-E- B
E-A-H-B-F-G-K- C
E-C-L-A-F-H-G- D
K- D
F-M-L-H-G-D-C-B-A- E
D-A-M- F
G-L- F
M- G
O- H
O- I
C- I
O- J
B- K
D-E- L
E- M
F- M
F- O
J-I-H-A- O
2.第二阶段运行结果(每行:前面是一对用户,后面是这两个用户的全部共同好友)
A-B E C
A-C B E D
A-D F E
A-E C B D
A-F E B D C
A-G E D C
A-H C O D E
A-I O
A-J B O
A-K C
A-L E D
A-M F E
B-C A E
B-D E A
B-E C
B-F C E A
B-G C A E
B-H E C A
B-I A
B-K C A
B-L E
B-M E
B-O A
C-D E A
C-E B D
C-F E D B A
C-G D E A
C-H E D A
C-I A
C-J B
C-K A
C-L D E
C-M E
C-O A
D-E L
D-F E A
D-G A E
D-H A E
D-I A
D-K A
D-L E
D-M E F
D-O A
E-F D C B
E-G D C
E-H D C
E-J B
E-K C
E-L D
F-G D C A E
F-H A C D E
F-I A
F-J B
F-K C A
F-L D E
F-M E
F-O A
G-H A D E C
G-I A
G-K A C
G-L D E F
G-M E
G-O A
H-I A O
H-J O
H-K A C
H-L E D
H-M E
H-O A
I-J O
I-K A
I-O A
K-O A
L-M E
上一篇: Kylin3.1.1集成CDH6.2.1
下一篇: Airflow docker版配置、部署