MR -- join操作

程序员文章站 2022-04-28 14:46:39

...

一、Reduce Join

1.1 原理

1、Map 端的主要工作：为来自不同表或文件的 key/value 对，打标签以区别不同来源的记录。然后用连接字段作为 key，其余部分和新加的标志作为 value，最后进行输出。

2、Reduce 端的主要工作：在 Reduce 端，以连接字段作为 key 的分组已经完成，我们只需要在每一个分组当中将那些来源于不同文件的记录(在 Map 阶段已经打标志)分开，最后进行合并就 ok 了。

1.2 案例

1、需求
将商品信息表中数据根据商品 pid 合并到订单数据表中。

2、数据说明
订单表(id, pid, amount)

商品信息表(pid, pname)

01	小米
02	华为
03	格力

最终数据形式：

1001	小米	1
1004	小米	4
1002	华为	2
1005	华为	5
1003	格力	3
1006	格力	6

数据都以 “\t” 分割

3、流程分析
以关联条件（连接字段）作为 Map 输出的 key，让两表中满足 Join 条件的数据携带各自来源文件的信息，发往同一个 ReduceTask，在 Reduce 中进行数据的串联。
MR -- join操作

4、编写代码
(1) 编写商品和订单合并后的 Bean 类

package reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Value object carried through the reduce-side join. One instance holds either an
 * order record or a product record; {@code flag} tells the two sources apart.
 *
 * <p>The snake_case field names are kept on purpose: the accessor names derived from
 * them ({@code getOrder_id}, {@code getP_id}) are used by the other classes of this
 * package.
 *
 * @author hyr (2020/3/1 9:11)
 */
public class TableBean implements Writable {
    private String order_id; // order id
    private String p_id; // product id
    private int amount; // quantity ordered
    private String pname; // product name
    private String flag; // marks which table the record came from

    /** Creates an empty bean; fields are filled later by setters or {@link #readFields}. */
    public TableBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param order_id order id
     * @param p_id     product id
     * @param amount   quantity ordered
     * @param pname    product name
     * @param flag     marker of the source table
     */
    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    /**
     * Serializes the fields in the fixed order order_id, p_id, amount, pname, flag.
     *
     * <p>{@code writeUTF} rejects {@code null}, so unset String fields are written as
     * the empty string; after a round trip through {@link #readFields} they therefore
     * come back as {@code ""} rather than {@code null}.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(order_id));
        out.writeUTF(nullToEmpty(p_id));
        out.writeInt(amount);
        out.writeUTF(nullToEmpty(pname));
        out.writeUTF(nullToEmpty(flag));
    }

    /**
     * Restores the fields; the read order must mirror {@link #write} exactly.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        order_id = in.readUTF();
        p_id = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    /**
     * Formats the joined record as tab-separated order id, product name and amount.
     * The trailing tab is kept so the output stays byte-identical to earlier runs.
     */
    @Override
    public String toString() {
        return order_id + "\t" + pname + "\t" + amount + "\t";
    }

    /** Maps {@code null} to the empty string so the value can be passed to {@code writeUTF}. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}

(2) 编写 Mapper 类

package reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper of the reduce-side join. Tags every input record with the table it came from
 * and emits it keyed by product id, so that orders and the matching product record
 * end up in the same reduce group.
 *
 * @author hyr (2020/3/1 9:21)
 */
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
    // Name of the file behind the current input split; decides how each line is parsed.
    String name;
    // One bean and one key are reused for every record; all fields are overwritten per call.
    TableBean bean = new TableBean();
    Text k = new Text();

    /**
     * Remembers the name of the file this task reads.
     *
     * @param context task context; its input split is cast to {@link FileSplit}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1. Get the input split of this task
        FileSplit split = (FileSplit) context.getInputSplit();

        // 2. Get the name of the file it belongs to
        name = split.getPath().getName();
    }

    /**
     * Parses one tab-separated line and emits it as {@code (pid, bean)}.
     *
     * <p>Lines from files whose name starts with {@code "order"} are read as
     * {@code id, pid, amount}; lines from any other file are read as {@code pid, pname}.
     * Blank lines are skipped. A non-blank line with too few columns still fails with
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param key     offset of the line (unused)
     * @param value   the line itself
     * @param context sink for the tagged record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get the input line
        String line = value.toString();

        // A blank line carries no record; indexing its split result would throw.
        if (line.trim().isEmpty()) {
            return;
        }

        // 2. Split once; both tables use the same separator
        String[] fields = line.split("\t");

        // 3. Fill the bean according to the source table
        if (name.startsWith("order")) { // order table
            bean.setOrder_id(fields[0]);
            bean.setP_id(fields[1]);
            bean.setAmount(Integer.parseInt(fields[2]));
            bean.setPname("");
            bean.setFlag("order");

            k.set(fields[1]);
        } else { // product table
            bean.setP_id(fields[0]);
            bean.setPname(fields[1]);
            bean.setFlag("pd");
            bean.setAmount(0);
            bean.setOrder_id("");

            k.set(fields[0]);
        }

        // 4. Emit
        context.write(k, bean);
    }
}

(3) 编写 Reducer 类

package reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

/**
 * Reducer of the reduce-side join. For each product id it collects the order records,
 * picks up the product record, and writes every order with the product name filled in.
 *
 * @author hyr (2020/3/1 9:38)
 */
public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {
    /**
     * Joins all records that share one product id.
     *
     * <p>If the group contains no product record, the orders are still written and
     * their product name is whatever an empty {@link TableBean} returns. If it contains
     * several product records, the last one seen wins.
     *
     * @param key     product id shared by the group
     * @param values  order and product records of that id, told apart by their flag
     * @param context sink for the joined order records
     */
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        // 1. Collection for the order records of this group
        ArrayList<TableBean> orderBeans = new ArrayList<>();

        // 2. Holder for the product record of this group
        TableBean pdBean = new TableBean();

        // The framework may hand out the same TableBean instance for every element of
        // values, so anything that must outlive the iteration is copied first.
        for (TableBean bean : values) {
            if ("order".equals(bean.getFlag())) { // order table
                orderBeans.add(copyOf(bean));
            } else { // product table
                pdBean = copyOf(bean);
            }
        }

        // 3. Join: put the product name into every order
        for (TableBean orderBean : orderBeans) {
            orderBean.setPname(pdBean.getPname());

            // 4. Write the joined record
            context.write(orderBean, NullWritable.get());
        }
    }

    /**
     * Returns an independent copy of the given bean. A plain constructor call is used
     * instead of reflective property copying, so the copy cannot fail half-way and
     * leave an empty bean behind.
     */
    private static TableBean copyOf(TableBean source) {
        return new TableBean(
                source.getOrder_id(),
                source.getP_id(),
                source.getAmount(),
                source.getPname(),
                source.getFlag());
    }
}

(4) 编写 Driver 类

package reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver of the reduce-side join job: wires {@link TableMapper} and
 * {@link TableReducer} together and submits the job.
 *
 * @author hyr (2020/3/1 9:51)
 */

public class TableDriver {
    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input directory and {@code args[1]} the output
     *             directory; when fewer than two arguments are given, the local sample
     *             paths {@code F:/input} and {@code F:/output} are used
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the sample paths only when none were supplied, so that paths
        // passed on the command line are no longer ignored.
        if (args == null || args.length < 2) {
            args = new String[]{"F:/input", "F:/output"};
        }

        // 1. Get the configuration and the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Locate the jar through the driver class
        job.setJarByClass(TableDriver.class);

        // 3. Set the Mapper and Reducer classes of this job
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        // 4. Set the key/value types emitted by the Mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        // 5. Set the key/value types of the final output
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Set the input and output directories
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

5、程序运行结果
MR -- join操作

二、Map Join

2.1 介绍

1、使用场景
Map Join 适用于一张表十分小、一张表很大的场景。

2、优点
思考：在 Reduce 端处理过多的表，非常容易产生数据倾斜。怎么办？
在 Map 端缓存多张表，提前处理业务逻辑，这样增加 Map 端业务，减少 Reduce 端数据的压力，尽可能的减少数据倾斜。

3、具体办法：采用 DistributedCache
(1) 在 Mapper 的 setup 阶段，将文件读取到缓存集合中。
(2) 在驱动函数中加载缓存。

// Cache a regular file on the nodes where the tasks run.
// A local Windows path needs three slashes: in "file://F:/pd.txt" the drive letter
// "F:" is read as the URI authority instead of being part of the path.
job.addCacheFile(new URI("file:///F:/pd.txt"));

2.2 案例

1、需求
将商品信息表中数据根据商品 id 合并到订单数据表中。

2、数据说明
订单数据表(id, pid, amount)

商品信息表(pid, pname)

01	小米
02	华为
03	格力

最终数据形式

1001	01	1	小米
1002	02	2	华为
1003	03	3	格力
1004	01	4	小米
1005	02	5	华为
1006	03	6	格力

3、流程分析
MapJoin 适用于关联表中有小表的情形。
MR -- join操作
4、编写代码
(1) 编写 Mapper 类

package mapjoin;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper of the map-side join. Loads the small product table from the first cache
 * file into memory during setup, then appends the product name to every order line.
 */
public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // Product table held in memory: pid -> pname.
    HashMap<String, String> pdMap = new HashMap<>();

    /**
     * Reads the cached product table ({@code pid \t pname} per line) into {@code pdMap}.
     *
     * <p>Blank lines are skipped and the file is read to its end; the reader is closed
     * even when reading fails. A non-blank line without a second column still fails
     * with {@link ArrayIndexOutOfBoundsException}.
     *
     * @param context task context that provides the registered cache files
     * @throws IOException if no cache file is registered or the file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Look up the cached small table
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null || cacheFiles.length == 0) {
            throw new IOException("No cache file registered; add the product table with job.addCacheFile(...)");
        }

        // Only one file is cached, so take the first one
        String path = cacheFiles[0].getPath();

        // try-with-resources closes the reader on every exit path
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
            String line;
            // Read until end of file; stopping at the first empty line would silently
            // drop every product listed after it.
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }

                String[] fields = line.split("\t");
                pdMap.put(fields[0], fields[1]);
            }
        }
    }

    // Output key, reused for every record.
    Text k = new Text();

    /**
     * Appends the product name to one order line ({@code id \t pid \t amount}) and
     * emits the result as the key with no value.
     *
     * <p>Blank lines are skipped. When the pid is not in the product table, the lookup
     * returns {@code null} and the literal text {@code null} is appended.
     *
     * @param key     offset of the line (unused)
     * @param value   the order line
     * @param context sink for the joined line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line
        String line = value.toString();

        // A blank line carries no order; indexing its split result would throw.
        if (line.trim().isEmpty()) {
            return;
        }

        // 2. Split
        String[] fields = line.split("\t");

        // 3. Get the pid
        String pid = fields[1];

        // 4. Look up the pname
        String pname = pdMap.get(pid);

        // 5. Join
        line = line + "\t" + pname;

        k.set(line);

        // 6. Emit
        context.write(k, NullWritable.get());
    }
}

(2) 编写 Driver 类

package mapjoin;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver of the map-side join job: registers the product table as a cache file and
 * runs {@link DistributedCacheMapper} without a reduce phase.
 *
 * @author hyr (2020/3/1 10:16)
 */
public class DistributedCacheDriver {
    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input directory and {@code args[1]} the output
     *             directory; when fewer than two arguments are given, the local sample
     *             paths {@code F:/input} and {@code F:/output} are used. An optional
     *             {@code args[2]} is the URI of the product table to cache and defaults
     *             to {@code file:///F:/pd.txt}
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the sample paths only when none were supplied; adjust them to
        // the local machine or pass real paths on the command line.
        if (args == null || args.length < 2) {
            args = new String[]{"F:/input", "F:/output"};
        }
        String cacheFileUri = args.length > 2 ? args[2] : "file:///F:/pd.txt";

        // 1. Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Locate the jar through the driver class
        job.setJarByClass(DistributedCacheDriver.class);

        // 3. Set the Mapper class
        job.setMapperClass(DistributedCacheMapper.class);

        // 4. Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5. Set the input and output directories
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6. Register the small table as a cache file
        job.addCacheFile(new URI(cacheFileUri));

        // 7. The join happens in the map phase, so no reduce tasks are needed
        job.setNumReduceTasks(0);

        // 8. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

5、程序结果
MR -- join操作

MR -- join操作

一、Reduce Join

1.1 原理

1.2 案例

二、Map Join

2.1 介绍

2.2 案例

PHP绕过open_basedir限制操作文件的方法

PHP中模拟链表和链表的基本操作示例

MVC4制作网站教程第三章修改用户组操作3.3

mysql学习笔记之表的基本操作

了解JavaScript表单操作和表单域

MVC4制作网站教程第三章浏览用户组操作3.1

javascript系统时间设置操作示例

JavaScript中join()方法的使用简介

MVC4制作网站教程第三章添加用户组操作3.2

MVC4制作网站教程第三章删除用户组操作3.4

MR -- join操作

一、Reduce Join

1.1 原理

1.2 案例

二、Map Join

2.1 介绍

2.2 案例

PHP绕过open_basedir限制操作文件的方法

PHP中模拟链表和链表的基本操作示例

MVC4制作网站教程第三章 修改用户组操作3.3

mysql学习笔记之表的基本操作

了解JavaScript表单操作和表单域

MVC4制作网站教程第三章 浏览用户组操作3.1

javascript系统时间设置操作示例

JavaScript中join()方法的使用简介

MVC4制作网站教程第三章 添加用户组操作3.2

MVC4制作网站教程第三章 删除用户组操作3.4

MVC4制作网站教程第三章修改用户组操作3.3

MVC4制作网站教程第三章浏览用户组操作3.1

MVC4制作网站教程第三章添加用户组操作3.2

MVC4制作网站教程第三章删除用户组操作3.4