大数据-Hadoop生态(15)-MapReduce框架原理-自定义FileInputFormat
程序员文章站
2023-11-15 23:54:10
1. 需求 将多个小文件合并成一个SequenceFile文件(SequenceFile文件是Hadoop用来存储二进制形式的key-value对的文件格式),SequenceFile里面存储着多个文件,存储的形式为文件路径+名称为key,文件内容为value 三个小文件 one.txt two.t ......
1. 需求
将多个小文件合并成一个SequenceFile文件(SequenceFile文件是Hadoop用来存储二进制形式的key-value对的文件格式),SequenceFile里面存储着多个文件,存储的形式为文件路径+名称为key,文件内容为value
三个小文件
one.txt
yongpeng weidong weinan sanfeng luozong xiaoming
two.txt
shuaige changmo zhenqiang dongli lingu xuanxuan
three.txt
longlong fanfan mazong kailun yuhang yixin longlong fanfan mazong kailun yuhang yixin
2. 需求分析
3.案例代码
1) 自定义RecordReader
package com.nty.inputformat; import org.apache.hadoop.fs.fsdatainputstream; import org.apache.hadoop.fs.filesystem; import org.apache.hadoop.fs.path; import org.apache.hadoop.io.byteswritable; import org.apache.hadoop.io.ioutils; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.inputsplit; import org.apache.hadoop.mapreduce.recordreader; import org.apache.hadoop.mapreduce.taskattemptcontext; import org.apache.hadoop.mapreduce.lib.input.filesplit; import java.io.ioexception; /** * author nty * date time 2018-12-11 9:10 */ public class customrecordreader extends recordreader<text, byteswritable> { /** * 由于采用了fileinputformat的输入方式,所以输入源3个文件,会分成三个切片,所以一个recordreader只处理一个文件,一次读完 */ //标记文件是否被读过,true表示没被读过 private boolean flag = true; private text key = new text(); private byteswritable value = new byteswritable(); //输入流 fsdatainputstream fis; private filesplit fs; /** * 初始化方法,只调用一次 * @param split * @param context * @throws ioexception * @throws interruptedexception */ public void initialize(inputsplit split, taskattemptcontext context) throws ioexception, interruptedexception { //filesplit是inputsplit的子类 fs = (filesplit) split; //获取文件路径 path path = fs.getpath(); //获取文件系统 filesystem filesystem = filesystem.get(context.getconfiguration()); //filesystem filesystem = path.getfilesystem(context.getconfiguration()); //开流 fis = filesystem.open(path); } /** * 读取下一组kv * @return 读到了返回true,反之返回false * @throws ioexception * @throws interruptedexception */ public boolean nextkeyvalue() throws ioexception, interruptedexception { if(flag){ //读取文件进入key和value string path = fs.getpath().tostring(); key.set(path); //文件是一次性读完,bytes的长度不能为普遍的1024,当然这么写会涉及到大文件的问题,不做讨论. 
byte[] bytes = new byte[(int) fs.getlength()]; fis.read(bytes); value.set(bytes,0,bytes.length); //重新标记 flag = false; return true; } return false; } /** * 获取当前读到的key * @return * @throws ioexception * @throws interruptedexception */ public text getcurrentkey() throws ioexception, interruptedexception { return this.key; } /** * 获取当前读到的value * @return * @throws ioexception * @throws interruptedexception */ public byteswritable getcurrentvalue() throws ioexception, interruptedexception { return this.value; } /** * 获取当前读取的进度 * @return * @throws ioexception * @throws interruptedexception */ public float getprogress() throws ioexception, interruptedexception { //文件一次读完,只有0和1的进度,根据flag来判断 return flag ? 0f : 1f; } /** * 关闭资源 * @throws ioexception */ public void close() throws ioexception { ioutils.closestream(fis); } }
2) 自定义InputFormat
package com.nty.inputformat; import org.apache.hadoop.fs.path; import org.apache.hadoop.io.byteswritable; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.inputsplit; import org.apache.hadoop.mapreduce.jobcontext; import org.apache.hadoop.mapreduce.recordreader; import org.apache.hadoop.mapreduce.taskattemptcontext; import org.apache.hadoop.mapreduce.lib.input.fileinputformat; import java.io.ioexception; /** * author nty * date time 2018-12-11 9:09 */ //需求中,key为文件路径+名称,所以key类型为text,value为文件内容,用byteswritable public class custominputformat extends fileinputformat<text, byteswritable> { //最后输出的value为一个文件,所让文件不能被切分,返回false @override protected boolean issplitable(jobcontext context, path filename) { return false; } //返回自定义的 recordreader public recordreader<text, byteswritable> createrecordreader(inputsplit split, taskattemptcontext context) throws ioexception, interruptedexception { return new customrecordreader(); } }
3) 编写Mapper类
package com.nty.inputformat; import org.apache.hadoop.io.byteswritable; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.mapper; import java.io.ioexception; /** * author nty * date time 2018-12-11 9:10 */ public class custommapper extends mapper<text, byteswritable, text, byteswritable> { @override protected void map(text key, byteswritable value, context context) throws ioexception, interruptedexception { context.write(key,value); } }
4) 编写Reducer类
package com.nty.inputformat; import org.apache.hadoop.io.byteswritable; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.reducer; import java.io.ioexception; /** * author nty * date time 2018-12-11 9:10 */ public class customreducer extends reducer<text, byteswritable, text, byteswritable> { @override protected void reduce(text key, iterable<byteswritable> values, context context) throws ioexception, interruptedexception { for (byteswritable value : values) { context.write(key, value); } } }
5) 编写Driver类
package com.nty.inputformat; import org.apache.hadoop.conf.configuration; import org.apache.hadoop.fs.path; import org.apache.hadoop.io.byteswritable; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.job; import org.apache.hadoop.mapreduce.lib.input.fileinputformat; import org.apache.hadoop.mapreduce.lib.output.fileoutputformat; import org.apache.hadoop.mapreduce.lib.output.sequencefileoutputformat; /** * author nty * date time 2018-12-11 9:10 */ public class customdriver { public static void main(string[] args) throws exception{ //获取job configuration configuration = new configuration(); job job = job.getinstance(configuration); //设置类 job.setjarbyclass(customdriver.class); //设置input和output job.setinputformatclass(custominputformat.class); job.setoutputformatclass(sequencefileoutputformat.class); //设置mapper和reducer job.setmapperclass(custommapper.class); job.setreducerclass(customreducer.class); //设置mapper和reducer的输入输出 job.setmapoutputkeyclass(text.class); job.setmapoutputvalueclass(byteswritable.class); job.setoutputkeyclass(text.class); job.setoutputvalueclass(byteswritable.class); //设置文件路径 fileinputformat.setinputpaths(job, new path("d:\\hadoop_test")); fileoutputformat.setoutputpath(job, new path("d:\\hadoop_test_out")); //提交 boolean b = job.waitforcompletion(true); system.exit(b ? 0 : 1); } }