Setting Up a Hadoop Development Environment with Maven
I won't go over how to use Maven itself; there are plenty of tutorials online and it has changed little over the years. This article only covers how to set up a Hadoop development environment.
1. First, create a Maven project (a sketch of one way to do this follows).
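If you are starting from scratch, one way to create the project is with Maven's quickstart archetype; the groupId and artifactId below are simply the ones used in the pom.xml in the next step:

mvn archetype:generate -DgroupId=my.hadoopstudy -DartifactId=hadoopstudy \
    -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false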
2. Then add the Hadoop dependencies hadoop-common, hadoop-client, and hadoop-hdfs to pom.xml. After adding them, pom.xml looks like this:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>my.hadoopstudy</groupId>
    <artifactId>hadoopstudy</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <name>hadoopstudy</name>
    <url>http://maven.apache.org</url>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
3. Testing
3.1 First we can test HDFS development. This assumes the Hadoop cluster from the previous Hadoop article. The test class is as follows:
package my.hadoopstudy.dfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.InputStream;
import java.net.URI;

public class Test {
    public static void main(String[] args) throws Exception {
        String uri = "hdfs://9.111.254.189:9000/";
        Configuration config = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), config);

        // List all files and directories under /user/fkong/ on HDFS
        FileStatus[] statuses = fs.listStatus(new Path("/user/fkong"));
        for (FileStatus status : statuses) {
            System.out.println(status);
        }

        // Create a file under /user/fkong on HDFS and write one line of text to it
        FSDataOutputStream os = fs.create(new Path("/user/fkong/test.log"));
        os.write("Hello World!".getBytes());
        os.flush();
        os.close();

        // Print the content of that file under /user/fkong
        InputStream is = fs.open(new Path("/user/fkong/test.log"));
        IOUtils.copyBytes(is, System.out, 1024, true);
    }
}
3.2 Test a MapReduce job
The test code is fairly simple:
package my.hadoopstudy.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

public class EventCount {

    public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text event = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The event name is the first space-delimited token of each line
            int idx = value.toString().indexOf(" ");
            if (idx > 0) {
                String e = value.toString().substring(0, idx);
                event.set(e);
                context.write(event, one);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts emitted for each event name
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: EventCount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "event count");
        job.setJarByClass(EventCount.class);
        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyReducer.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Run "mvn package" to build the jar hadoopstudy-1.0-SNAPSHOT.jar, then copy the jar file to the Hadoop installation directory, for example:
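The Hadoop installation path below is only a placeholder; adjust it to wherever Hadoop is installed in your environment:

mvn package
cp target/hadoopstudy-1.0-SNAPSHOT.jar /usr/local/hadoop/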
Suppose we want to analyze the event information in several log files and count how many times each kind of event occurs, so create the following directories and files:
/tmp/input/event.log.1
/tmp/input/event.log.2
/tmp/input/event.log.3
Since this is only an example, every file can have the same content; suppose each one looks like this (a sketch of creating the files follows the sample content):
job_new ...
job_new ...
job_finish ...
job_new ...
job_finish ...
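One way to create the sample input locally, assuming the /tmp/input directory and file names listed above:

mkdir -p /tmp/input
for i in 1 2 3; do
  cat > /tmp/input/event.log.$i <<'EOF'
job_new ...
job_new ...
job_finish ...
job_new ...
job_finish ...
EOF
done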
Then copy these files to HDFS:
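For example, using the HDFS shell (the target directory /user/fkong/input is an assumption, chosen to match the /user/fkong path used earlier):

hdfs dfs -mkdir -p /user/fkong/input
hdfs dfs -put /tmp/input/event.log.* /user/fkong/input/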
Run the MapReduce job:
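From the Hadoop installation directory, something along these lines should work (the input and output paths are assumptions matching the previous step; the output directory must not already exist):

bin/hadoop jar hadoopstudy-1.0-SNAPSHOT.jar my.hadoopstudy.mapreduce.EventCount /user/fkong/input /user/fkong/output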
Check the execution results:
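For example, by printing the reducer output (part-r-00000 is the default file name produced by a single reducer):

hdfs dfs -cat /user/fkong/output/part-r-00000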
That's all for this article; I hope it helps with your study of Hadoop.