Flink DataStream之Kafka数据写入HDFS,并分区到Hive
程序员文章站
2022-07-14 13:29:32
...
Flink DataStream之Kafka数据写入HDFS,并分区到Hive
因业务要求,我们需要从Kafka中读取数据,经过变换后Sink到业务的消息队列中。为保证数据的可靠性,我们同时需要保存Sink的结果数据,最终选择将流数据写入HDFS,而Flink恰好提供了HDFS Connector。下面介绍如何将流式数据写入HDFS,并将数据load到Hive表中。
一、pom.xml文件配置
<!-- Flink filesystem connector, Scala 2.11 build, version 1.8.0.
     NOTE(review): the job below also uses Kafka, RabbitMQ and Gson classes; their
     dependencies are not listed here and presumably have to be declared as well. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-filesystem_2.11</artifactId>
<version>1.8.0</version>
</dependency>
二、Flink DataStream代码
代码将最后的结果数据拼接成CSV格式,再写入HDFS中。下面的代码相比真实的业务逻辑做了大量删减,仅保留对大家有用的部分。
/**
 * Flink streaming job: reads JSON user records from the Kafka topic {@code user}, turns each
 * record into one CSV line, and writes the lines both to a RabbitMQ queue and to date-bucketed
 * files under {@code /apps/hive/warehouse/users}.
 */
public class RMQAndBucketFileConnectSink {

    /**
     * Single JSON parser shared by all records instead of a new {@code Gson} per element.
     * Held in a static field because it is referenced from the map function, and static
     * fields are not part of a serialized function object.
     */
    private static final Gson GSON = new Gson();

    /**
     * Builds the pipeline (Kafka source, CSV conversion, RabbitMQ sink, bucketed file sink)
     * and runs it.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be started or fails while running
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        Properties p = new Properties();
        p.setProperty("bootstrap.servers", "dev-hdp-2.huazhu.com:6667,dev-hdp-3.huazhu.com:6667,dev-hdp-4.huazhu.com:6667");
        // The Flink Kafka consumer documentation lists group.id as a required property.
        p.setProperty("group.id", "RMQAndBucketFileConnectSink");

        SingleOutputStreamOperator<String> ds = env.addSource(new FlinkKafkaConsumer010<String>("user", new SimpleStringSchema(), p))
                // JSON text -> User
                .map(new MapFunction<String, User>() {
                    @Override
                    public User map(String value) throws Exception {
                        return GSON.fromJson(value, User.class);
                    }
                })
                // createTime is used as the record timestamp.
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<User>() {
                    @Override
                    public long extractAscendingTimestamp(User element) {
                        return element.createTime;
                    }
                })
                // User -> CSV line. Fields are joined with ',' without quoting or escaping.
                // NOTE(review): a field value that itself contains a comma would shift the
                // columns when the line is read back as comma-delimited text.
                .map(new MapFunction<User, String>() {
                    @Override
                    public String map(User value) throws Exception {
                        return value.userId + "," + value.name + "," + value.age + "," + value.sex + "," + value.createTime + "," + value.updateTime;
                    }
                });

        // Write to RabbitMQ.
        ds.addSink(buildRabbitSink());
        // Write to the file system (HDFS in the article's setup).
        ds.addSink(buildBucketingSink());

        env.execute("RMQAndBucketFileConnectSink");
    }

    /** Creates the RabbitMQ sink that publishes each CSV line to the queue {@code queue_name}. */
    private static RMQSink<String> buildRabbitSink() {
        RMQConnectionConfig rmqConnectionConfig = new RMQConnectionConfig.Builder()
                .setHost("localhost")
                .setVirtualHost("/")
                .setPort(5672)
                .setUserName("admin")
                .setPassword("admin")
                .build();
        // If the queue is durable, RMQSink's setupQueue method has to be overridden.
        return new RMQSink<>(rmqConnectionConfig, "queue_name", new SimpleStringSchema());
    }

    /** Creates the bucketing file sink that writes one directory per day. */
    private static BucketingSink<String> buildBucketingSink() {
        BucketingSink<String> bucketingSink = new BucketingSink<>("/apps/hive/warehouse/users");
        // One directory per day named yyyyMMdd (Asia/Shanghai time), similar to Hive date partitions.
        bucketingSink.setBucketer(new DateTimeBucketer<>("yyyyMMdd", ZoneId.of("Asia/Shanghai")));
        // Part file size limit of 128 MB: beyond it the current file is closed and a new one is opened.
        bucketingSink.setBatchSize(1024 * 1024 * 128L);
        // Roll over once per hour.
        bucketingSink.setBatchRolloverInterval(60 * 60 * 1000L);
        // Prefix of pending files; the default is "_".
        bucketingSink.setPendingPrefix("");
        // Suffix of pending files; the default is ".pending".
        bucketingSink.setPendingSuffix("");
        // Prefix of in-progress files; the default is "_".
        bucketingSink.setInProgressPrefix(".");
        return bucketingSink;
    }
}
写入的HDFS文件目录如下:
/apps/hive/warehouse/users/20190708
/apps/hive/warehouse/users/20190709
/apps/hive/warehouse/users/20190710
...
三、Hive表的创建以及导入
创建hive表
-- External, comma-delimited text table with one dt partition per day.
-- The column list must not end with a comma: a trailing comma after the last
-- column makes the statement fail to parse.
create external table default.users(
`userId` string,
`name` string,
`age` int,
`sex` int,
`ctime` string,
`utime` string
)
partitioned by(dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
创建定时任务,每天凌晨将前一天的HDFS文件导入Hive。导入脚本 load_hive.sh 如下:
#!/usr/bin/env bash
# Registers yesterday's output directory as a dt partition of default.users.
set -euo pipefail

# Yesterday's date as yyyyMMdd. The Flink job names its directories using
# Asia/Shanghai time, so the same zone is pinned here instead of relying on the
# host's local timezone. `date -d` is GNU date syntax.
d=$(TZ=Asia/Shanghai date -d "-1 day" +%Y%m%d)

# "if not exists" lets the script be re-run for the same day without failing
# on an already registered partition.
/usr/hdp/2.6.3.0-235/hive/bin/hive -e "alter table default.users add if not exists partition (dt='${d}') location '/apps/hive/warehouse/users/${d}'"
使用crontab每天凌晨调度该脚本即可。
上一篇: go语言之陷阱for range