hive系列-1.hive里踩的坑
程序员文章站
2022-07-14 15:15:31
...
新增字段&跑历史分区
1、现象
hive(笔者使用的hive版本为1.1.0)表新增加列,重跑历史分区数据,发现新增的列在老分区中取值都为NULL。如下示例
1)初始化表
//init table
drop table if exists tmp.test;
create table tmp.test
(
id string
)
partitioned by (dt string);
insert overwrite table tmp.test
partition(dt='20190509')
select
'0001';
// show
select
*
from
tmp.test
where
dt='20190509';
+-------+-----------
| id | dt |
+-------+-----------+
| 0001 | 20190509 |
+-------+-----------+
1 row selected (0.234 seconds)
2)添加字段重写数据新增字段为空
//add columns
alter table tmp.test add columns(name string);
//insert data
insert overwrite table tmp.test
partition(dt='20190509')
select
'0001' as id,
'光头强' as name;
//show
select
*
from
tmp.test
where
dt='20190509';
+-------+-------+-----------+
| id | name | dt |
+-------+-------+-----------+
| 0001 | NULL | 20190509 |
+-------+-------+-----------+
1 row selected (0.956 seconds)
2、解决办法
添加字段后,重跑已有分区前,先删除已有分区
//add columns
alter table tmp.test add columns(name string);
//drop old partition
alter table tmp.test drop if exists partition(dt='20190509');
//insert data
insert overwrite table tmp.test
partition(dt='20190509')
select
'0001' as id,
'光头强' as name;
//show
select
*
from
tmp.test
where
dt='20190509';
+-------+-------+-----------+
| id | name | dt |
+-------+-------+-----------+
| 0001 | 光头强 | 20190509 |
+-------+-------+-----------+
1 row selected (0.184 seconds)