典型ETL hive sql 事例
程序员文章站
2022-04-28 09:39:58
...
## 开天辟地 外部表
create external table ods_app_log(
sdk_ver string,
time_zone string,
commit_id string,
commit_time string,
pid string,
app_token string,
app_id string,
device_id string,
device_id_type string,
release_channel string,
app_ver_name string,
app_ver_code string,
os_name string,
os_ver string,
language string,
country string,
manufacture string,
device_model string,
resolution string,
net_type string,
account string,
app_device_id string,
mac string,
android_id string,
imei string,
cid_sn string,
build_num string,
mobile_data_type string,
promotion_channel string,
carrier string,
city string,
user_id string
)
partitioned by (day string, os string)
row format delimited fields terminated by '\001'
location '/app-log-data/clean'
## 建立外部表分区
alter table ods_app_log add partition(day='2019-11-05', os='android') location '/app-log-data/clean/2019-11-05/android'
alter table ods_app_log add partition(day='2019-11-05', os='ios_mac') location '/app-log-data/clean/2019-11-05/ios_mac'
________________________________________________________________________________________________________________________________________________________________________
## 建立日活etl表
create table etl_user_active_day (
sdk_ver string,
time_zone string,
commit_id string,
commit_time string,
pid string,
app_token string,
app_id string,
device_id string,
device_id_type string,
release_channel string,
app_ver_name string,
app_ver_code string,
os_name string,
os_ver string,
language string,
country string,
manufacture string,
device_model string,
resolution string,
net_type string,
account string,
app_device_id string,
mac string,
android_id string,
imei string,
cid_sn string,
build_num string,
mobile_data_type string,
promotion_channel string,
carrier string,
city string,
user_id string
)
partitioned by (day string)
row format delimited fields terminated by '\001';
# 将 2019-11-05 日活数据 存入
insert into table etl_user_active_day partition(day='2019-11-06')
select
sdk_ver ,
time_zone ,
commit_id ,
commit_time ,
pid ,
app_token ,
app_id ,
device_id ,
device_id_type ,
release_channel ,
app_ver_name ,
app_ver_code ,
os_name ,
os_ver ,
language ,
country ,
manufacture ,
device_model ,
resolution ,
net_type ,
account ,
app_device_id ,
mac ,
android_id ,
imei ,
cid_sn ,
build_num ,
mobile_data_type ,
promotion_channel,
carrier ,
city ,
user_id
from
(
select *,
row_number() over(partition by user_id order by commit_time) as rn
from ods_app_log where day='2019-11-06'
) tmp
where rn=1
;
# 建立日活维度表
create table dim_user_active_day (os_name string, city string, release_channel string, app_ver_name string, cnts int)
partitioned by (day string, dim string);
# 统计日活
from etl_user_active_day
insert into dim_user_active_day partition(day='2019-11-06', dim='0000')
select 'all', 'all', 'all', 'all', count(1)
where day='2019-11-06'
insert into dim_user_active_day partition(day='2019-11-06', dim='1000')
select os_name, 'all', 'all', 'all', count(1)
where day='2019-11-06'
group by os_name
insert into dim_user_active_day partition(day='2019-11-06', dim='0100')
select 'all', city, 'all', 'all', count(1)
where day='2019-11-06'
group by city
insert into dim_user_active_day partition(day='2019-11-06', dim='0010')
select 'all', 'all', release_channel, 'all', count(1)
where day='2019-11-06'
group by release_channel
insert into dim_user_active_day partition(day='2019-11-06', dim='0001')
select 'all', 'all', 'all', app_ver_name, count(1)
where day='2019-11-06'
group by app_ver_name
# 统计日新
# 创建历史用户表
create table etl_user_history (user_id string);
# 创建日新用户表
create table etl_user_new_day like etl_user_active_day;
insert into etl_user_new_day partition(day='2019-11-06')
select
sdk_ver ,
time_zone ,
commit_id ,
commit_time ,
pid ,
app_token ,
app_id ,
device_id ,
device_id_type ,
release_channel ,
app_ver_name ,
app_ver_code ,
os_name ,
os_ver ,
language ,
country ,
manufacture ,
device_model ,
resolution ,
net_type ,
account ,
app_device_id ,
mac ,
android_id ,
imei ,
cid_sn ,
build_num ,
mobile_data_type ,
promotion_channel,
carrier ,
city ,
a.user_id
from etl_user_active_day a left join etl_user_history b on a.user_id=b.user_id
where a.day ='2019-11-06' and b.user_id is null;
insert into etl_user_history
select
user_id
from etl_user_new_day
where day='2019-11-06';
# 统计昨日留存用户
create table etl_user_keepalive_nextday like etl_user_active_day;
insert into etl_user_keepalive_nextday partition(day='2019-11-06')
select
sdk_ver ,
time_zone ,
commit_id ,
commit_time ,
pid ,
app_token ,
app_id ,
device_id ,
device_id_type ,
release_channel ,
app_ver_name ,
app_ver_code ,
os_name ,
os_ver ,
language ,
country ,
manufacture ,
device_model ,
resolution ,
net_type ,
account ,
app_device_id ,
mac ,
android_id ,
imei ,
cid_sn ,
build_num ,
mobile_data_type ,
promotion_channel,
carrier ,
city ,
user_id
from
etl_user_new_day a left semi join etl_user_active_day b
on a.user_id=b.user_id and a.day='2019-11-05' and b.day='2019-11-06';
# 版本轨迹追踪
上一篇: Apache NiFi用户指南
推荐阅读