欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

典型ETL hive sql 事例

程序员文章站 2022-04-28 09:39:58
...
## 开天辟地 外部表
create external table ods_app_log(
               sdk_ver    string, 
             time_zone    string, 
             commit_id    string, 
           commit_time    string, 
                   pid    string, 
             app_token    string, 
                app_id    string, 
             device_id    string, 
        device_id_type    string, 
       release_channel    string, 
          app_ver_name    string, 
          app_ver_code    string, 
               os_name    string, 
                os_ver    string, 
              language    string, 
               country    string, 
           manufacture    string, 
          device_model    string, 
            resolution    string, 
              net_type    string, 
               account    string, 
         app_device_id    string, 
                   mac    string, 
            android_id    string, 
                  imei    string, 
                cid_sn    string, 
             build_num    string, 
      mobile_data_type    string, 
     promotion_channel    string, 
               carrier    string, 
                  city    string, 
               user_id    string
             
)
partitioned by (day string, os string)
row format delimited fields terminated by '\001'
location '/app-log-data/clean'

## 建立外部表分区 
alter table ods_app_log add partition(day='2019-11-05', os='android') location '/app-log-data/clean/2019-11-05/android'
alter table ods_app_log add partition(day='2019-11-05', os='ios_mac') location '/app-log-data/clean/2019-11-05/ios_mac'

________________________________________________________________________________________________________________________________________________________________________

## 建立日活etl表
create table etl_user_active_day (
sdk_ver         string,
time_zone         string,
commit_id         string,
commit_time         string,
pid         string,
app_token         string,
app_id         string,
device_id         string,
device_id_type         string,
release_channel         string,
app_ver_name         string,
app_ver_code         string,
os_name         string,
os_ver         string,
language         string,
country         string,
manufacture         string,
device_model         string,
resolution         string,
net_type         string,
account         string,
app_device_id         string,
mac         string,
android_id         string,
imei         string,
cid_sn         string,
build_num         string,
mobile_data_type         string,
promotion_channel         string,
carrier         string,
city         string,
user_id         string
)
partitioned by (day string)
row format delimited fields terminated by '\001';


# 将 2019-11-05 日活数据 存入
insert into table etl_user_active_day partition(day='2019-11-06')
select 
sdk_ver         ,
time_zone        ,
commit_id        ,
commit_time      ,
pid         ,
app_token        ,
app_id         ,
device_id        ,
device_id_type   ,
release_channel  ,
app_ver_name     ,
app_ver_code     ,
os_name         ,
os_ver         ,
language         ,
country         ,
manufacture      ,
device_model     ,
resolution       ,
net_type         ,
account         ,
app_device_id    ,
mac         ,
android_id       ,
imei         ,
cid_sn         ,
build_num        ,
mobile_data_type ,
promotion_channel,
carrier         ,
city         ,
user_id         
from 
(
select *, 
row_number() over(partition by user_id order by commit_time) as rn
from ods_app_log where day='2019-11-06'
) tmp
where rn=1
;

# 建立日活维度表

create table dim_user_active_day (os_name string, city string, release_channel string, app_ver_name string, cnts int)
    partitioned by (day string, dim string);


# 统计日活

from  etl_user_active_day

insert into  dim_user_active_day partition(day='2019-11-06', dim='0000')
select 'all', 'all', 'all', 'all', count(1)
where day='2019-11-06'
 
insert into  dim_user_active_day partition(day='2019-11-06', dim='1000')
select os_name, 'all', 'all', 'all', count(1)
where day='2019-11-06'
group by os_name

insert into  dim_user_active_day partition(day='2019-11-06', dim='0100')
select 'all', city, 'all', 'all', count(1)
where day='2019-11-06'
group by city

insert into  dim_user_active_day partition(day='2019-11-06', dim='0010')
select 'all', 'all', release_channel, 'all', count(1)
where day='2019-11-06'
group by release_channel

insert into  dim_user_active_day partition(day='2019-11-06', dim='0001')
select 'all', 'all', 'all', app_ver_name, count(1)
where day='2019-11-06'
group by app_ver_name



# 统计日新

# 创建历史用户表
create table etl_user_history (user_id string);

# 创建日新用户表


create table etl_user_new_day like etl_user_active_day;

insert into etl_user_new_day partition(day='2019-11-06')
select 
sdk_ver         ,
time_zone        ,
commit_id        ,
commit_time      ,
pid         ,
app_token        ,
app_id         ,
device_id        ,
device_id_type   ,
release_channel  ,
app_ver_name     ,
app_ver_code     ,
os_name         ,
os_ver         ,
language         ,
country         ,
manufacture      ,
device_model     ,
resolution       ,
net_type         ,
account         ,
app_device_id    ,
mac         ,
android_id       ,
imei         ,
cid_sn         ,
build_num        ,
mobile_data_type ,
promotion_channel,
carrier         ,
city         ,
a.user_id    
from etl_user_active_day a left join etl_user_history b  on a.user_id=b.user_id
where a.day ='2019-11-06' and b.user_id is null;

insert into etl_user_history 
select 
user_id
from etl_user_new_day 
where day='2019-11-06';


# 统计昨日留存用户

create table etl_user_keepalive_nextday like etl_user_active_day;

insert into etl_user_keepalive_nextday partition(day='2019-11-06')
select 
sdk_ver         ,
time_zone        ,
commit_id        ,
commit_time      ,
pid         ,
app_token        ,
app_id         ,
device_id        ,
device_id_type   ,
release_channel  ,
app_ver_name     ,
app_ver_code     ,
os_name         ,
os_ver         ,
language         ,
country         ,
manufacture      ,
device_model     ,
resolution       ,
net_type         ,
account         ,
app_device_id    ,
mac         ,
android_id       ,
imei         ,
cid_sn         ,
build_num        ,
mobile_data_type ,
promotion_channel,
carrier         ,
city         ,
user_id
from
etl_user_new_day a left semi join etl_user_active_day b
on a.user_id=b.user_id and a.day='2019-11-05' and b.day='2019-11-06';



# 版本轨迹追踪
 

典型ETL hive sql 事例

相关标签: ETL