
Hive Basics (Part 2)


The json_tuple and parse_url_tuple Functions in Hive

json_tuple

  • The json_tuple function extracts the values of multiple keys from a JSON string in one call (a LATERAL VIEW usage sketch follows the example below)
# Sample JSON string
{"movie":"2022","rate":"5","time":"956716207","userid":"6040"}

# Parse it with json_tuple
select json_tuple('{"movie":"2022","rate":"5","time":"956716207","userid":"6040"}',"movie", "rate", "time", "userid")
as (movieid, rate, time, userid);

# Output
movieid	rate	time	userid
2022	5	956716207	6040
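
In practice, json_tuple is usually applied to a JSON column of a table through LATERAL VIEW, producing one parsed row per input row. A minimal sketch; the table rating_json and its string column json are assumed here for illustration:

# Apply json_tuple to each row's json column via LATERAL VIEW
select t.movieid, t.rate, t.time, t.userid
from rating_json
lateral view json_tuple(rating_json.json, 'movie', 'rate', 'time', 'userid') t
as movieid, rate, time, userid;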

parse_url_tuple

  • The parse_url_tuple function extracts multiple parts of a URL (host, path, query string, individual query parameters) in one call (a scalar alternative is shown after the example below)
# Sample URL
http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d

# Parse it with parse_url_tuple
select parse_url_tuple("http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d", "HOST", "PATH", "QUERY", "QUERY:sessionid", "QUERY:a", "QUERY:c")
as (HOST, PATH, QUERY, SESSIONID, A, C);

# Output
host	path	query	sessionid	a	c
www.aaa.com	/bbb/cc.html	sessionid=123456&a=b&c=d	123456	b	d
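
When only a single part is needed, the scalar parse_url function does the same job without the tuple syntax; it accepts the same part names as parse_url_tuple:

# Extract one part, or one query parameter
select parse_url('http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d', 'HOST');
select parse_url('http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d', 'QUERY', 'sessionid');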

Using Window Functions

# Sample data
hive_rownumber.id	hive_rownumber.age	hive_rownumber.name	hive_rownumber.gender
1	18	zhangsan	M
2	19	lisi	M
3	22	wangwu	F
4	16	zhaoliu	F
5	30	tianqi	M
6	26	wangba	F

# For each gender, return the two oldest rows (top-N per group)
select id, age, name, gender, r
from 
(
select id, age, name, gender,
row_number() over(partition by gender order by age desc) r
from hive_rownumber
) t
where t.r <= 2;

# Output
id	age	name	gender	r
6	26	wangba	F	1
3	22	wangwu	F	2
5	30	tianqi	M	1
2	19	lisi	M	2
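
row_number assigns consecutive numbers within each partition even when two rows tie on age. If ties should share a position, rank (which leaves gaps after ties) or dense_rank (which does not) can be substituted; a sketch against the same table:

# Compare the three numbering functions
select id, age, name, gender,
row_number() over(partition by gender order by age desc) rn,
rank() over(partition by gender order by age desc) rk,
dense_rank() over(partition by gender order by age desc) drk
from hive_rownumber;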

hiveserver2 (HS2) and beeline

  • beeline is the command-line client officially recommended by Hive; the hiveserver2 (HS2) service must be started first
# Change into Hive's bin directory; if the environment variable is configured, the commands can be run directly
cd ${HIVE_HOME}/bin

# Start hiveserver2
# nohup keeps the service running in the background
nohup ./hiveserver2 &

# Start beeline
# -u specifies the JDBC URL of the database (default port 10000)
# -n specifies the user name to connect as
./beeline -u jdbc:hive2://hadoop001:10000/default -n hadoop
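
beeline can also run statements non-interactively: the -e option executes the given statement and exits, and inside the interactive shell !quit ends the session.

# Run a single statement and exit
./beeline -u jdbc:hive2://hadoop001:10000/default -n hadoop -e "show tables;"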

Complex Data Types

  • arrays: ordered lists in which every element has the same type (a combined query sketch follows the example below)
# Sample data; fields are separated by \t
zhangsan	beijing,shanghai,tianjin,hangzhou
lisi	changchun,chengdu,wuhan,beijing

# Create the table; note how the array type and its item separator are declared
create table hive_array(
name string,
work_locations array<string>
) row format
delimited fields terminated by '\t'
collection items terminated by ',';

# Query results
hive_array.name	hive_array.work_locations
zhangsan	["beijing","shanghai","tianjin","hangzhou"]
lisi	["changchun","chengdu","wuhan","beijing"]

# Usable in the SELECT list
# Access the element at a given index (indexes start at 0)
work_locations[index]
# Get the number of elements in the array
size(work_locations)

# Usable in the WHERE clause
# Check whether the array contains a given element
array_contains(work_locations, "hangzhou")
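
Putting these together against the hive_array table above:

# First location and array size for everyone who has worked in beijing
select name, work_locations[0], size(work_locations)
from hive_array
where array_contains(work_locations, "beijing");
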
  • maps: key-value pairs; all keys share one type, as do all values (a combined query sketch follows the example below)
# Sample data
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelbaby,26


# Create the table; note how the map type and its separators are declared
create table hive_map(
id int,
name string,
members map<string, string>,
age int
) row format
delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

# Query results
hive_map.id	hive_map.name	hive_map.members	hive_map.age
1	zhangsan	{"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"}	28
2	lisi	{"father":"mayun","mother":"huangyi","brother":"guanyu"}	22
3	wangwu	{"father":"wangjianlin","mother":"ruhua","sister":"jingtian"}	29
4	mayun	{"father":"mayongzhen","mother":"angelbaby"}	26


# Usable in the SELECT list
# Look up the value for a given key
members["father"]

# Return the keys as an array
map_keys(members)

# Return the values as an array
map_values(members)

# Return the number of key-value pairs in the map
size(members)
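
Putting these together against the hive_map table above:

# Father's name and family size for everyone over 25
select name, members["father"], size(members)
from hive_map
where age > 25;
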
  • structs: records of named fields, where each field can have its own type (a combined query sketch follows the example below)
# Sample data
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70

# Create the table
create table hive_struct(
ip string,
userinfo struct<name:string, age:int>)
row format
delimited fields terminated by '#'
collection items terminated by ':';

# Query results
hive_struct.ip	hive_struct.userinfo
192.168.1.1	{"name":"zhangsan","age":40}
192.168.1.2	{"name":"lisi","age":50}
192.168.1.3	{"name":"wangwu","age":60}
192.168.1.4	{"name":"zhaoliu","age":70}

# Usable in the SELECT list
# Access a field with dot notation
userinfo.name
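
Putting these together against the hive_struct table above:

# Name and age of every user aged 50 or over
select ip, userinfo.name, userinfo.age
from hive_struct
where userinfo.age >= 50;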

Partitioned Tables

# Sample data
10703007267488  2014-05-01 06:01:12.334+01
10101043505096  2014-05-01 07:28:12.342+01
10103043509747  2014-05-01 07:50:12.33+01
10103043501575  2014-05-01 09:27:12.33+01
10104043514061  2014-05-01 09:03:12.324+01

# Declare the partition column
create table order_partition(
order_no string,
order_time string
)
partitioned by (event_month string)
row format delimited fields terminated by '\t';

# Load data, specifying the target partition
load data local inpath 'xxx' into table order_partition partition (event_month='2014-05');

# Query results
# The table's real columns do not include the partition column; the partition value is just a directory name on HDFS
order_partition.order_no	order_partition.order_time	order_partition.event_month
10703007267488	2014-05-01 06:01:12.334+01	2014-05
10101043505096	2014-05-01 07:28:12.342+01	2014-05
10103043509747	2014-05-01 07:50:12.33+01	2014-05
10103043501575	2014-05-01 09:27:12.33+01	2014-05
10104043514061	2014-05-01 09:03:12.324+01	2014-05

# Check the layout on HDFS
# The data is stored under /user/hive/warehouse/order_partition/event_month=2014-05

# Adding event_month='xxx' to the WHERE clause queries only the data of the specified partition, as shown below
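
For example, the following query reads only the 2014-05 partition's directory (partition pruning):

select * from order_partition where event_month='2014-05';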

# In production, data is typically cleaned and landed in an HDFS directory first, then registered with the partitioned table
# Single-level partition
# Put the files into the matching HDFS directory
hdfs dfs -mkdir /user/hive/warehouse/order_partition/event_month=2014-06
hdfs dfs -put xxx /user/hive/warehouse/order_partition/event_month=2014-06

# Register every partition directory found on HDFS in the metastore
msck repair table order_partition;
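
If only one directory was added, registering that single partition explicitly is a lighter-weight alternative to scanning the whole table:

alter table order_partition add if not exists partition (event_month='2014-06');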

# Multi-level partitioning; note the extra partition column step
create table order_multi_partition(
order_no string,
order_time string
)
partitioned by (event_month string, step string)
row format delimited fields terminated by '\t';

# Load data, specifying both partition values
load data local inpath 'xxx' into table order_multi_partition partition (event_month='2014-05', step='1');

# The HDFS location is /user/hive/warehouse/order_multi_partition/event_month=2014-05/step=1

# Query results
order_multi_partition.order_no	order_multi_partition.order_time	order_multi_partition.event_month	order_multi_partition.step
10703007267488	2014-05-01 06:01:12.334+01	2014-05	1
10101043505096	2014-05-01 07:28:12.342+01	2014-05	1
10103043509747	2014-05-01 07:50:12.33+01	2014-05	1
10103043501575	2014-05-01 09:27:12.33+01	2014-05	1
10104043514061	2014-05-01 09:03:12.324+01	2014-05	1
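
Both partition columns can be used for pruning in a WHERE clause:

select * from order_multi_partition where event_month='2014-05' and step='1';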

# Goal: place employees of the same department into the same partition

# Create the table
create table emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
) partitioned by (deptno int)
row format
delimited fields terminated by '\t';


# Static partitioning: each partition is named by hand and loaded separately; with many partitions, loading them one by one is tedious
insert into table emp_partition partition(deptno=10)
select empno, ename, job, mgr, hiredate, sal, comm from emp
where deptno=10;

# Dynamic partitioning: rows are routed to partitions automatically based on the value of the partition column
# Requires set hive.exec.dynamic.partition.mode=nonstrict (the default is strict)
# The partition column must be the last column in the SELECT list
insert into table emp_dynamic_partition partition(deptno)
select empno, ename, job, mgr, hiredate, sal, comm, deptno from emp;
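
The target table emp_dynamic_partition is not created above; a minimal sketch, assuming the same layout as emp_partition:

# Dynamic partitioning itself must also be enabled (hive.exec.dynamic.partition, on by default in recent versions)
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

create table emp_dynamic_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
) partitioned by (deptno int)
row format
delimited fields terminated by '\t';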