欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Oracle 分析函数详解

程序员文章站 2022-03-25 20:26:49
...
------------------------------------------------------------------------------------------------------------------------------------
/*
分析函数的作用总结
1.可以减少表或索引的访问次数。
2.实现复杂的行间计算,聚合等,使sql的结构清晰。
3.缺点是很多分析函数要求排序,需要对排序的字段建立索引以达到优化的效果。

章节如下
1.分析函数简单示例
2.排序分析函数(RANK,DENSE_RANK,ROW_NUMBER)
3.分析函数(Top/Bottom N,First/Last,NTILE)
4.分析函数之窗口函数
5.分析函数之报表函数
*/
---------------------------------------------------------------------------------------------------------------------------------------
--1.分析函数简单示例

--生成测试数据
DROP TABLE ORDERS_TMP;
CREATE TABLE ORDERS_TMP
(
CUST_NBR NUMBER(5) NOT NULL,
REGION_ID NUMBER(5) NOT NULL,
SALESPERSON_ID NUMBER(5) NOT NULL,
YEAR NUMBER(4) NOT NULL,
MONTH NUMBER(2) NOT NULL,
TOT_ORDERS NUMBER(7) NOT NULL,
TOT_SALES NUMBER(11,2) NOT NULL
);

INSERT INTO ORDERS_TMP VALUES(11,7,11,2001,7,2,12204);
INSERT INTO ORDERS_TMP VALUES(4,5,4,2001,10,2,37802);
INSERT INTO ORDERS_TMP VALUES(7,6,7,2001,2,3,3705);
INSERT INTO ORDERS_TMP VALUES(10,6,8,2001,1,2,21691);
INSERT INTO ORDERS_TMP VALUES(10,6,7,2001,2,3,42624);

COMMIT;

SELECT * FROM ORDERS_TMP A;

 CUST_NBR  REGION_ID SALESPERSON_ID       YEAR      MONTH TOT_ORDERS  TOT_SALES
--------- ---------- -------------- ---------- ---------- ---------- ----------
       11          7             11       2001          7          2      12204
        4          5              4       2001         10          2      37802
        7          6              7       2001          2          3       3705
       10          6              8       2001          1          2      21691
       10          6              7       2001          2          3      42624

--统计销售额大于%20以及所在区域的销售额,及各个客户销售比例
SELECT ALL_SALES.*, 100*ROUND(CUST_SALES/REGION_SALES,2)||'%' PERCENT
  FROM (SELECT A.CUST_NBR CUSTOMER,
               A.REGION_ID REGION,
               SUM(A.TOT_SALES) CUST_SALES,
               SUM(SUM(A.TOT_SALES)) OVER(PARTITION BY A.REGION_ID) REGION_SALES,
               SUM(SUM(A.TOT_SALES)) OVER() TOTAL_SALES
          FROM ORDERS_TMP A
         GROUP BY A.REGION_ID, A.CUST_NBR) ALL_SALES
 WHERE ALL_SALES.CUST_SALES > ALL_SALES.REGION_SALES * 0.2;

  CUSTOMER     REGION CUST_SALES REGION_SALES TOTAL_SALES PERCENT
---------- ---------- ---------- ------------ ----------- --------
         4          5      37802        37802      118026 100%
        10          6      64315        68020      118026 95%
        11          7      12204        12204      118026 100%
--Over函数的作用是告诉SQL引擎:按照区域对数据进行分组,然后累积每个区域下所有客户的订单总额
--PARTITION 子句,将相同的行聚合在一起,若没有partition子句,标示统计所有的行
----------------------------------------------------------------------------------------------------------------------------------------
--二.分析函数(RANK,DENSE_RANK,ROW_NUMBER)

--生成测试数据
CREATE TABLE USER_ORDER
(
REGION_ID NUMBER(2),
CUSTOMER_ID NUMBER(2),
CUSTOMER_SALES NUMBER
);


INSERT INTO USER_ORDER VALUES(7,11,1590421);
INSERT INTO USER_ORDER VALUES(6,10,1696748);
INSERT INTO USER_ORDER VALUES(6,9,1108959);
INSERT INTO USER_ORDER VALUES(5,2,1224992);
INSERT INTO USER_ORDER VALUES(9,24,1224992);
INSERT INTO USER_ORDER VALUES(9,23,1224992);

COMMIT;

SELECT * FROM USER_ORDER;

 REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ----------- --------------
         7          11        1590421
         6          10        1696748
         6           9        1108959
         5           2        1224992
         9          24        1224992
         9          23        1224992

--不使用分析函数,查找出订单总额前三名的客户,请注意有金额相同的记录
SELECT ROWNUM, T.*
  FROM (SELECT * FROM USER_ORDER ORDER BY CUSTOMER_SALES DESC) T
 WHERE ROWNUM <= 3
 ORDER BY CUSTOMER_SALES DESC;

    ROWNUM  REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ---------- ----------- --------------
         1          6          10        1696748
         2          7          11        1590421
         3          5           2        1224992
--很明显,若是简单的按照rownum排序的话,我们漏掉了两条

--采用分析函数
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) ROWNUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID

 REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK  ROWNUMBER
---------- ----------- ---------- ---------- ---------- ----------
         6          10    1696748          1          1          1
         7          11    1590421          2          2          2
         9          23    1224992          3          3          3
         5           2    1224992          3          3          4
         9          24    1224992          3          3          5
         6           9    1108959          6          4          6

--从上面的结果可以看出三种排序策略的不同
--1.RANK 函数对于相同的结果,排名是一样的,但会在最后一条相同的记录和下一条记录之间空出排名
--2.DENSE_RANK策略与RANK差不多,但不会空出排名,排名是连续的。
--3.ROW_NUMBER返回一个唯一值,若遇到相同的数据,排名依次递增

--使用分析函数对各个地区内的订单总额排序
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;

REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
--------- ----------- ---------- ---------- ---------- ----------
        5           2    1224992          1          1          1
        6          10    1696748          1          1          1
        6           9    1108959          2          2          2
        7          11    1590421          1          1          1
        9          23    1224992          1          1          1
        9          24    1224992          1          1          2

--Partition by 子句在排列函数中的作用是将一个结果集划分成几个部分,这样排列函数就能够应用于这各个子集。

UPDATE USER_ORDER SET CUSTOMER_SALES = NULL WHERE CUSTOMER_ID = 23;
COMMIT;
--修改用户订单表其中一条为空值,然后再来看排序结果

SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;
 
REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
--------- ----------- ---------- ---------- ---------- ----------
        5           2    1224992          1          1          1
        6          10    1696748          1          1          1
        6           9    1108959          2          2          2
        7          11    1590421          1          1          1
        9          23                     1          1          1
        9          24    1224992          2          2          2

--我们会发现空值会排第一位,需要调整下,在分析函数的结尾加上NULLS LAST(NULLS LAST/FIRST告诉Oracle空值排名)
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;

 REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
---------- ----------- ---------- ---------- ---------- ----------
         5           2    1224992          1          1          1
         6          10    1696748          1          1          1
         6           9    1108959          2          2          2
         7          11    1590421          1          1          1
         9          24    1224992          1          1          1
         9          23                     2          2          2
-------------------------------------------------------------------------------------------------------------------------------------
--三 分析函数(Top/Bottom N,First/Last,NTILE)
--删除掉空值及重复值
DELETE USER_ORDER WHERE CUSTOMER_ID IN (23,24);
COMMIT;
--查找出订单总额最多和最少的用户,若按照签名的排名分析函数,至少需要两个查询,下面使用First/last来解决这个问题
admin@ORCL> SELECT * FROM USER_ORDER;

 REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ----------- --------------
         7          11        1590421
         6          10        1696748
         6           9        1108959
         5           2        1224992


SELECT MIN(CUSTOMER_ID) KEEP(DENSE_RANK FIRST ORDER BY SUM(CUSTOMER_SALES) DESC) FIRST,
       MIN(CUSTOMER_ID) KEEP(DENSE_RANK LAST ORDER BY SUM(CUSTOMER_SALES) DESC) LAST
  FROM USER_ORDER
 GROUP BY CUSTOMER_ID;

    FIRST       LAST
--------- ----------
       10          9

--MIN中的函数用于保证当存在多个Frist或Last的情况,返回唯一条记录(注意)
--DENSE_RANK是必须的,RANK与ROW_NUMBER无法代替




--利用分析函数找出排名1/2的客户
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) AS CUSTOMER_SALES,
       NTILE(2) OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) TIL
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;
 
--NTIL为各个记录在记录集中的排名计算比例,我们看到所有的记录被分为2个等级,那么我们只要取tile=1的记录值即可。

------------------------------------------------------------------------------------------------------------------------------------
--四 分析函数之窗口函数

CREATE TABLE ORDERS(
MONTH NUMBER(2),
TOT_SALES NUMBER
);

INSERT INTO ORDERS VALUES(1,610697);
INSERT INTO ORDERS VALUES(2,428676);
INSERT INTO ORDERS VALUES(3,637031);
INSERT INTO ORDERS VALUES(4,541146);
INSERT INTO ORDERS VALUES(5,592935);
INSERT INTO ORDERS VALUES(6,501485);
INSERT INTO ORDERS VALUES(7,606914);
INSERT INTO ORDERS VALUES(8,460520);
INSERT INTO ORDERS VALUES(9,392898);
INSERT INTO ORDERS VALUES(10,510117);
INSERT INTO ORDERS VALUES(11,532889);
INSERT INTO ORDERS VALUES(12,492458);
COMMIT;

SELECT * FROM ORDERS;

     MONTH  TOT_SALES
---------- ----------
         1     610697
         2     428676
         3     637031
         4     541146
         5     592935
         6     501485
         7     606914
         8     460520
         9     392898
        10     510117
        11     532889
        12     492458

--ROWS BETWEEN ..PRECEDING AND ...FOLLOWING 意思是:在xxx之前和xxx之后
SELECT MONTH,
SUM(TOT_SALES) MONTH_SALES,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING)TOTAL_SALES1,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) TOTAL_SALES2,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)TOTAL_SALES3
FROM ORDERS
GROUP BY MONTH;

     MONTH MONTH_SALES TOTAL_SALES1 TOTAL_SALES2 TOTAL_SALES3
---------- ----------- ------------ ------------ ------------
         1      610697      6307766      1039373      6307766
         2      428676      6307766      1676404      6307766
         3      637031      5697069      1606853      6307766
         4      541146      5268393      1771112      6307766
         5      592935      4631362      1635566      6307766
         6      501485      4090216      1701334      6307766
         7      606914      3497281      1568919      6307766
         8      460520      2995796      1460332      6307766
         9      392898      2388882      1363535      6307766
        10      510117      1928362      1435904      6307766
        11      532889      1535464      1535464      6307766
        12      492458      1025347      1025347      6307766

--从结果中可以看出。
--TOTAL_SALES1逻辑是统计当前行+前一条+随后的所有记录。
--TOTAL_SALES2逻辑是统计当前行+当前行的前一条+当前行的后一条。
--TOTAL_SALES3逻辑是统计所有行


--统计每月订单总额及截止到当前月的订单总额
SELECT MONTH,
       SUM(TOT_SALES) MONTH_SALES,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 FOLLOWING) TOTAL_SALES1,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) TOTAL_SALE2,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH) TOTAL_SALES3,
       ROUND(AVG(SUM(TOT_SALES)) OVER(ORDER BY MONTH),2) AVG_SALES
  FROM ORDERS
 GROUP BY MONTH;

     MONTH MONTH_SALES TOTAL_SALES1 TOTAL_SALE2 TOTAL_SALES3  AVG_SALES
---------- ----------- ------------ ----------- ------------ ----------
         1      610697       610697      610697       610697     610697
         2      428676      1039373     1039373      1039373   519686.5
         3      637031      1676404     1676404      1676404  558801.33
         4      541146      2217550     2217550      2217550   554387.5
         5      592935      2810485     2810485      2810485     562097
         6      501485      3311970     3311970      3311970     551995
         7      606914      3918884     3918884      3918884  559840.57
         8      460520      4379404     4379404      4379404   547425.5
         9      392898      4772302     4772302      4772302  530255.78
        10      510117      5282419     5282419      5282419   528241.9
        11      532889      5815308     5815308      5815308  528664.36
        12      492458      6307766     6307766      6307766  525647.17
--从结果来看,前三条分析函数语句结果是相同的。
--order by 默认数据范围是当前所需的partition的第一行到当前行
--AVG_SALES用于统计滚动的月平均销量

--我们还可以用First_value和Last_value用于在窗口记录集中查找第一条和最后一条记录
SELECT MONTH,
       SUM(TOT_SALES) MONTH_SALES,
       FIRST_VALUE(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) PRE_VALUE,
       LAST_VALUE(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) NEXT_VALUE
  FROM ORDERS
 GROUP BY MONTH;
 
     MONTH MONTH_SALES  PRE_VALUE NEXT_VALUE
---------- ----------- ---------- ----------
         1      610697     610697     428676
         2      428676     610697     637031
         3      637031     428676     541146
         4      541146     637031     592935
         5      592935     541146     501485
         6      501485     592935     606914
         7      606914     501485     460520
         8      460520     606914     392898
         9      392898     460520     510117
        10      510117     392898     532889
        11      532889     510117     492458
        12      492458     532889     492458


--生成测试数据,测试窗口函数中RANGE的使用

CREATE TABLE ORDERS_TIME
(
ORDER_DATE DATE,
AMOUNT NUMBER
);

INSERT INTO ORDERS_TIME VALUES(TO_DATE('1999-01-01','YYYY-MM-DD'),100);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-01','YYYY-MM-DD'),50);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('1999-01-01','YYYY-MM-DD'),30);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-03','YYYY-MM-DD'),20);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-02','YYYY-MM-DD'),200);
COMMIT;

SELECT * FROM ORDERS_TIME;
ORDER_DATE         AMOUNT
-------------- ----------
      1999/1/1	100
      2000/1/1	50
      1999/1/1	30
      2000/1/3	20
      2000/1/2	200
      2000/2/2	200

SELECT ORDER_DATE,SUM(AMOUNT),
SUM(SUM(AMOUNT)) OVER(ORDER BY ORDER_DATE RANGE BETWEEN INTERVAL '1' DAY PRECEDING AND INTERVAL '1' DAY FOLLOWING) SALES
 FROM ORDERS_TIME
GROUP BY ORDER_DATE;

ORDER_DATE	SUM(AMOUNT)	SALES
----------   --------   -----
1999/1/1     	130	     130
2000/1/1	    50	     250
2000/1/2	    200      270
2000/1/3	    20	     220
2000/2/2	    200	     200
--RANGE及interval关键字用于统计当前日期的销售额+前一天+后一天


---窗口函数进阶--比较相邻

SELECT ORDER_DATE,SUM(AMOUNT),
LAG(SUM(AMOUNT),1,NULL) OVER(ORDER BY ORDER_DATE) PRE_SALES,
LEAD(SUM(AMOUNT),1,NULL) OVER(ORDER BY ORDER_DATE) NEXT_SALES
 FROM ORDERS_TIME
GROUP BY ORDER_DATE;
--,数据的偏移量默认是1,若没有下一个值,用null来代替
--也可以加partition by进行分组,然后当寻找上一个/下一个值时,只在组内寻找
----------------------------------------------------------------------------------------------------------------------------------------
--五 分析函数之报表函数


--统计每个客户的销量占区域销量的比例,采用比较常规方法

SELECT CUST_NBR,
       REGION_ID,
       SUM(TOT_SALES) CUST_SALES,
       100*ROUND(SUM(TOT_SALES)/SUM(SUM(TOT_SALES)) OVER(PARTITION BY REGION_ID),2)||'%' PERCENT
  FROM ORDERS_TMP O
 GROUP BY REGION_ID, CUST_NBR;

  CUST_NBR  REGION_ID CUST_SALES PERCENT
---------- ---------- ---------- -------------------------
         4          5      37802 100%
         7          6       3705 5%
        10          6      64315 95%
        11          7      12204 100%

--下面使用报表函数RATIO_TO_REPORT
SELECT CUST_NBR,
       REGION_ID,
       SUM(TOT_SALES) CUST_SALES,
       100*ROUND(RATIO_TO_REPORT((SUM(TOT_SALES))) OVER(PARTITION BY REGION_ID),2)||'%' PERCENT
  FROM ORDERS_TMP O
 GROUP BY REGION_ID, CUST_NBR;

  CUST_NBR  REGION_ID CUST_SALES PERCENT
---------- ---------- ---------- ---------------------
         4          5      37802 100%
         7          6       3705 5%
        10          6      64315 95%
        11          7      12204 100%
--结果是一样的,Oracle提供的Ratio_to_report函数统计每条记录在其子集中所占的比例



--本文章参考
--http://wenku.baidu.com/view/6694a7225901020207409c5d.html