您的位置:首页 > 其它

Hive分析函数二

2016-10-12 23:46 453 查看
版本0.13.1

准备的数据

-- Demo table used by every window-function example below.
-- Each input line is '|'-delimited text in the order deptno|sal|hiredate,
-- e.g. 10|1300|1982-01-23 00:00:00.0
-- sal is declared int so that sum/avg/min/max operate on numbers, not text.
-- hiredate is kept as text in the form 'yyyy-MM-dd HH:mm:ss.S'
create table emp_function
(
    deptno   string,
    sal      int,
    hiredate string
)
row format delimited fields terminated by '|';

10|1300|1982-01-23 00:00:00.0
10|5000|1981-11-17 00:00:00.0
10|2450|1981-06-09 00:00:00.0
20|1100|1987-05-23 00:00:00.0
20|3000|1987-04-19 00:00:00.0
20|800|1980-12-17 00:00:00.0
20|2975|1981-04-02 00:00:00.0
20|3000|1981-12-03 00:00:00.0
30|1500|1981-09-08 00:00:00.0
30|1600|1981-02-20 00:00:00.0
30|2850|1981-05-01 00:00:00.0
30|1250|1981-09-28 00:00:00.0
30|1250|1981-02-22 00:00:00.0
30|950|1981-12-03 00:00:00.0


SUM — 注意,结果和ORDER BY相关,默认为升序

-- Salary totals per department over different window frames, ordered by hire date.
select
    deptno,
    hiredate,
    sal,
    -- sal1: partition start through the current row (no explicit frame given)
    sum(sal) over (partition by deptno order by hiredate) as sal1,
    -- sal2: partition start through the current row, spelled out explicitly
    sum(sal) over (
        partition by deptno
        order by hiredate
        rows between unbounded preceding and current row
    ) as sal2,
    -- sal3: total over the whole partition
    sum(sal) over (partition by deptno) as sal3,
    -- sal4: current row plus the 3 rows before it
    sum(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and current row
    ) as sal4,
    -- sal5: 3 rows before, the current row, and 1 row after
    sum(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and 1 following
    ) as sal5,
    -- sal6: current row plus every row after it in the partition
    sum(sal) over (
        partition by deptno
        order by hiredate
        rows between current row and unbounded following
    ) as sal6
from emp_function
order by deptno, hiredate;

deptno    hiredate               sal     sal1    sal2    sal3     sal4   sal5    sal6
10      1981-06-09 00:00:00.0   2450    2450.0  2450.0  8750.0  2450.0  7450.0  8750.0
10      1981-11-17 00:00:00.0   5000    7450.0  7450.0  8750.0  7450.0  8750.0  6300.0
10      1982-01-23 00:00:00.0   1300    8750.0  8750.0  8750.0  8750.0  8750.0  1300.0
20      1980-12-17 00:00:00.0   800     800.0   800.0   10875.0 800.0   3775.0  10875.0
20      1981-04-02 00:00:00.0   2975    3775.0  3775.0  10875.0 3775.0  6775.0  10075.0
20      1981-12-03 00:00:00.0   3000    6775.0  6775.0  10875.0 6775.0  9775.0  7100.0
20      1987-04-19 00:00:00.0   3000    9775.0  9775.0  10875.0 9775.0  10875.0 4100.0
20      1987-05-23 00:00:00.0   1100    10875.0 10875.0 10875.0 10075.0 10075.0 1100.0
30      1981-02-20 00:00:00.0   1600    1600.0  1600.0  9400.0  1600.0  2850.0  9400.0
30      1981-02-22 00:00:00.0   1250    2850.0  2850.0  9400.0  2850.0  5700.0  7800.0
30      1981-05-01 00:00:00.0   2850    5700.0  5700.0  9400.0  5700.0  7200.0  6550.0
30      1981-09-08 00:00:00.0   1500    7200.0  7200.0  9400.0  7200.0  8450.0  3700.0
30      1981-09-28 00:00:00.0   1250    8450.0  8450.0  9400.0  6850.0  7800.0  2200.0
30      1981-12-03 00:00:00.0   950     9400.0  9400.0  9400.0  6550.0  6550.0  950.0


sal1 等于起始行的值到当前行值的一个累加

sal2 等于起始行的值到当前行值的一个累加 等于sal1

sal3 就是分组内所有值相加

sal4 就是分组内当前行+往前3行的值相加 deptno 等于20 中有一条10075=1100+3000+3000+2975

sal5 就是分组内当前行+往前3行+往后一行 的值相加 deptno 等于30 7800=950+1250+1500+2850+1250

sal6 就是分组内当前行+往后所有行 的值相加 deptno 等于30 3700=950+1250+1500

如果指定了ORDER BY但不指定ROWS BETWEEN,默认窗口为从分组起点到当前行(即 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,排序值相同的行会被一并计入);

如果不指定ORDER BY,则将分组内所有值累加;

关键是理解ROWS BETWEEN含义,也叫做WINDOW子句:

PRECEDING:往前

FOLLOWING:往后

CURRENT ROW:当前行

UNBOUNDED:无边界,UNBOUNDED PRECEDING 表示从分组的第一行开始,UNBOUNDED FOLLOWING 表示到分组的最后一行为止

avg函数的使用

-- Average salary per department over the same six window frames as the sum example.
select
    deptno,
    hiredate,
    sal,
    -- sal1: partition start through the current row (no explicit frame given)
    avg(sal) over (partition by deptno order by hiredate) as sal1,
    -- sal2: partition start through the current row, explicit frame
    avg(sal) over (
        partition by deptno
        order by hiredate
        rows between unbounded preceding and current row
    ) as sal2,
    -- sal3: average over the whole partition
    avg(sal) over (partition by deptno) as sal3,
    -- sal4: current row plus the 3 rows before it
    avg(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and current row
    ) as sal4,
    -- sal5: 3 rows before, the current row, and 1 row after
    avg(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and 1 following
    ) as sal5,
    -- sal6: current row plus every row after it in the partition
    avg(sal) over (
        partition by deptno
        order by hiredate
        rows between current row and unbounded following
    ) as sal6
from emp_function
order by deptno, hiredate;

deptno    hiredate               sal     sal1    sal2    sal3     sal4   sal5    sal6
10      1981-06-09 00:00:00.0   2450    2450.0  2450.0  2916.6666666666665      2450.0  3725.0  2916.6666666666665
10      1981-11-17 00:00:00.0   5000    3725.0  3725.0  2916.6666666666665      3725.0  2916.6666666666665      3150.0
10      1982-01-23 00:00:00.0   1300    2916.6666666666665      2916.6666666666665      2916.6666666666665      2916.6666666666665      2916.6666666666665      1300.0
20      1980-12-17 00:00:00.0   800     800.0   800.0   2175.0  800.0   1887.5  2175.0
20      1981-04-02 00:00:00.0   2975    1887.5  1887.5  2175.0  1887.5  2258.3333333333335      2518.75
20      1981-12-03 00:00:00.0   3000    2258.3333333333335      2258.3333333333335      2175.0  2258.3333333333335      2443.75 2366.6666666666665
20      1987-04-19 00:00:00.0   3000    2443.75 2443.75 2175.0  2443.75 2175.0  2050.0
20      1987-05-23 00:00:00.0   1100    2175.0  2175.0  2175.0  2518.75 2518.75 1100.0
30      1981-02-20 00:00:00.0   1600    1600.0  1600.0  1566.6666666666667      1600.0  1425.0  1566.6666666666667
30      1981-02-22 00:00:00.0   1250    1425.0  1425.0  1566.6666666666667      1425.0  1900.0  1560.0
30      1981-05-01 00:00:00.0   2850    1900.0  1900.0  1566.6666666666667      1900.0  1800.0  1637.5
30      1981-09-08 00:00:00.0   1500    1800.0  1800.0  1566.6666666666667      1800.0  1690.0  1233.3333333333333
30      1981-09-28 00:00:00.0   1250    1690.0  1690.0  1566.6666666666667      1712.5  1560.0  1100.0
30      1981-12-03 00:00:00.0   950     1566.6666666666667      1566.6666666666667      1566.6666666666667      1637.5  1637.5  950.0


min函数的使用

-- Minimum salary per department over the same six window frames as the sum example.
select
    deptno,
    hiredate,
    sal,
    -- sal1: partition start through the current row (no explicit frame given)
    min(sal) over (partition by deptno order by hiredate) as sal1,
    -- sal2: partition start through the current row, explicit frame
    min(sal) over (
        partition by deptno
        order by hiredate
        rows between unbounded preceding and current row
    ) as sal2,
    -- sal3: minimum over the whole partition
    min(sal) over (partition by deptno) as sal3,
    -- sal4: current row plus the 3 rows before it
    min(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and current row
    ) as sal4,
    -- sal5: 3 rows before, the current row, and 1 row after
    min(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and 1 following
    ) as sal5,
    -- sal6: current row plus every row after it in the partition
    min(sal) over (
        partition by deptno
        order by hiredate
        rows between current row and unbounded following
    ) as sal6
from emp_function
order by deptno, hiredate;

deptno    hiredate               sal     sal1    sal2    sal3     sal4   sal5    sal6
10      1981-06-09 00:00:00.0   2450    2450    2450    1300    2450    2450    1300
10      1981-11-17 00:00:00.0   5000    2450    2450    1300    2450    1300    1300
10      1982-01-23 00:00:00.0   1300    1300    1300    1300    1300    1300    1300
20      1980-12-17 00:00:00.0   800     800     800     800     800     800     800
20      1981-04-02 00:00:00.0   2975    800     800     800     800     800     1100
20      1981-12-03 00:00:00.0   3000    800     800     800     800     800     1100
20      1987-04-19 00:00:00.0   3000    800     800     800     800     800     1100
20      1987-05-23 00:00:00.0   1100    800     800     800     1100    1100    1100
30      1981-02-20 00:00:00.0   1600    1600    1600    950     1600    1250    950
30      1981-02-22 00:00:00.0   1250    1250    1250    950     1250    1250    950
30      1981-05-01 00:00:00.0   2850    1250    1250    950     1250    1250    950
30      1981-09-08 00:00:00.0   1500    1250    1250    950     1250    1250    950
30      1981-09-28 00:00:00.0   1250    1250    1250    950     1250    950     950
30      1981-12-03 00:00:00.0   950     950     950     950     950     950     950


max函数的使用

-- Maximum salary per department over the same six window frames as the sum example.
select
    deptno,
    hiredate,
    sal,
    -- sal1: partition start through the current row (no explicit frame given)
    max(sal) over (partition by deptno order by hiredate) as sal1,
    -- sal2: partition start through the current row, explicit frame
    max(sal) over (
        partition by deptno
        order by hiredate
        rows between unbounded preceding and current row
    ) as sal2,
    -- sal3: maximum over the whole partition
    max(sal) over (partition by deptno) as sal3,
    -- sal4: current row plus the 3 rows before it
    max(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and current row
    ) as sal4,
    -- sal5: 3 rows before, the current row, and 1 row after
    max(sal) over (
        partition by deptno
        order by hiredate
        rows between 3 preceding and 1 following
    ) as sal5,
    -- sal6: current row plus every row after it in the partition
    max(sal) over (
        partition by deptno
        order by hiredate
        rows between current row and unbounded following
    ) as sal6
from emp_function
order by deptno, hiredate;

deptno    hiredate               sal     sal1    sal2    sal3     sal4   sal5    sal6
10      1981-06-09 00:00:00.0   2450    2450    2450    5000    2450    5000    5000
10      1981-11-17 00:00:00.0   5000    5000    5000    5000    5000    5000    5000
10      1982-01-23 00:00:00.0   1300    5000    5000    5000    5000    5000    1300
20      1980-12-17 00:00:00.0   800     800     800     3000    800     2975    3000
20      1981-04-02 00:00:00.0   2975    2975    2975    3000    2975    3000    3000
20      1981-12-03 00:00:00.0   3000    3000    3000    3000    3000    3000    3000
20      1987-04-19 00:00:00.0   3000    3000    3000    3000    3000    3000    3000
20      1987-05-23 00:00:00.0   1100    3000    3000    3000    3000    3000    1100
30      1981-02-20 00:00:00.0   1600    1600    1600    2850    1600    1600    2850
30      1981-02-22 00:00:00.0   1250    1600    1600    2850    1600    2850    2850
30      1981-05-01 00:00:00.0   2850    2850    2850    2850    2850    2850    2850
30      1981-09-08 00:00:00.0   1500    2850    2850    2850    2850    2850    1500
30      1981-09-28 00:00:00.0   1250    2850    2850    2850    2850    2850    1250
30      1981-12-03 00:00:00.0   950     2850    2850    2850    2850    2850    950
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  hive 函数 分析函数