Grouping Sets主要是用一条sql高效统计出在不同维度字段汇总输出结果,代替union all的冗余低效
1.clickhouse的写法
--group by与grouping sets 之间不带分组字段,根据grouping sets的字段组合汇总
例如:
select substr(dt,1,7),substr(dt,1,4) ,sum(ct)from
(
select 1 as id,'2023-02-03' as dt, 1 ct
union all
select 2 as id,'2023-01-02' as dt, 2 as ct
)
group by
grouping sets(
(substr(dt,1,7)),
(substr(dt,1,4))
)
--cube立方体的组合,更简洁,【datacenter,distro,version】不同组合排列分组汇总
SELECT
datacenter,
distro,
version,
SUM(quantity)
FROM
(
select 1 as datacenter,'2023-02-03' as distro, 1 as version,10 as quantity
union all
select 2 as datacenter,'2023-01-02' as distro, 2 as version,20 as quantity
)
GROUP BY
CUBE(datacenter,distro,version)
ORDER BY
datacenter,
distro,
version
2.presto的写法
与clickhouse的写法一样,不需要group by与grouping sets 之间分组字段
例如:
select
cast(first_startapp_date as varchar) as statistics_date
,statistics_week
,statistics_month
,'整体' as user_type
,count(distinct user_id) as new_app_ucnt
,count(distinct if(is_1day = 1,user_id,null)) as new_app_ucnt_1day
,count(distinct if(is_7day = 1,user_id,null)) as new_app_ucnt_7day
,count(distinct if(is_1week = 1,user_id,null)) as new_app_ucnt_1week
,count(distinct if(is_1month = 1,user_id,null)) as new_app_ucnt_1month
,count(distinct if(is_retention_1day = 1,user_id,null)) as retention_1day_ucnt
,count(distinct if(is_retention_7day = 1,user_id,null)) as retention_7day_ucnt
,count(distinct if(is_retention_1week = 1,user_id,null)) as retention_1week_ucnt
,count(distinct if(is_retention_1month = 1,user_id,null)) as retention_1month_ucnt
from hive2_cdh.tmp.tmp_ll_test
where statistics_month='2023-07'
group by
grouping sets (
(first_startapp_date)
,(statistics_week)
,(statistics_month)
)
3.hive的写法
group by与grouping sets 之间必须带有分组字段,且必须出现在select里
样例:
select
a.receive_date_start
,a.receive_week_start
,a.receive_month_start
,a.user_date_label
,a.user_week_label
,a.user_month_label
,a.user_date_classification
,a.user_week_classification
,a.user_month_classification
,count(distinct a.parent_user_id) as start_app_uv
from tmp_xb.tmp_ll_0713 as a
where receive_date_start>='2023-06-27'
and receive_date_start<='2023-07-03'
group by
a.receive_date_start
,a.receive_week_start
,a.receive_month_start
,a.user_date_label
,a.user_week_label
,a.user_month_label
,a.user_date_classification
,a.user_week_classification
,a.user_month_classification
grouping sets
(
(a.receive_date_start
,a.user_date_label
,a.user_date_classification)
,(a.receive_week_start
,a.user_week_label
,a.user_week_classification)
,(a.receive_month_start
,a.user_month_label
,a.user_month_classification)
)