count(distinct hotel_seq) size(collect_set(hotel_seq)) 性能对比
优化措施
可使用 size(collect_set(hotel_seq)) 替换 count(distinct hotel_seq)
注:size(collect_set()) 在去重统计字段量级很大时,可能会OOM
执行计划对比
count(distinct )需要进行两次shuffle,第一次去重,第二次统计去重数
size(collect_set())只需进行一次shuffle
explain select dt,count(distinct ticket_type_id) from ticket.dwd_ticket_choose_product_result_log where dt='2022-03-01' group by 1;
operation: QUERY
INFO : Starting task [Stage-2:EXPLAIN] in serial mode
INFO : Completed executing command(queryId=qhstats_20220302195445_c456e147-3222-4652-954d-77910a510823); Time taken: 0.089 seconds
INFO : OK
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: dwd_ticket_choose_product_result_log
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: ticket_type_id (type: string)
outputColumnNames: ticket_type_id
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count(DISTINCT ticket_type_id)
keys: '2022-03-01' (type: string), ticket_type_id (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Reduce Operator Tree:
Group By Operator
aggregations: count(DISTINCT KEY._col1:0._col0)
keys: KEY._col0 (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
47 rows selected (0.254 seconds)
explain select dt,size(collect_set(ticket_type_id)) from ticket.dwd_ticket_choose_product_result_log where dt='2022-03-01' group by 1;
operation: QUERY
INFO : Starting task [Stage-2:EXPLAIN] in serial mode
INFO : Completed executing command(queryId=qhstats_20220302195600_c96e09de-550d-4858-a9c9-70c3e0675c78); Time taken: 0.09 seconds
INFO : OK
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: dwd_ticket_choose_product_result_log
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: ticket_type_id (type: string)
outputColumnNames: ticket_type_id
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: collect_set(ticket_type_id)
keys: '2022-03-01' (type: string)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: '2022-03-01' (type: string)
sort order: +
Map-reduce partition columns: '2022-03-01' (type: string)
Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
value expressions: _col1 (type: array<string>)
Reduce Operator Tree:
Group By Operator
aggregations: collect_set(VALUE._col0)
keys: '2022-03-01' (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: '2022-03-01' (type: string), size(_col1) (type: int)
outputColumnNames: _col0, _col1
Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
52 rows selected (0.251 seconds)
窗口函数对性能的影响
窗口函数相比于join子查询无性能损失
select
report_date,
win_payment,
hotel_level,
lose_limit_type,
lose_fmjd,
lose_fg_to_pp,
lose_reason,
cnt,
max(cnt) over(partition by report_date) total_cnt,
hotel_num,
max(hotel_num) over(partition by report_date) total_hotel_num,
product_num,
max(product_num) over(partition by report_date) total_product_num
from tmp_tb_${hour} b
------------------------------------------------------------------------
select
a.report_date,
a.win_payment,
a.hotel_level,
a.lose_limit_type,
a.lose_fmjd,
a.lose_fg_to_pp,
a.lose_reason,
a.cnt,
b.cnt total_cnt,
a.hotel_num,
b.hotel_num total_hotel_num,
a.product_num,
b.product_num total_product_num
from tmp_tb_${hour} a
left join (
select
report_date,
cnt,
hotel_num,
product_num
from tmp_tb_${hour}
where win_payment='ALL'
and hotel_level='ALL'
and lose_limit_type='ALL'
and lose_fmjd='ALL'
and lose_fg_to_pp='ALL'
and lose_reason='ALL'
group by 1,2,3,4
) b on a.report_date=b.report_date;
多维对性能的影响
with cube替换
select
case when win_payment is not null then win_payment else 'ALL' end win_payment,
case when hotel_level is not null then hotel_level else 'ALL' end hotel_level,
case when lose_limit_type is not null then lose_limit_type else 'ALL' end lose_limit_type,
case when lose_fmjd is not null then lose_fmjd else 'ALL' end lose_fmjd,
case when lose_fg_to_pp is not null then lose_fg_to_pp else 'ALL' end lose_fg_to_pp,
case when lose_reason is not null then lose_reason else 'ALL' end lose_reason,
count(*) cnt,
--count(distinct hotel_seq) hotel_num,
size(collect_set(hotel_seq)) hotel_num,
--count(distinct lose_product_name) product_num,
size(collect_set(lose_product_name)) product_num,
grouping__id,
'$FDATE_SUB0DAY ' as report_date
from (
select
dt,
hour,
case when win_payment in ('0','1') then '现付' when win_payment='2' then '预付' else '其他' end win_payment,
case when hotel_level='1' then '一星' when hotel_level='2' then '三星' when hotel_level='3' then '四星' when hotel_level='4' then '五星' when hotel_level='5' then '二星' when hotel_level='0' then '未知' else '其他' end hotel_level,
case when nvl(lose_limit_type,'') != '' then lose_limit_type else '其他' end lose_limit_type,
case when lose_fmjd='1' then '是' when lose_fmjd='0' then '否' end lose_fmjd,
case when lose_fg_to_pp='1' then '是' when lose_fg_to_pp='0' then '否' end lose_fg_to_pp,
case when lose_reason !='' then lose_reason else '其他' end lose_reason,
hotel_seq,
lose_product_name
from ods_select_product_log_v3
where dt='$DATE'
)a
group by win_payment,hotel_level,lose_limit_type,lose_fmjd,lose_fg_to_pp,lose_reason
with cube
--order by grouping__id
select
win_payment_t win_payment,
hotel_level_t hotel_level,
lose_limit_type_t lose_limit_type,
lose_fmjd_t lose_fmjd,
lose_fg_to_pp_t lose_fg_to_pp,
lose_reason_t lose_reason,
count(*) cnt,
--count(distinct hotel_seq) hotel_num,
size(collect_set(hotel_seq)) hotel_num,
--count(distinct lose_product_name) product_num,
size(collect_set(lose_product_name)) product_num,
'$FORMAT_DATE' as report_date
from
(
select
dt,
hour,
case when win_payment in ('0','1') then '现付' when win_payment='2' then '预付' else '其他' end win_payment,
case when hotel_level='1' then '一星' when hotel_level='2' then '三星' when hotel_level='3' then '四星' when hotel_level='4' then '五星' when hotel_level='5' then '二星' when hotel_level='0' then '未知' else '其他' end hotel_level,
case when nvl(lose_limit_type,'') != '' then lose_limit_type else '其他' end lose_limit_type,
case when lose_fmjd='1' then '是' when lose_fmjd='0' then '否' else 'NULL' end lose_fmjd,
case when lose_fg_to_pp='1' then '是' when lose_fg_to_pp='0' then '否' else 'NULL' end lose_fg_to_pp,
case when lose_reason !='' then lose_reason else '其他' end lose_reason,
hotel_seq,
lose_product_name
from ods_select_product_log_v3
where dt='$DATE'
)a
lateral view explode(array('ALL',a.win_payment)) tt1 as win_payment_t
lateral view explode(array('ALL',a.hotel_level)) tt2 as hotel_level_t
lateral view explode(array('ALL',a.lose_limit_type)) tt3 as lose_limit_type_t
lateral view explode(array('ALL',a.lose_fmjd)) tt4 as lose_fmjd_t
lateral view explode(array('ALL',a.lose_fg_to_pp)) tt5 as lose_fg_to_pp_t
lateral view explode(array('ALL',a.lose_reason)) tt6 as lose_reason_t
group by win_payment_t,hotel_level_t,lose_limit_type_t,lose_fmjd_t,lose_fg_to_pp_t,lose_reason_t