hive性能优化

2022-03-02 | 阅读：次

count(distinct hotel_seq) size(collect_set(hotel_seq)) 性能对比

优化措施

可使用 size(collect_set(hotel_seq)) 替换 count(distinct hotel_seq)

注：size(collect_set()) 在去重统计字段量级很大时，可能会OOM

执行计划对比

count(distinct )需要进行两次shuffle，第一次去重，第二次统计去重数
size(collect_set())只需进行一次shuffle

explain select dt,count(distinct ticket_type_id) from ticket.dwd_ticket_choose_product_result_log where dt='2022-03-01' group by 1;

operation: QUERY

INFO  : Starting task [Stage-2:EXPLAIN] in serial mode
INFO  : Completed executing command(queryId=qhstats_20220302195445_c456e147-3222-4652-954d-77910a510823); Time taken: 0.089 seconds
INFO  : OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: dwd_ticket_choose_product_result_log
            Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
            Select Operator
              expressions: ticket_type_id (type: string)
              outputColumnNames: ticket_type_id
              Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: count(DISTINCT ticket_type_id)
                keys: '2022-03-01' (type: string), ticket_type_id (type: string)
                mode: hash
                outputColumnNames: _col0, _col1, _col2
                Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  key expressions: _col0 (type: string), _col1 (type: string)
                  sort order: ++
                  Map-reduce partition columns: _col0 (type: string)
                  Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
      Reduce Operator Tree:
        Group By Operator
          aggregations: count(DISTINCT KEY._col1:0._col0)
          keys: KEY._col0 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

47 rows selected (0.254 seconds)

explain select dt,size(collect_set(ticket_type_id)) from ticket.dwd_ticket_choose_product_result_log where dt='2022-03-01' group by 1;

operation: QUERY

INFO  : Starting task [Stage-2:EXPLAIN] in serial mode
INFO  : Completed executing command(queryId=qhstats_20220302195600_c96e09de-550d-4858-a9c9-70c3e0675c78); Time taken: 0.09 seconds
INFO  : OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: dwd_ticket_choose_product_result_log
            Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
            Select Operator
              expressions: ticket_type_id (type: string)
              outputColumnNames: ticket_type_id
              Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: collect_set(ticket_type_id)
                keys: '2022-03-01' (type: string)
                mode: hash
                outputColumnNames: _col0, _col1
                Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  key expressions: '2022-03-01' (type: string)
                  sort order: +
                  Map-reduce partition columns: '2022-03-01' (type: string)
                  Statistics: Num rows: 111776 Data size: 217915894 Basic stats: COMPLETE Column stats: NONE
                  value expressions: _col1 (type: array<string>)
      Reduce Operator Tree:
        Group By Operator
          aggregations: collect_set(VALUE._col0)
          keys: '2022-03-01' (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
          Select Operator
            expressions: '2022-03-01' (type: string), size(_col1) (type: int)
            outputColumnNames: _col0, _col1
            Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
            File Output Operator
              compressed: false
              Statistics: Num rows: 55888 Data size: 108957947 Basic stats: COMPLETE Column stats: NONE
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

52 rows selected (0.251 seconds)

窗口函数对性能的影响

窗口函数相比于join子查询无性能损失

select
    report_date,
    win_payment,
    hotel_level,
    lose_limit_type,
    lose_fmjd,
    lose_fg_to_pp,
    lose_reason,
    cnt,
    max(cnt) over(partition by report_date) total_cnt,
    hotel_num,
    max(hotel_num) over(partition by report_date) total_hotel_num,
    product_num,
    max(product_num) over(partition by report_date) total_product_num
from tmp_tb_${hour} b
------------------------------------------------------------------------
select
    a.report_date,
    a.win_payment,
    a.hotel_level,
    a.lose_limit_type,
    a.lose_fmjd,
    a.lose_fg_to_pp,
    a.lose_reason,
    a.cnt,
    b.cnt total_cnt,
    a.hotel_num,
    b.hotel_num total_hotel_num,
    a.product_num,
    b.product_num total_product_num
from tmp_tb_${hour} a
left join (
    select
        report_date,
        cnt,
        hotel_num,
        product_num
    from tmp_tb_${hour}
    where win_payment='ALL'
      and hotel_level='ALL'
      and lose_limit_type='ALL'
      and lose_fmjd='ALL'
      and lose_fg_to_pp='ALL'
      and lose_reason='ALL'
    group by 1,2,3,4
) b on a.report_date=b.report_date;

多维对性能的影响

with cube替换

select
    case when win_payment is not null then win_payment else 'ALL' end win_payment,
    case when hotel_level is not null then hotel_level else 'ALL' end hotel_level,
    case when lose_limit_type is not null then lose_limit_type else 'ALL' end lose_limit_type,
    case when lose_fmjd is not null then lose_fmjd else 'ALL' end lose_fmjd,
    case when lose_fg_to_pp is not null then lose_fg_to_pp else 'ALL' end lose_fg_to_pp,
    case when lose_reason is not null then lose_reason else 'ALL' end lose_reason,
    count(*) cnt,
    --count(distinct hotel_seq) hotel_num,
    size(collect_set(hotel_seq)) hotel_num,
    --count(distinct lose_product_name) product_num,
    size(collect_set(lose_product_name)) product_num,
    grouping__id,
    '$FDATE_SUB0DAY ' as report_date
from (
    select
        dt,
        hour,
        case when win_payment in ('0','1') then '现付' when win_payment='2' then '预付' else '其他' end win_payment,
        case when hotel_level='1' then '一星' when hotel_level='2' then '三星' when hotel_level='3' then '四星' when hotel_level='4' then '五星' when hotel_level='5' then '二星' when hotel_level='0' then '未知' else '其他' end hotel_level,
        case when nvl(lose_limit_type,'') != '' then lose_limit_type else '其他' end lose_limit_type,
        case when lose_fmjd='1' then '是' when lose_fmjd='0' then '否' end lose_fmjd,
        case when lose_fg_to_pp='1' then '是' when lose_fg_to_pp='0' then '否' end lose_fg_to_pp,
        case when lose_reason !='' then lose_reason else '其他' end lose_reason,
        hotel_seq,
        lose_product_name
    from ods_select_product_log_v3
    where dt='$DATE'
)a
group by win_payment,hotel_level,lose_limit_type,lose_fmjd,lose_fg_to_pp,lose_reason
with cube
--order by grouping__id

select
    win_payment_t win_payment,
    hotel_level_t hotel_level,
    lose_limit_type_t lose_limit_type,
    lose_fmjd_t lose_fmjd,
    lose_fg_to_pp_t lose_fg_to_pp,
    lose_reason_t lose_reason,
    count(*) cnt,
    --count(distinct hotel_seq) hotel_num,
    size(collect_set(hotel_seq)) hotel_num,
    --count(distinct lose_product_name) product_num,
    size(collect_set(lose_product_name)) product_num,
    '$FORMAT_DATE' as report_date
from
(
    select
        dt,
        hour,
        case when win_payment in ('0','1') then '现付' when win_payment='2' then '预付' else '其他' end win_payment,
        case when hotel_level='1' then '一星' when hotel_level='2' then '三星' when hotel_level='3' then '四星' when hotel_level='4' then '五星' when hotel_level='5' then '二星' when hotel_level='0' then '未知' else '其他' end hotel_level,
        case when nvl(lose_limit_type,'') != '' then lose_limit_type else '其他' end lose_limit_type,
        case when lose_fmjd='1' then '是' when lose_fmjd='0' then '否' else 'NULL' end lose_fmjd,
        case when lose_fg_to_pp='1' then '是' when lose_fg_to_pp='0' then '否' else 'NULL' end lose_fg_to_pp,
        case when lose_reason !='' then lose_reason else '其他' end lose_reason,
        hotel_seq,
        lose_product_name
    from ods_select_product_log_v3
    where dt='$DATE'
)a
lateral view explode(array('ALL',a.win_payment)) tt1 as win_payment_t
lateral view explode(array('ALL',a.hotel_level)) tt2 as hotel_level_t
lateral view explode(array('ALL',a.lose_limit_type)) tt3 as lose_limit_type_t
lateral view explode(array('ALL',a.lose_fmjd)) tt4 as lose_fmjd_t
lateral view explode(array('ALL',a.lose_fg_to_pp)) tt5 as lose_fg_to_pp_t
lateral view explode(array('ALL',a.lose_reason)) tt6 as lose_reason_t
group by win_payment_t,hotel_level_t,lose_limit_type_t,lose_fmjd_t,lose_fg_to_pp_t,lose_reason_t

jacky-wangjj