From 9b53b837832670ea68b56cbd61f55d9f32b875d5 Mon Sep 17 00:00:00 2001 From: Huang Wei Date: Thu, 18 Aug 2022 09:58:28 +0800 Subject: [PATCH] docs: window and udf fix --- docs/zh/reference/sql/dql/WINDOW_CLAUSE.md | 42 +++++++++++++++--- .../udf/default_defs/window_functions_def.cc | 44 ++++++++++++------- 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/docs/zh/reference/sql/dql/WINDOW_CLAUSE.md b/docs/zh/reference/sql/dql/WINDOW_CLAUSE.md index 7e733895ca3..168cfe8ac82 100644 --- a/docs/zh/reference/sql/dql/WINDOW_CLAUSE.md +++ b/docs/zh/reference/sql/dql/WINDOW_CLAUSE.md @@ -72,18 +72,50 @@ function_name ( * ) OVER window_name ## SQL语句模版 -- ROWS WINDOW SQL模版 +- ROWS WINDOW(条数窗口) SQL模版 ```sql SELECT select_expr [, select_expr ...], window_function_name(expr) OVER window_name, ... FROM ... WINDOW AS window_name (PARTITION BY ... ORDER BY ... ROWS BETWEEN ... AND ...) ``` -- ROWS RANGE WINDOW SQL模版 +- ROWS RANGE WINDOW(时间窗口) SQL模版 ```sql SELECT select_expr [,select_expr...], window_function_name(expr) OVER window_name, ... FROM ... WINDOW AS window_name (PARTITION BY ... ORDER BY ... ROWS_RANEG BETWEEN ... AND ...) ``` +## 快速上手 + +首先选择窗口类型,按时间,还是按条数划分窗口。 + +再看窗口想要什么大小,这里要分窗口类型说明: +1. 时间窗口:时间窗口通常使用s, m, h, d等时间单位,如果没有单位,默认为ms。比如: + + [3小时前,当前行] - 3h preceding and current row + [3小时前,30分钟前] - 3h preceding and 30m preceding + +1. 条数窗口:条数不需要单位。比如: + [10条,当前行] - 10 preceding and current row + [10条,3条] - 10 preceding and 3 preceding + +### 如何推断窗口是什么样的? + +首先,先明确是什么执行模式: + +离线模式,即批模式,它是对from表的每一行都做一次窗口划分与计算。因此,每一行对应产生一行SQL结果。 +请求模式,会带一条请求行,它会将请求行当做from表的数据,只对该行做窗口划分和计算,因此,只产生一行SQL结果。 + +再看,如何划分窗口: + +我们将批模式看作多次请求模式来看待。所以,对一次请求行来说,窗口只可能包含,它自己,与它的partition by列值相等的行(可能的全集)。 + +partition key相等的所有行,还不是窗口,经由order by列排序后,还需要排除窗口范围以外的数据。比如,10 preceding and current row的条数窗口,就要抛弃10行以外的数据行(第10行包含在窗口内),又因为包括current row,于是窗口一共有11行数据。 + +* preceding为闭区间,包含该条,开区间使用open preceding + +窗口还可以exclude current time,current row等,详情见下文。 + + ## 边界说明 | SELECT语句元素 | 离线模式 | 在线预览模式 | 在线请求模式 | 说明 | @@ -114,14 +146,14 @@ WindowFrameUnits | 'ROWS_RANGE' ``` -WindowFrameUnits定义了窗口的框架类型。OpenMLDB支持两类窗口框架:ROWS和ROWS_RANGE +WindowFrameUnits定义了窗口的框架类型。OpenMLDB支持两类窗口框架:ROWS和ROWS_RANGE。 SQL标准的RANGE类窗口OpenMLDB系统目前暂不支持。他们直接的对比差异如下图所示 ![Figure 1: window frame type](../dql/images/window_frame_type.png) -- ROWS: 窗口按行划入窗口,根据条数滑出窗口 -- ROWS_RANGE:窗口按行划入窗口,根据时间区间滑出窗口 +- ROWS: 窗口按行划入窗口,根据**条数**滑出窗口 +- ROWS_RANGE:窗口按行划入窗口,根据**时间区间**滑出窗口 - RANGE: 窗口按时间粒度划入窗口(一次可能滑入多条同一时刻的数据行),按时间区间滑出窗口 ### Window Frame Extent diff --git a/hybridse/src/udf/default_defs/window_functions_def.cc b/hybridse/src/udf/default_defs/window_functions_def.cc index 03aa9c39015..10b067478c9 100644 --- a/hybridse/src/udf/default_defs/window_functions_def.cc +++ b/hybridse/src/udf/default_defs/window_functions_def.cc @@ -92,11 +92,13 @@ node::ExprNode* BuildAt(UdfResolveContext* ctx, ExprNode* input, ExprNode* idx, } template -void RegisterBaseListAt(UdfLibrary* lib) { - lib->RegisterExternal("at") +void RegisterBaseListLag(UdfLibrary* lib) { + lib->RegisterExternal("lag") .doc(R"( @brief Returns value evaluated at the row that is offset rows before the current row within the partition. Offset is evaluated with respect to the current row - + Note: This function equals the `at()` function. + But old `at()` is start from the last row of window(may not be the current row) + @param offset The number of rows forwarded from the current row, must not negative Example: @@ -109,7 +111,16 @@ void RegisterBaseListAt(UdfLibrary* lib) { |3 | 2| |4 | 2| @code{.sql} - SELECT at(c1, 1) as co OVER w from t1 window (order by c1 partition by c2); + SELECT lag(c1, 1) over w as co from t1 window w as(partition by c2 order by c1 rows between unbounded preceding and current row); + -- output + -- | co | + -- |----| + -- |NULL| + -- |0 | + -- |NULL| + -- |2 | + -- |3 | + SELECT at(c1, 1) over w as co from t1 window w as(partition by c2 order by c1 rows between unbounded preceding and current row); -- output -- | co | -- |----| @@ -119,6 +130,7 @@ void RegisterBaseListAt(UdfLibrary* lib) { -- |2 | -- |3 | @endcode + )") .args, int64_t>(reinterpret_cast(AtList)) .return_by_arg(true) @@ -127,23 +139,23 @@ void RegisterBaseListAt(UdfLibrary* lib) { void DefaultUdfLibrary::InitWindowFunctions() { // basic at impl for , int32> - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); - RegisterBaseListAt(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); + RegisterBaseListLag(this); - // general at - RegisterExprUdf("at").list_argument_at(0).args( + // general lag + RegisterExprUdf("lag").list_argument_at(0).args( [](UdfResolveContext* ctx, ExprNode* input, ExprNode* idx) { return BuildAt(ctx, input, idx, nullptr); }); - RegisterAlias("lag", "at"); + RegisterAlias("at", "lag"); RegisterExprUdf("first_value") .list_argument_at(0) .args([](UdfResolveContext* ctx, ExprNode* input) {