Skip to content

Commit

Permalink
lpad (#113)
Browse files Browse the repository at this point in the history
* ARROW-12567: [C++][Gandiva] Implement LPAD and RPAD functions for string input values

- LPAD([string] basetext, [number] x, [optional string] padtext)
- RPAD([string] basetext, [number] x, [optional string] padtext)

lpad - Prepends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are prepended.

rpad - Appends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are appended.

Closes apache#10173 from jpedroantunes/feature/lpad-rpad-functions and squashes the following commits:

4efc0fe <João Pedro> Add utf8_length method that ignore invalid char considering size 1
33a5a14 <João Pedro> Fix identation on function string registry
4c4b2f4 <João Pedro> Change lpad and rpad functions signature and definition
26b90b0 <João Pedro> Correct ci lint errors on gandiva
66594a0 <João Pedro> Correct lint local errors on gandiva
b6b63e9 <João Pedro> Add projector test for RPAD string function
dc72148 <João Pedro> Add function registry for RPAD string function without pad text
c270fb1 <João Pedro> Add base implementation and tests for RPAD functions
08d2053 <João Pedro> Add function registry for LPAD string function without pad text
585cad3 <João Pedro> Add base implementation and tests for LPAD function without pad texts considering string input values
73927fc <João Pedro> Add projector test for LPAD string function
2c929a9 <João Pedro> Add function registry for LPAD string function
aecaff6 <João Pedro> Add base implementation and tests for LPAD function considering string input values

Authored-by: João Pedro <[email protected]>
Signed-off-by: Praveen <[email protected]>

* ARROW-13780: [Gandiva][UDF] Fix bug in udf space/rpad/lpad

- add max/min return length for space/lpad/rpad udfs
- correct return length

Closes apache#11016 from ZMZ91/bugfix/limit_return_chars_count

Authored-by: ZMZ <[email protected]>
Signed-off-by: Pindikura Ravindra <[email protected]>
Signed-off-by: Yuan Zhou <[email protected]>

* fix concat

Signed-off-by: Yuan Zhou <[email protected]>

Co-authored-by: João Pedro <[email protected]>
Co-authored-by: ZMZ <[email protected]>
  • Loading branch information
3 people authored Jun 14, 2022
1 parent 6a43921 commit 2ca3236
Show file tree
Hide file tree
Showing 7 changed files with 47,708 additions and 1 deletion.
14 changes: 14 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,20 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
NativeFunction::kNeedsFunctionHolder |
NativeFunction::kCanReturnErrors),

NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
kResultNullIfNull, "lpad_utf8_int32_utf8",
NativeFunction::kNeedsContext),

NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),

NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
kResultNullIfNull, "rpad_utf8_int32_utf8",
NativeFunction::kNeedsContext),

NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),

NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "concatOperator_utf8_utf8",
NativeFunction::kNeedsContext),
Expand Down
217 changes: 216 additions & 1 deletion cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
return count;
}

// Count the number of utf8 characters in `data`, tolerating malformed input.
//
// data     - bytes to scan (not required to be valid utf8)
// data_len - number of bytes in `data`
//
// A byte that is not a valid head byte, or a head byte whose glyph would run past
// the end of the buffer, is counted as one character of size 1. While checking the
// bytes that follow a head byte, every byte that is not a continuation byte
// (10xxxxxx) widens the current glyph by one byte instead of starting a new
// character.
//
// The scan never reads outside [data, data + data_len).
FORCE_INLINE
gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
  int char_len = 0;
  int count = 0;
  for (int i = 0; i < data_len; i += char_len) {
    char_len = utf8_char_length(data[i]);
    if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
      // if invalid byte or incomplete glyph, ignore it
      char_len = 1;
    }
    // char_len may grow inside this loop, so bounding it by char_len alone is not
    // enough: a malformed tail would keep widening the glyph and read past the end
    // of the buffer. Stopping at data_len does not change the resulting count,
    // because the outer loop terminates as soon as i reaches data_len anyway.
    for (int j = 1; j < char_len && i + j < data_len; ++j) {
      if ((data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
        char_len += 1;
      }
    }
    ++count;
  }
  return count;
}

// Get the byte position corresponding to a character position for a non-empty utf8
// sequence
FORCE_INLINE
Expand Down Expand Up @@ -281,6 +302,37 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
return ret;
}

// Bounds applied to the requested output length by space_int32, space_int64,
// lpad_utf8_int32_utf8 and rpad_utf8_int32_utf8. Requests outside
// [min_str_length, max_str_length] are clamped to avoid exceptions.
static const gdv_int32 max_str_length = 65536;
static const gdv_int32 min_str_length = 0;

// Defines space_<IN_TYPE>(ctx, n, out_len), which returns a string of 'n' spaces.
// 'n' is first clamped into [min_str_length, max_str_length]. When the clamped
// count is not positive, or when the allocation fails (an error message is then set
// on the context), the function returns "" and stores 0 in *out_len.
#define SPACE_STR(IN_TYPE) \
  GANDIVA_EXPORT \
  const char* space_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE n, int32_t* out_len) { \
    const gdv_##IN_TYPE upper_bound = static_cast<gdv_##IN_TYPE>(max_str_length); \
    const gdv_##IN_TYPE lower_bound = static_cast<gdv_##IN_TYPE>(min_str_length); \
    const gdv_##IN_TYPE clamped = std::max(lower_bound, std::min(upper_bound, n)); \
    const gdv_int32 space_count = static_cast<gdv_int32>(clamped); \
    *out_len = 0; \
    if (space_count <= 0) { \
      return ""; \
    } \
    char* spaces = \
        reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, space_count)); \
    if (spaces == nullptr) { \
      gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string"); \
      return ""; \
    } \
    for (char* cursor = spaces; cursor != spaces + space_count; ++cursor) { \
      *cursor = ' '; \
    } \
    *out_len = space_count; \
    return spaces; \
  }

SPACE_STR(int32)
SPACE_STR(int64)

// Reverse a utf8 sequence
FORCE_INLINE
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
Expand Down Expand Up @@ -767,11 +819,13 @@ const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32
bool in3_validity, gdv_int32* out_len) {
if (!in1_validity) {
in1_len = 0;
in2_len = 0;
}
if (!in2_validity) {
if (!in2_validity || (!in1_validity && !in3_validity)) {
in2_len = 0;
}
if (!in3_validity) {
in2_len = 0;
in3_len = 0;
}
return concatOperator_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3, in3_len,
Expand Down Expand Up @@ -1424,6 +1478,167 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
out_len);
}

// Computes the size in bytes of the buffer needed to pad a text up to
// `return_length` characters using `fill_text`.
//
// text_len        - size of the base text in bytes
// actual_text_len - size of the base text in utf8 characters
// return_length   - requested size of the padded result in characters
// fill_text       - pad text
// fill_text_len   - size of the pad text in bytes
//
// Returns text_len, plus fill_text_len for every whole repetition of the pad text,
// plus the bytes of the leading pad characters that cover the remainder. When there
// is nothing to pad (no pad characters are needed, or the pad text holds no
// characters) it returns text_len.
FORCE_INLINE
gdv_int32 evaluate_return_char_length(gdv_int32 text_len, gdv_int32 actual_text_len,
                                      gdv_int32 return_length, const char* fill_text,
                                      gdv_int32 fill_text_len) {
  const gdv_int32 pad_chars = return_length - actual_text_len;
  const gdv_int32 fill_actual_text_len =
      utf8_length_ignore_invalid(fill_text, fill_text_len);
  // An empty pad text would make the division and modulo below divide by zero.
  if (pad_chars <= 0 || fill_actual_text_len <= 0) {
    return text_len;
  }

  const gdv_int32 repeat_times = pad_chars / fill_actual_text_len;
  const gdv_int32 mod = pad_chars % fill_actual_text_len;
  gdv_int32 return_char_length = repeat_times * fill_text_len + text_len;

  // Add the bytes of the first `mod` characters of the pad text.
  gdv_int32 fill_index = 0;
  for (gdv_int32 i = 0; i < mod && fill_index < fill_text_len; i++) {
    gdv_int32 char_len = utf8_char_length(fill_text[fill_index]);
    // An invalid head byte counts as a character of size 1, matching how the pad
    // functions copy it; counting it as 0 bytes would undersize the buffer.
    if (char_len == 0) char_len = 1;
    // Do not count bytes of a glyph that is cut off by the end of the pad text.
    if (char_len > fill_text_len - fill_index) char_len = fill_text_len - fill_index;
    fill_index += char_len;
    return_char_length += char_len;
  }
  return return_char_length;
}

// Left-pads `text` with `fill_text` so that the result holds `return_length`
// characters.
//
// context       - execution context used for arena allocation and error reporting
// text          - base text; text_len is its size in bytes
// return_length - requested result size in characters; clamped into
//                 [min_str_length, max_str_length]
// fill_text     - pad text; fill_text_len is its size in bytes
// out_len       - receives the size of the result in bytes
//
// Returns "" when the text is empty or the clamped return_length is not positive.
// Returns `text` itself (no copy) when no padding is possible or needed, using a
// shorter *out_len when the text has to be truncated to return_length characters.
// Otherwise returns an arena-allocated buffer holding the pad text repeated (and cut
// at a character boundary) followed by the text. On allocation failure an error is
// set on the context and "" is returned.
FORCE_INLINE
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  return_length = std::min(max_str_length, return_length);
  return_length = std::max(min_str_length, return_length);
  if (text_len == 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int actual_text_len = utf8_length_ignore_invalid(text, text_len);

  if (return_length == actual_text_len ||
      (return_length > actual_text_len && fill_text_len <= 0)) {
    // case where the return length is same as the text's length, or if it need to
    // fill into text but "fill_text" is empty, then return text directly.
    *out_len = text_len;
    return text;
  } else if (return_length < actual_text_len) {
    // case where it truncates the result on return length.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  } else {
    // case (return_length > actual_text_len)
    // case where it needs to copy "fill_text" on the string left. The total number
    // of chars to copy is given by (return_length - actual_text_len)
    gdv_int32 return_char_length = evaluate_return_char_length(
        text_len, actual_text_len, return_length, fill_text, fill_text_len);
    // Number of pad bytes the allocation below has room for. The fill loop is driven
    // by this budget rather than by re-counting characters, so it can never write
    // more than was allocated nor read past the end of fill_text, even when
    // fill_text is not valid utf8.
    const gdv_int32 pad_bytes = return_char_length - text_len;
    if (pad_bytes <= 0) {
      *out_len = text_len;
      return text;
    }
    char* ret = reinterpret_cast<gdv_binary>(
        gdv_fn_context_arena_malloc(context, return_char_length));
    if (ret == nullptr) {
      gdv_fn_context_set_error_msg(context,
                                   "Could not allocate memory for output string");
      *out_len = 0;
      return "";
    }
    // tile "fill_text" over the pad region: whole copies first, then the leading
    // bytes that are still needed
    gdv_int32 written = 0;
    while (written < pad_bytes) {
      const gdv_int32 chunk = std::min(fill_text_len, pad_bytes - written);
      memcpy(ret + written, fill_text, chunk);
      written += chunk;
    }
    // after fulfilling the text, copy the main string
    memcpy(ret + written, text, text_len);
    *out_len = written + text_len;
    return ret;
  }
}

// Right-pads `text` with `fill_text` so that the result holds `return_length`
// characters.
//
// context       - execution context used for arena allocation and error reporting
// text          - base text; text_len is its size in bytes
// return_length - requested result size in characters; clamped into
//                 [min_str_length, max_str_length]
// fill_text     - pad text; fill_text_len is its size in bytes
// out_len       - receives the size of the result in bytes
//
// Returns "" when the text is empty or the clamped return_length is not positive.
// Returns `text` itself (no copy) when no padding is possible or needed, using a
// shorter *out_len when the text has to be truncated to return_length characters.
// Otherwise returns an arena-allocated buffer holding the text followed by the pad
// text repeated (and cut at a character boundary). On allocation failure an error is
// set on the context and "" is returned.
FORCE_INLINE
const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  return_length = std::min(max_str_length, return_length);
  return_length = std::max(min_str_length, return_length);
  if (text_len == 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int actual_text_len = utf8_length_ignore_invalid(text, text_len);

  if (return_length == actual_text_len ||
      (return_length > actual_text_len && fill_text_len <= 0)) {
    // case where the return length is same as the text's length, or if it need to
    // fill into text but "fill_text" is empty, then return text directly.
    *out_len = text_len;
    return text;
  } else if (return_length < actual_text_len) {
    // case where it truncates the result on return length.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  } else {
    // case (return_length > actual_text_len)
    // case where it needs to copy "fill_text" on the string right
    gdv_int32 return_char_length = evaluate_return_char_length(
        text_len, actual_text_len, return_length, fill_text, fill_text_len);
    // Number of pad bytes the allocation below has room for. The fill loop is driven
    // by this budget rather than by re-counting characters, so it can never write
    // more than was allocated nor read past the end of fill_text, even when
    // fill_text is not valid utf8.
    const gdv_int32 pad_bytes = return_char_length - text_len;
    if (pad_bytes <= 0) {
      *out_len = text_len;
      return text;
    }
    char* ret = reinterpret_cast<gdv_binary>(
        gdv_fn_context_arena_malloc(context, return_char_length));
    if (ret == nullptr) {
      gdv_fn_context_set_error_msg(context,
                                   "Could not allocate memory for output string");
      *out_len = 0;
      return "";
    }
    // fulfill the initial text copying the main input string
    memcpy(ret, text, text_len);
    // tile "fill_text" over the pad region: whole copies first, then the leading
    // bytes that are still needed
    gdv_int32 written = 0;
    while (written < pad_bytes) {
      const gdv_int32 chunk = std::min(fill_text_len, pad_bytes - written);
      memcpy(ret + text_len + written, fill_text, chunk);
      written += chunk;
    }
    *out_len = written + text_len;
    return ret;
  }
}

// lpad without an explicit pad text: forwards to lpad_utf8_int32_utf8 using a
// single space as the pad text.
FORCE_INLINE
const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* default_fill = " ";
  const gdv_int32 default_fill_len = 1;
  return lpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
                              default_fill_len, out_len);
}

// rpad without an explicit pad text: forwards to rpad_utf8_int32_utf8 using a
// single space as the pad text.
FORCE_INLINE
const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* default_fill = " ";
  const gdv_int32 default_fill_len = 1;
  return rpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
                              default_fill_len, out_len);
}

FORCE_INLINE
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
Expand Down
Loading

0 comments on commit 2ca3236

Please sign in to comment.