feat: add regexp_match_substring, regexp_strpos, and regexp_count_substring
substrait-io · Aug 17, 2022 · d1598bd · d1598bd
1 parent dc677c2
commit d1598bd
Showing 1 changed file with 194 additions and 0 deletions.
diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml
@@ -46,6 +46,67 @@ scalar_functions:
           - value: i32
           - value: i32
         return: "string"
+  -
+    name: regexp_match_substring
+    description: >-
+      Extract a substring that matches the given regular expression pattern. The regular expression
+      pattern should follow the International Components for Unicode implementation
+      (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The occurrence of the
+      pattern to be extracted is specified using the `occurrence` argument. Specifying `1` means
+      the first occurrence will be extracted, `2` means the second occurrence, and so on.
+      The `occurrence` argument should be a positive non-zero integer. The number of characters
+      from the beginning of the string at which to begin searching for pattern matches can be
+      specified using the `position` argument. Specifying `1` means to search for matches
+      starting at the first character of the input string, `2` means the second character, and so
+      on. The `position` argument should be a positive non-zero integer.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile, the occurrence value is out of range, or
+      the position value is out of range.
+    impls:
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "varchar<L1>"
+            name: "input"
+          - value: "varchar<L2>"
+            name: "pattern"
+          - value: i64
+            name: "position"
+          - value: i64
+            name: "occurrence"
+        return: "varchar<L1>"
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+          - value: i64
+            name: "position"
+          - value: i64
+            name: "occurrence"
+        return: "string"
   -
     name: starts_with
     description: Whether this string starts with another string.
@@ -196,6 +257,69 @@ scalar_functions:
             name: "substring"
             description: The substring to search for.
         return: i64
+  -
+    name: regexp_strpos
+    description: >-
+      Return the position of an occurrence of the given regular expression pattern in a
+      string. The first character of the string is at position 1. The regular expression pattern
+      should follow the International Components for Unicode implementation
+      (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The number of characters
+      from the beginning of the string at which to begin searching for pattern matches can be
+      specified using the `position` argument. Specifying `1` means to search for matches
+      starting at the first character of the input string, `2` means the second character, and so
+      on. The `position` argument should be a positive non-zero integer. Which occurrence to
+      return the position of is specified using the `occurrence` argument. Specifying `1` means
+      the position of the first occurrence will be returned, `2` means the position of the second
+      occurrence, and so on. The `occurrence` argument should be a positive non-zero integer. If
+      no occurrence is found, 0 is returned.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile, the occurrence value is out of range, or
+      the position value is out of range.
+    impls:
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "varchar<L1>"
+            name: "input"
+          - value: "varchar<L2>"
+            name: "pattern"
+          - value: i64
+            name: "position"
+          - value: i64
+            name: "occurrence"
+        return: i64
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+          - value: i64
+            name: "position"
+          - value: i64
+            name: "occurrence"
+        return: i64
   -
     name: count_substring
     description: Return the number of non-overlapping occurrences of a substring in an input string.
@@ -224,6 +348,76 @@ scalar_functions:
             name: "substring"
             description: The substring to count.
         return: i64
+  -
+    name: regexp_count_substring
+    description: >-
+      Return the number of non-overlapping occurrences of a regular expression pattern in an input
+      string. The regular expression pattern should follow the International Components for
+      Unicode implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html).
+      The number of characters from the beginning of the string at which to begin searching for
+      pattern matches can be specified using the `position` argument. Specifying `1` means to
+      search for matches starting at the first character of the input string, `2` means the
+      second character, and so on. The `position` argument should be a positive non-zero integer.
+
+      The `case_sensitivity` option specifies case-sensitive or case-insensitive matching.
+      Enabling the `multiline` option will treat the input string as multiple lines. This makes
+      the `^` and `$` characters match at the beginning and end of any line, instead of just the
+      beginning and end of the input string. Enabling the `dotall` option makes the `.` character
+      match line terminator characters in a string.
+
+      Behavior is undefined if the regex fails to compile or the position value is out of range.
+    impls:
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "string"
+            name: "input"
+          - value: "string"
+            name: "pattern"
+          - value: i64
+            name: "position"
+        return: i64
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "varchar<L1>"
+            name: "input"
+          - value: "varchar<L2>"
+            name: "pattern"
+          - value: i64
+            name: "position"
+        return: i64
+      - args:
+          - name: case_sensitivity
+            options: [ CASE_SENSITIVE, CASE_INSENSITIVE]
+            required: false
+          - name: multiline
+            options: [ MULTILINE_DISABLED, MULTILINE_ENABLED]
+            required: false
+          - name: dotall
+            options: [ DOTALL_DISABLED, DOTALL_ENABLED]
+            required: false
+          - value: "fixedchar<L1>"
+            name: "input"
+          - value: "fixedchar<L2>"
+            name: "pattern"
+          - value: i64
+            name: "position"
+        return: i64
   -
     name: replace
     description: >-