From 1fd9a7e28e911f7080987979f3c6541d40068b48 Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Mon, 23 Sep 2024 16:22:36 +0100 Subject: [PATCH 01/14] finished tokenizers example Signed-off-by: leanne.laceybyrne@eliatra.com --- .../tokenizers/character-group-tokenizer.md | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 _analyzers/tokenizers/character-group-tokenizer.md diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md new file mode 100644 index 0000000000..e80f26fe59 --- /dev/null +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -0,0 +1,41 @@ +--- +layout: default +title: Character Group Tokenizer +parent: Tokenizers +nav_order: 60 +has_children: false +has_toc: false +--- + +# Character group tokenizer + +The Character Group Tokenizer is designed to segment text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a straightforward tokenization approach is required, avoiding the complexity and overhead associated with pattern-based tokenizers. + +The Character Group Tokenizer accepts the following parameters: +1. `tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. When any character from this set is encountered, a new token is created. For example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. +4. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. + +## Example of the character group tokenizer +``` +POST _analyze +{ + "tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ + "whitespace", + "-", + ":" + ] + }, + "text": "Fast-cars: drive fast!" +} +``` +Summary of the outputted response text: +``` +Fast cars drive fast +``` + + + + + From 0a849c820287f8baf09b72c4c56add199eae4ab7 Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Mon, 23 Sep 2024 16:27:57 +0100 Subject: [PATCH 02/14] updating nav order Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index e80f26fe59..8a5aff1647 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -2,7 +2,7 @@ layout: default title: Character Group Tokenizer parent: Tokenizers -nav_order: 60 +nav_order: 70 has_children: false has_toc: false --- From ecb5e5f0e042a6378b23746d9cd005067c73d1ac Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Wed, 9 Oct 2024 14:08:56 +0100 Subject: [PATCH 03/14] layout cleanup Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index 8a5aff1647..a72a767224 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -16,6 +16,9 @@ The Character Group Tokenizer accepts the following parameters: 4. 
`max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. ## Example of the character group tokenizer + +We can tokenize the on characters such as `whitespace`, `-` and `:`. + ``` POST _analyze { @@ -30,12 +33,9 @@ POST _analyze "text": "Fast-cars: drive fast!" } ``` -Summary of the outputted response text: + +By analyzing the text "Fast-cars: drive fast!", we can see the specified characters have been removed: + ``` Fast cars drive fast ``` - - - - - From 828e7fc604b9f3c69f9b8d7b494dbfbb95971f1d Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Wed, 9 Oct 2024 14:47:17 +0100 Subject: [PATCH 04/14] grammar fix Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index a72a767224..a98a7bf86c 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -19,7 +19,7 @@ The Character Group Tokenizer accepts the following parameters: We can tokenize the on characters such as `whitespace`, `-` and `:`. -``` +```json POST _analyze { "tokenizer": { From def3c62fc6ed88b467c3ab9c0302d7cfd4c37d3f Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Thu, 10 Oct 2024 12:41:24 +0100 Subject: [PATCH 05/14] doc: small update for page numbers Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 2 +- _analyzers/tokenizers/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index a98a7bf86c..c68823f6d7 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -2,7 +2,7 @@ layout: default title: Character Group Tokenizer parent: Tokenizers -nav_order: 70 +nav_order: 20 has_children: false has_toc: false --- diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index e5ac796c12..6bf6fc1aba 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -1,7 +1,7 @@ --- layout: default title: Tokenizers -nav_order: 60 +nav_order: 10 has_children: false has_toc: false redirect_from: From cef551a60dc2bd7aafabb342a37d943c01123567 Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Thu, 10 Oct 2024 12:56:54 +0100 Subject: [PATCH 06/14] layout fix: correct scentence case for all examples Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index c68823f6d7..ca3349c89a 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -9,9 +9,9 @@ has_toc: false # Character group tokenizer -The Character Group Tokenizer is designed to segment text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a straightforward tokenization approach is required, avoiding the complexity and overhead associated with pattern-based tokenizers. 
+The character group tokenizer is designed to segment text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a straightforward tokenization approach is required, avoiding the complexity and overhead associated with pattern-based tokenizers. -The Character Group Tokenizer accepts the following parameters: +The character group tokenizer accepts the following parameters: 1. `tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. When any character from this set is encountered, a new token is created. For example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. 4. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. From d4c1cc43cc1b97d35a5726857239ec7c2c3f2c37 Mon Sep 17 00:00:00 2001 From: "leanne.laceybyrne@eliatra.com" Date: Fri, 11 Oct 2024 11:25:30 +0100 Subject: [PATCH 07/14] small update: adding copy tag for json segment Signed-off-by: leanne.laceybyrne@eliatra.com --- _analyzers/tokenizers/character-group-tokenizer.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index ca3349c89a..1d1dcc7465 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -33,6 +33,7 @@ POST _analyze "text": "Fast-cars: drive fast!" } ``` +{% include copy-curl.html %} By analyzing the text "Fast-cars: drive fast!", we can see the specified characters have been removed: From b20028f2bc420c4ba2a606160d3b8b82aaadded7 Mon Sep 17 00:00:00 2001 From: Melissa Vagi Date: Tue, 15 Oct 2024 17:59:35 -0600 Subject: [PATCH 08/14] Update _analyzers/tokenizers/character-group-tokenizer.md Signed-off-by: Melissa Vagi --- _analyzers/tokenizers/character-group-tokenizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index 1d1dcc7465..d281df9d6b 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -1,6 +1,6 @@ --- layout: default -title: Character Group Tokenizer +title: Character group tokenizer parent: Tokenizers nav_order: 20 has_children: false From 9a8d21fc8032e5575f9b6e0037c6bdecfd7d83ff Mon Sep 17 00:00:00 2001 From: Melissa Vagi Date: Tue, 15 Oct 2024 18:10:31 -0600 Subject: [PATCH 09/14] Update _analyzers/tokenizers/character-group-tokenizer.md Signed-off-by: Melissa Vagi --- _analyzers/tokenizers/character-group-tokenizer.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index d281df9d6b..8a713270e3 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -12,6 +12,7 @@ has_toc: false The character group tokenizer is designed to segment text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a straightforward tokenization approach is required, avoiding the complexity and overhead associated with pattern-based tokenizers. The character group tokenizer accepts the following parameters: + 1. 
`tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. When any character from this set is encountered, a new token is created. For example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. 4. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. From 563ae6b8d887841f790fd19502f1a4d977c5e706 Mon Sep 17 00:00:00 2001 From: Melissa Vagi Date: Tue, 15 Oct 2024 18:10:55 -0600 Subject: [PATCH 10/14] Update _analyzers/tokenizers/character-group-tokenizer.md Signed-off-by: Melissa Vagi --- _analyzers/tokenizers/character-group-tokenizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index 8a713270e3..1c0437a8ff 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -14,7 +14,7 @@ The character group tokenizer is designed to segment text into tokens based on t The character group tokenizer accepts the following parameters: 1. `tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. When any character from this set is encountered, a new token is created. For example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. -4. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. +2. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. ## Example of the character group tokenizer From 0eca713ef0cec394989b4302aab13402573ce5bd Mon Sep 17 00:00:00 2001 From: leanneeliatra <131779422+leanneeliatra@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:51:23 +0100 Subject: [PATCH 11/14] Apply suggestions from code review Co-authored-by: Melissa Vagi Signed-off-by: leanneeliatra <131779422+leanneeliatra@users.noreply.github.com> --- _analyzers/tokenizers/character-group-tokenizer.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md index 1c0437a8ff..f232af0742 100644 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ b/_analyzers/tokenizers/character-group-tokenizer.md @@ -9,16 +9,16 @@ has_toc: false # Character group tokenizer -The character group tokenizer is designed to segment text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a straightforward tokenization approach is required, avoiding the complexity and overhead associated with pattern-based tokenizers. +The character group tokenizer is a simple text segmentation tool that splits text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a simple tokenization method is required, avoiding the complexity and overhead associated with pattern-based tokenizers. The character group tokenizer accepts the following parameters: -1. 
`tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. When any character from this set is encountered, a new token is created. For example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. -2. `max_token_length`: This parameter defines the maximum length allowed for a token. If a token exceeds this specified length, it will be split at intervals defined by `max_token_length`. The default value is `255`. +1. `tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. The tokenizer creates a new token upon encountering any character from the specified set, for example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. +2. `max_token_length`: Defines the token's maximum length. If the token exceeds the specified length, then the tokenizer splits a token at intervals defined by the parameter. Default is `255`. -## Example of the character group tokenizer +## Example: Using the character group tokenizer -We can tokenize the on characters such as `whitespace`, `-` and `:`. +To tokenize the on characters such as `whitespace`, `-` and `:`, see the following example request: ```json POST _analyze @@ -36,7 +36,7 @@ POST _analyze ``` {% include copy-curl.html %} -By analyzing the text "Fast-cars: drive fast!", we can see the specified characters have been removed: +The following response shows that the specified characters have been removed: ``` Fast cars drive fast From 4af10ef04706679478701d58a20ebced26537ea5 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 2 Jan 2025 10:59:39 -0500 Subject: [PATCH 12/14] Doc review Signed-off-by: Fanit Kolchina --- .../tokenizers/character-group-tokenizer.md | 43 ------ _analyzers/tokenizers/character-group.md | 124 ++++++++++++++++++ 2 files changed, 124 insertions(+), 43 deletions(-) delete mode 100644 _analyzers/tokenizers/character-group-tokenizer.md create mode 100644 _analyzers/tokenizers/character-group.md diff --git a/_analyzers/tokenizers/character-group-tokenizer.md b/_analyzers/tokenizers/character-group-tokenizer.md deleted file mode 100644 index f232af0742..0000000000 --- a/_analyzers/tokenizers/character-group-tokenizer.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -layout: default -title: Character group tokenizer -parent: Tokenizers -nav_order: 20 -has_children: false -has_toc: false ---- - -# Character group tokenizer - -The character group tokenizer is a simple text segmentation tool that splits text into tokens based on the presence of specific characters. This tokenizer is ideal for scenarios where a simple tokenization method is required, avoiding the complexity and overhead associated with pattern-based tokenizers. - -The character group tokenizer accepts the following parameters: - -1. `tokenize_on_chars`: Specifies a set of characters on which the text should be tokenized. The tokenizer creates a new token upon encountering any character from the specified set, for example, single characters `(e.g., -, @)` and character classes such as `whitespace`, `letter`, `digit`, `punctuation`, and `symbol`. -2. `max_token_length`: Defines the token's maximum length. If the token exceeds the specified length, then the tokenizer splits a token at intervals defined by the parameter. Default is `255`. 
- -## Example: Using the character group tokenizer - -To tokenize the on characters such as `whitespace`, `-` and `:`, see the following example request: - -```json -POST _analyze -{ - "tokenizer": { - "type": "char_group", - "tokenize_on_chars": [ - "whitespace", - "-", - ":" - ] - }, - "text": "Fast-cars: drive fast!" -} -``` -{% include copy-curl.html %} - -The following response shows that the specified characters have been removed: - -``` -Fast cars drive fast -``` diff --git a/_analyzers/tokenizers/character-group.md b/_analyzers/tokenizers/character-group.md new file mode 100644 index 0000000000..850b450198 --- /dev/null +++ b/_analyzers/tokenizers/character-group.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Character group +parent: Tokenizers +nav_order: 20 +has_children: false +has_toc: false +--- + +# Character group tokenizer + +The `char_group` tokenizer splits text into tokens using specific characters as delimiters. It is suitable for situations requiring straightforward tokenization, offering a simpler alternative to pattern-based tokenizers without the added complexity. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `char_group` tokenizer. The tokenizer splits text on white space, `-`, and `:` characters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_char_group_tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ + "whitespace", + "-", + ":" + ] + } + }, + "analyzer": { + "my_char_group_analyzer": { + "type": "custom", + "tokenizer": "my_char_group_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_char_group_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_char_group_analyzer", + "text": "Fast-driving cars: they drive fast!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "driving", + "start_offset": 5, + "end_offset": 12, + "type": "word", + "position": 1 + }, + { + "token": "cars", + "start_offset": 13, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "they", + "start_offset": 19, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "drive", + "start_offset": 24, + "end_offset": 29, + "type": "word", + "position": 4 + }, + { + "token": "fast!", + "start_offset": 30, + "end_offset": 35, + "type": "word", + "position": 5 + } + ] +} +``` + +## Parameters + +The `char_group` tokenizer can be configured with the following parameters. + +| **Parameter** | **Required/Optional** | **Data type** | **Description** | +| :--- | :--- | :--- | :--- | +| `tokenize_on_chars` | Required | Array | Specifies a set of characters on which the text should be tokenized. You can specify single characters (for example, `-`, `@`) or character classes such as `whitespace`, `letter`, `digit`, `punctuation`, or `symbol`. | +| `max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. 
| \ No newline at end of file From f5f063906565e244e85c39a91c313f01cd64beac Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 2 Jan 2025 11:00:34 -0500 Subject: [PATCH 13/14] Reorder index Signed-off-by: Fanit Kolchina --- _analyzers/tokenizers/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index 6bf6fc1aba..e5ac796c12 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -1,7 +1,7 @@ --- layout: default title: Tokenizers -nav_order: 10 +nav_order: 60 has_children: false has_toc: false redirect_from: From 52eaad871da322f0b501ea969c4e8db084d5ae8f Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 2 Jan 2025 11:45:48 -0500 Subject: [PATCH 14/14] Add escape characters Signed-off-by: Fanit Kolchina --- _analyzers/tokenizers/character-group.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/tokenizers/character-group.md b/_analyzers/tokenizers/character-group.md index 850b450198..56e52780fc 100644 --- a/_analyzers/tokenizers/character-group.md +++ b/_analyzers/tokenizers/character-group.md @@ -120,5 +120,5 @@ The `char_group` tokenizer can be configured with the following parameters. | **Parameter** | **Required/Optional** | **Data type** | **Description** | | :--- | :--- | :--- | :--- | -| `tokenize_on_chars` | Required | Array | Specifies a set of characters on which the text should be tokenized. You can specify single characters (for example, `-`, `@`) or character classes such as `whitespace`, `letter`, `digit`, `punctuation`, or `symbol`. | +| `tokenize_on_chars` | Required | Array | Specifies a set of characters on which the text should be tokenized. You can specify single characters (for example, `-` or `@`), including escape characters (for example, `\n`), or character classes such as `whitespace`, `letter`, `digit`, `punctuation`, or `symbol`. | | `max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. | \ No newline at end of file