diff --git a/src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs b/src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs
index 8c85c351451..6f99b5ddff8 100644
--- a/src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs
+++ b/src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs
@@ -29,6 +29,18 @@ public interface IEdgeNGramTokenizer : ITokenizer
 		/// </summary>
 		[DataMember(Name ="token_chars")]
 		IEnumerable<TokenChar> TokenChars { get; set; }
+
+		/// <summary>
+		/// Custom characters that should be treated as part of a token. For example,
+		/// setting this to +-_ will make the tokenizer treat the plus, minus and
+		/// underscore sign as part of a token.
+		/// <para />
+		/// Requires setting <see cref="TokenChar.Custom" /> as part of <see cref="TokenChars" />
+		/// <para />
+		/// Available in Elasticsearch 7.6.0+.
+		/// </summary>
+		[DataMember(Name = "custom_token_chars")]
+		string CustomTokenChars { get; set; }
 	}
 
 	/// <inheritdoc cref="IEdgeNGramTokenizer" />
@@ -44,6 +56,9 @@ public class EdgeNGramTokenizer : TokenizerBase, IEdgeNGramTokenizer
 
 		/// <inheritdoc />
 		public IEnumerable<TokenChar> TokenChars { get; set; }
+
+		/// <inheritdoc />
+		public string CustomTokenChars { get; set; }
 	}
 
 	/// <inheritdoc cref="IEdgeNGramTokenizer" />
@@ -52,22 +67,27 @@ public class EdgeNGramTokenizerDescriptor
 	{
 		protected override string Type => "edge_ngram";
 		int? IEdgeNGramTokenizer.MaxGram { get; set; }
-
 		int? IEdgeNGramTokenizer.MinGram { get; set; }
 		IEnumerable<TokenChar> IEdgeNGramTokenizer.TokenChars { get; set; }
 
-		/// <inheritdoc />
+		string IEdgeNGramTokenizer.CustomTokenChars { get; set; }
+
+		/// <inheritdoc cref="IEdgeNGramTokenizer.MinGram" />
 		public EdgeNGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="IEdgeNGramTokenizer.MaxGram" />
 		public EdgeNGramTokenizerDescriptor MaxGram(int? maxGram) => Assign(maxGram, (a, v) => a.MaxGram = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
 		public EdgeNGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
 			Assign(tokenChars, (a, v) => a.TokenChars = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
 		public EdgeNGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) =>
 			Assign(tokenChars, (a, v) => a.TokenChars = v);
+
+		/// <inheritdoc cref="IEdgeNGramTokenizer.CustomTokenChars" />
+		public EdgeNGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
+			Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
 	}
 }
diff --git a/src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs b/src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs
index 06c62228971..bbfb36cfe7b 100644
--- a/src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs
+++ b/src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs
@@ -29,6 +29,18 @@ public interface INGramTokenizer : ITokenizer
 		/// </summary>
 		[DataMember(Name ="token_chars")]
 		IEnumerable<TokenChar> TokenChars { get; set; }
+
+		/// <summary>
+		/// Custom characters that should be treated as part of a token. For example,
+		/// setting this to +-_ will make the tokenizer treat the plus, minus and
+		/// underscore sign as part of a token.
+		/// <para />
+		/// Requires setting <see cref="TokenChar.Custom" /> as part of <see cref="TokenChars" />
+		/// <para />
+		/// Available in Elasticsearch 7.6.0+.
+		/// </summary>
+		[DataMember(Name = "custom_token_chars")]
+		string CustomTokenChars { get; set; }
 	}
 
 	/// <inheritdoc cref="INGramTokenizer" />
@@ -44,6 +56,9 @@ public class NGramTokenizer : TokenizerBase, INGramTokenizer
 
 		/// <inheritdoc />
 		public IEnumerable<TokenChar> TokenChars { get; set; }
+
+		/// <inheritdoc />
+		public string CustomTokenChars { get; set; }
 	}
 
 	/// <inheritdoc cref="INGramTokenizer" />
@@ -52,21 +67,26 @@ public class NGramTokenizerDescriptor
 	{
 		protected override string Type => "ngram";
 		int? INGramTokenizer.MaxGram { get; set; }
-
 		int? INGramTokenizer.MinGram { get; set; }
 		IEnumerable<TokenChar> INGramTokenizer.TokenChars { get; set; }
 
-		/// <inheritdoc />
+		string INGramTokenizer.CustomTokenChars { get; set; }
+
+		/// <inheritdoc cref="INGramTokenizer.MinGram" />
 		public NGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="INGramTokenizer.MaxGram" />
 		public NGramTokenizerDescriptor MaxGram(int? maxGram) => Assign(maxGram, (a, v) => a.MaxGram = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="INGramTokenizer.TokenChars" />
 		public NGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
 			Assign(tokenChars, (a, v) => a.TokenChars = v);
 
-		/// <inheritdoc />
+		/// <inheritdoc cref="INGramTokenizer.TokenChars" />
 		public NGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) => Assign(tokenChars, (a, v) => a.TokenChars = v);
+
+		/// <inheritdoc cref="INGramTokenizer.CustomTokenChars" />
+		public NGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
+			Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
 	}
 }
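Usage note: the new CustomTokenChars property on both tokenizers maps to Elasticsearch's custom_token_chars parameter and only takes effect when TokenChar.Custom is included in TokenChars. Below is a minimal sketch of the fluent API added above, assuming a connected ElasticClient; the index and tokenizer names are illustrative, not part of this change.

var client = new ElasticClient();

// Define an edge_ngram tokenizer that keeps '+', '-' and '_' inside tokens,
// in addition to letters and digits, via the new CustomTokenChars property.
var createIndexResponse = client.Indices.Create("example-index", c => c
	.Settings(s => s
		.Analysis(a => a
			.Tokenizers(t => t
				.EdgeNGram("my_edge_ngram", e => e
					.MinGram(1)
					.MaxGram(2)
					.TokenChars(TokenChar.Letter, TokenChar.Digit, TokenChar.Custom)
					.CustomTokenChars("+-_")
				)
			)
		)
	)
);

Serialized, this produces the custom_token_chars setting that the tests further below assert.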
diff --git a/src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs b/src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs
index aa20a9d4e5d..c4b898278c8 100644
--- a/src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs
+++ b/src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs
@@ -20,5 +20,13 @@ public enum TokenChar
 
 		[EnumMember(Value = "symbol")]
 		Symbol,
+
+		/// <summary>
+		/// Custom token characters.
+		/// <para />
+		/// Available in Elasticsearch 7.6.0+
+		/// </summary>
+		[EnumMember(Value = "custom")]
+		Custom,
 	}
 }
diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index 53a4b00c0a9..7e9711a8c75 100644
--- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -34,6 +34,36 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
 			public override string Name => "endgen";
 		}
 
+		[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
+		public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
+		{
+			public override FuncTokenizer Fluent => (n, t) => t.EdgeNGram(n, e => e
+				.MaxGram(2)
+				.MinGram(1)
+				.TokenChars(TokenChar.Custom)
+				.CustomTokenChars("+-_")
+			);
+
+			public override ITokenizer Initializer => new EdgeNGramTokenizer
+			{
+				MaxGram = 2,
+				MinGram = 1,
+				TokenChars = new[] { TokenChar.Custom },
+				CustomTokenChars = "+-_"
+			};
+
+			public override object Json => new
+			{
+				min_gram = 1,
+				max_gram = 2,
+				token_chars = new[] { "custom" },
+				custom_token_chars = "+-_",
+				type = "edge_ngram"
+			};
+
+			public override string Name => "endgen_custom";
+		}
+
 		public class NGramTests : TokenizerAssertionBase<NGramTests>
 		{
 			public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
@@ -60,6 +90,36 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
 			public override string Name => "ng";
 		}
 
+		[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
+		public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
+		{
+			public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
+				.MaxGram(2)
+				.MinGram(1)
+				.TokenChars(TokenChar.Custom)
+				.CustomTokenChars("+-_")
+			);
+
+			public override ITokenizer Initializer => new NGramTokenizer
+			{
+				MaxGram = 2,
+				MinGram = 1,
+				TokenChars = new[] { TokenChar.Custom },
+				CustomTokenChars = "+-_"
+			};
+
+			public override object Json => new
+			{
+				min_gram = 1,
+				max_gram = 2,
+				token_chars = new[] { "custom" },
+				custom_token_chars = "+-_",
+				type = "ngram"
+			};
+
+			public override string Name => "ngram_custom";
+		}
+
 		public class PathHierarchyTests : TokenizerAssertionBase<PathHierarchyTests>
 		{
 			public override FuncTokenizer Fluent => (n, t) => t.PathHierarchy(n, e => e
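The tests above assert serialization only; end to end, the effect of the new setting could be observed with the analyze API. A sketch reusing the hypothetical client, index and tokenizer names from the earlier example:

var analyzeResponse = client.Indices.Analyze(a => a
	.Index("example-index")
	.Tokenizer("my_edge_ngram")
	.Text("c++")
);

// With '+' declared as a custom token character, "c++" yields the edge
// n-grams "c" and "c+"; without it, the tokenizer splits on '+' and emits only "c".
foreach (var token in analyzeResponse.Tokens)
	Console.WriteLine(token.Token);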