Support custom token chars in (edge)ngram tokenizer #4384

Merged (1 commit) on Feb 12, 2020
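This PR adds a CustomTokenChars property to the NGram and EdgeNGram tokenizers, mapping to the custom_token_chars setting introduced in Elasticsearch 7.6.0. As a minimal sketch of how the new fluent method might be used once merged (not part of this diff; the client instance, index name, and tokenizer name are assumptions for illustration):

using Nest;

var client = new ElasticClient();

// Register an edge_ngram tokenizer that also treats '+', '-' and '_' as token
// characters. Requires TokenChar.Custom in TokenChars and Elasticsearch 7.6.0+.
var createIndexResponse = client.Indices.Create("my-index", c => c
    .Settings(s => s
        .Analysis(a => a
            .Tokenizers(t => t
                .EdgeNGram("my_edge_ngram", e => e
                    .MinGram(1)
                    .MaxGram(2)
                    .TokenChars(TokenChar.Letter, TokenChar.Digit, TokenChar.Custom)
                    .CustomTokenChars("+-_")
                )
            )
        )
    )
);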
30 changes: 25 additions & 5 deletions src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs
@@ -29,6 +29,18 @@ public interface IEdgeNGramTokenizer : ITokenizer
/// </summary>
[DataMember(Name ="token_chars")]
IEnumerable<TokenChar> TokenChars { get; set; }

/// <summary>
/// Custom characters that should be treated as part of a token. For example,
/// setting this to +-_ will make the tokenizer treat the plus, minus and
/// underscore signs as part of a token.
/// <para />
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
/// <para />
/// Available in Elasticsearch 7.6.0+.
/// </summary>
[DataMember(Name = "custom_token_chars")]
string CustomTokenChars { get; set; }
}

/// <inheritdoc />
@@ -44,6 +56,9 @@ public class EdgeNGramTokenizer : TokenizerBase, IEdgeNGramTokenizer

/// <inheritdoc />
public IEnumerable<TokenChar> TokenChars { get; set; }

/// <inheritdoc />
public string CustomTokenChars { get; set; }
}

/// <inheritdoc />
@@ -52,22 +67,27 @@ public class EdgeNGramTokenizerDescriptor
{
protected override string Type => "edge_ngram";
int? IEdgeNGramTokenizer.MaxGram { get; set; }

int? IEdgeNGramTokenizer.MinGram { get; set; }
IEnumerable<TokenChar> IEdgeNGramTokenizer.TokenChars { get; set; }

/// <inheritdoc />
string IEdgeNGramTokenizer.CustomTokenChars { get; set; }

/// <inheritdoc cref="IEdgeNGramTokenizer.MinGram" />
public EdgeNGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.MaxGram" />
public EdgeNGramTokenizerDescriptor MaxGram(int? maxGram) => Assign(maxGram, (a, v) => a.MaxGram = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
public EdgeNGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc />
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
public EdgeNGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc cref="IEdgeNGramTokenizer.CustomTokenChars" />
public EdgeNGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
}
}
30 changes: 25 additions & 5 deletions src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs
@@ -29,6 +29,18 @@ public interface INGramTokenizer : ITokenizer
/// </summary>
[DataMember(Name ="token_chars")]
IEnumerable<TokenChar> TokenChars { get; set; }

/// <summary>
/// Custom characters that should be treated as part of a token. For example,
/// setting this to +-_ will make the tokenizer treat the plus, minus and
/// underscore signs as part of a token.
/// <para />
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
/// <para />
/// Available in Elasticsearch 7.6.0+.
/// </summary>
[DataMember(Name = "custom_token_chars")]
string CustomTokenChars { get; set; }
}

/// <inheritdoc />
@@ -44,6 +56,9 @@ public class NGramTokenizer : TokenizerBase, INGramTokenizer

/// <inheritdoc />
public IEnumerable<TokenChar> TokenChars { get; set; }

/// <inheritdoc />
public string CustomTokenChars { get; set; }
}

/// <inheritdoc />
@@ -52,21 +67,26 @@ public class NGramTokenizerDescriptor
{
protected override string Type => "ngram";
int? INGramTokenizer.MaxGram { get; set; }

int? INGramTokenizer.MinGram { get; set; }
IEnumerable<TokenChar> INGramTokenizer.TokenChars { get; set; }

/// <inheritdoc />
string INGramTokenizer.CustomTokenChars { get; set; }

/// <inheritdoc cref="INGramTokenizer.MinGram" />
public NGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.MaxGram" />
public NGramTokenizerDescriptor MaxGram(int? minGram) => Assign(minGram, (a, v) => a.MaxGram = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
public NGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc />
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
public NGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) => Assign(tokenChars, (a, v) => a.TokenChars = v);

/// <inheritdoc cref="INGramTokenizer.CustomTokenChars" />
public NGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
}
}
8 changes: 8 additions & 0 deletions src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs
@@ -20,5 +20,13 @@ public enum TokenChar

[EnumMember(Value = "symbol")]
Symbol,

/// <summary>
/// Custom token characters.
/// <para></para>
/// Available in Elasticsearch 7.6.0+
/// </summary>
[EnumMember(Value = "custom")]
Custom,
}
}
60 changes: 60 additions & 0 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -34,6 +34,36 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
public override string Name => "endgen";
}

[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
{
public override FuncTokenizer Fluent => (n, t) => t.EdgeNGram(n, e => e
.MaxGram(2)
.MinGram(1)
.TokenChars(TokenChar.Custom)
.CustomTokenChars("+-_")
);

public override ITokenizer Initializer => new EdgeNGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Custom },
CustomTokenChars = "+-_"
};

public override object Json => new
{
min_gram = 1,
max_gram = 2,
token_chars = new[] { "custom" },
custom_token_chars = "+-_",
type = "edge_ngram"
};

public override string Name => "endgen_custom";
}

public class NGramTests : TokenizerAssertionBase<NGramTests>
{
public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
@@ -60,6 +90,36 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
public override string Name => "ng";
}

[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
{
public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
.MaxGram(2)
.MinGram(1)
.TokenChars(TokenChar.Custom)
.CustomTokenChars("+-_")
);

public override ITokenizer Initializer => new NGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Custom },
CustomTokenChars = "+-_"
};

public override object Json => new
{
min_gram = 1,
max_gram = 2,
token_chars = new[] { "custom" },
custom_token_chars = "+-_",
type = "ngram"
};

public override string Name => "ngram_custom";
}

public class PathHierarchyTests : TokenizerAssertionBase<PathHierarchyTests>
{
public override FuncTokenizer Fluent => (n, t) => t.PathHierarchy(n, e => e