From 00d5965223a83607eb2e7c00d5aa8e97f1e4356b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Fri, 10 Jan 2020 15:36:24 +0100 Subject: [PATCH 1/5] Deprecate and remove camel-case nGram and edgeNGram tokenizers We already deprecated and removed the camel-case versions of the nGram and edgeNGram filters a while ago and we should do the same with the nGram and edgeNGram tokenizers. This PR deprecates the use of these names in favour of ngram and edge_ngram in 7 and disallows usage in new indices starting with 8. The deprecation part will be backported to 7.6. Closes #50561 --- .../migration/migrate_8_0/mappings.asciidoc | 9 ++ .../analysis/common/CommonAnalysisPlugin.java | 44 +++++++++- .../common/CommonAnalysisPluginTests.java | 84 +++++++++++++++++++ 3 files changed, 134 insertions(+), 3 deletions(-) diff --git a/docs/reference/migration/migrate_8_0/mappings.asciidoc b/docs/reference/migration/migrate_8_0/mappings.asciidoc index 8c6fa75aad5e1..8f9df5e9858af 100644 --- a/docs/reference/migration/migrate_8_0/mappings.asciidoc +++ b/docs/reference/migration/migrate_8_0/mappings.asciidoc @@ -39,3 +39,12 @@ The setting has been deprecated with 7.5 and is no longer supported on new indic Mappings for older indices will continue to work but emit a deprecation warning. The `enabled` setting for `_field_names` should be removed from templates and mappings. Disabling _field_names is not necessary because it no longer carries a large index overhead. + +[float] +[[nGram-edgeNGram-dreprecation]] +==== Disallow use of the `nGram` and `edgeNGram` tokenizer names + +The `nGram` and `edgeNGram` tokenizer names haven been deprecated with 7.6 and are no longer +supported on new indices. Mappings for indices created after 7.6 will continue to work but +emit a deprecation warning. The tokenizer name should be changed to the fully equivalent +`ngram` or `edge_ngram` names for new indices and in index templates. diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index b9de97952eb4b..ccfffbb85e7ec 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -337,9 +337,29 @@ public Map> getTokenizers() { tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new); tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new); tokenizers.put("thai", ThaiTokenizerFactory::new); - tokenizers.put("nGram", NGramTokenizerFactory::new); + tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { + if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) { + throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. " + + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead."); + } else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) { + deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation", + "The [nGram] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [ngram] instead."); + } + return new NGramTokenizerFactory(indexSettings, environment, name, settings); + }); tokenizers.put("ngram", NGramTokenizerFactory::new); - tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new); + tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { + if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) { + throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. " + + "Please use the tokenizer name to [edge_nGram] for indices created in versions 8 or higher instead."); + } else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) { + deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation", + "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [edge_ngram] instead."); + } + return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings); + }); tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new); tokenizers.put("char_group", CharGroupTokenizerFactory::new); tokenizers.put("classic", ClassicTokenizerFactory::new); @@ -522,8 +542,26 @@ public List getPreConfiguredTokenizers() { tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new)); // Temporary shim for aliases. TODO deprecate after they are moved - tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new)); + tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> { + if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) { + throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. " + + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead."); + } else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) { + deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation", + "The [nGram] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [ngram] instead."); + } + return new NGramTokenizer(); + })); tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> { + if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) { + throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. " + + "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead."); + } else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) { + deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation", + "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [edge_ngram] instead."); + } if (version.onOrAfter(Version.V_7_3_0)) { return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java index 90190e42f2f85..7bed1a1d09a84 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java @@ -26,6 +26,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; @@ -126,4 +127,87 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep + "Please change the filter name to [edge_ngram] instead."); } } + + /** + * Check that we log a deprecation warning for "nGram" and "edgeNGram" tokenizer names with 7.6 and + * disallow usages for indices created after 8.0 + */ + public void testNGramTokenizerDeprecation() throws IOException { + // tests for prebuilt tokenizer + doTestPrebuiltTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false); + doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false); + doTestPrebuiltTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_6_0, + Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), + true); + doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_6_0, + Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true); + expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); + expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); + + // same batch of tests for custom tokenizer definition in the settings + doTestCustomTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false); + doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false); + doTestCustomTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_6_0, + Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), + true); + doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_7_6_0, + Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true); + expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("nGram", "ngram", + VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); + expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", + VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); + + } + + public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning) + throws IOException { + final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); + + try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { + Map tokenizers = createTestAnalysis( + IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer; + TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName); + + Tokenizer tokenizer = tokenizerFactory.create(); + assertNotNull(tokenizer); + if (expectWarning) { + assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [" + replacement + "] instead."); + } + } + } + + public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning) + throws IOException { + final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) + .put(IndexMetaData.SETTING_VERSION_CREATED, version) + .put("index.analysis.analyzer.custom_analyzer.type", "custom") + .put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer") + .put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName) + .build(); + + try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { + Map tokenizers = createTestAnalysis( + IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer; + TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName); + + Tokenizer tokenizer = tokenizerFactory.create(); + assertNotNull(tokenizer); + if (expectWarning) { + assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [" + replacement + "] instead."); + } + } + } } From fdf34e3307ceb2273194e4548dfec33395223a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Fri, 10 Jan 2020 18:16:17 +0100 Subject: [PATCH 2/5] Adapt EdgeNGramTokenizerTests --- .../analysis/common/EdgeNGramTokenizerTests.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java index 95bf41f8e92c9..f1a1bdc466beb 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java @@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException { } } - // Check deprecated name as well + // Check deprecated name as well, needs version before 8.0 because throws IAE after that { - try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) { + try (IndexAnalyzers indexAnalyzers = buildAnalyzers( + VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.CURRENT)), + "edgeNGram")) { NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); assertNotNull(analyzer); assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"}); From 53e25dfbc966de0f8700ced7ea46f085d2af2c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 14 Jan 2020 10:06:13 +0100 Subject: [PATCH 3/5] iter --- .../elasticsearch/analysis/common/EdgeNGramTokenizerTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java index f1a1bdc466beb..e1fbe90647e4b 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java @@ -89,7 +89,7 @@ public void testPreConfiguredTokenizer() throws IOException { // Check deprecated name as well, needs version before 8.0 because throws IAE after that { try (IndexAnalyzers indexAnalyzers = buildAnalyzers( - VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.CURRENT)), + VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.V_8_0_0)), "edgeNGram")) { NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); assertNotNull(analyzer); From bf9f8611c74ab4624d8caeadff17c8dffe432458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 14 Jan 2020 16:08:08 +0100 Subject: [PATCH 4/5] iter --- .../analysis/common/CommonAnalysisPlugin.java | 2 +- .../analysis/common/CommonAnalysisPluginTests.java | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index ccfffbb85e7ec..3c614f84d4e23 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -558,7 +558,7 @@ public List getPreConfiguredTokenizers() { throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. " + "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead."); } else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) { - deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation", + deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation", "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. " + "Please change the tokenizer name to [edge_ngram] instead."); } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java index 764c1c5695fa4..03200d1966ba3 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java @@ -19,15 +19,18 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.Tokenizer; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; import java.io.IOException; +import java.util.Map; public class CommonAnalysisPluginTests extends ESTestCase { @@ -141,7 +144,6 @@ public void testNGramTokenizerDeprecation() throws IOException { VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true)); - } public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning) @@ -173,12 +175,8 @@ public void doTestCustomTokenizerDeprecation(String deprecatedName, String repla .build(); try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { - Map tokenizers = createTestAnalysis( - IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer; - TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName); + createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin); - Tokenizer tokenizer = tokenizerFactory.create(); - assertNotNull(tokenizer); if (expectWarning) { assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. " + "Please change the tokenizer name to [" + replacement + "] instead."); From 1369e17cfe333fffb2db935549a9aa7455e69a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 14 Jan 2020 16:28:34 +0100 Subject: [PATCH 5/5] Moving migration note from mappings -> analysis --- docs/reference/migration/migrate_8_0/analysis.asciidoc | 9 +++++++++ docs/reference/migration/migrate_8_0/mappings.asciidoc | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/reference/migration/migrate_8_0/analysis.asciidoc b/docs/reference/migration/migrate_8_0/analysis.asciidoc index dd4b4ae73e920..6a7205436d0f6 100644 --- a/docs/reference/migration/migrate_8_0/analysis.asciidoc +++ b/docs/reference/migration/migrate_8_0/analysis.asciidoc @@ -16,3 +16,12 @@ The `nGram` and `edgeNGram` token filter names that have been deprecated since version 6.4 have been removed. Both token filters can only be used by their alternative names `ngram` and `edge_ngram` since version 7.0. + +[float] +[[nGram-edgeNGram-tokenizer-dreprecation]] +==== Disallow use of the `nGram` and `edgeNGram` tokenizer names + +The `nGram` and `edgeNGram` tokenizer names haven been deprecated with 7.6 and are no longer +supported on new indices. Mappings for indices created after 7.6 will continue to work but +emit a deprecation warning. The tokenizer name should be changed to the fully equivalent +`ngram` or `edge_ngram` names for new indices and in index templates. diff --git a/docs/reference/migration/migrate_8_0/mappings.asciidoc b/docs/reference/migration/migrate_8_0/mappings.asciidoc index 8f9df5e9858af..8c6fa75aad5e1 100644 --- a/docs/reference/migration/migrate_8_0/mappings.asciidoc +++ b/docs/reference/migration/migrate_8_0/mappings.asciidoc @@ -39,12 +39,3 @@ The setting has been deprecated with 7.5 and is no longer supported on new indic Mappings for older indices will continue to work but emit a deprecation warning. The `enabled` setting for `_field_names` should be removed from templates and mappings. Disabling _field_names is not necessary because it no longer carries a large index overhead. - -[float] -[[nGram-edgeNGram-dreprecation]] -==== Disallow use of the `nGram` and `edgeNGram` tokenizer names - -The `nGram` and `edgeNGram` tokenizer names haven been deprecated with 7.6 and are no longer -supported on new indices. Mappings for indices created after 7.6 will continue to work but -emit a deprecation warning. The tokenizer name should be changed to the fully equivalent -`ngram` or `edge_ngram` names for new indices and in index templates.