From 79933d796b167159f4f3934aef3b49697ee91da4 Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Wed, 9 May 2018 09:23:10 -0400
Subject: [PATCH] Docs: Test examples that recreate lang analyzers (#29535)

We have a pile of documentation describing how to rebuild the built in
language analyzers and, previously, our documentation testing framework
made sure that the examples successfully built *an* analyzer but didn't
assert that the analyzer built by the documentation matches the built
in analyzer. Unsurprisingly, some of the examples aren't quite right.

This adds a mechanism that tests that the analyzers built by the docs
match the built in analyzers. The mechanism is fairly simple and brutal
but it seems to be working: build a hundred random unicode sequences
and send them through the `_analyze` API with the rebuilt analyzer and
then again through the built in analyzer. Then make sure both APIs
return the same results. Each of these calls to `_analyze` takes about
20ms on my laptop which seems fine.
---
 .../doc/RestTestsFromSnippetsTask.groovy      |  11 +-
 docs/README.asciidoc                          |  17 ++
 docs/build.gradle                             |   2 +
 .../analysis/analyzers/lang-analyzer.asciidoc | 168 +++++++++++++-----
 .../smoketest/DocsClientYamlTestSuiteIT.java  | 143 ++++++++++++++-
 .../test/rest/yaml/ClientYamlTestClient.java  |  27 ++-
 .../rest/yaml/ESClientYamlSuiteTestCase.java  |  15 +-
 .../yaml/section/ClientYamlTestSuite.java     |   5 +-
 .../rest/yaml/section/ExecutableSection.java  |  15 +-
 9 files changed, 345 insertions(+), 58 deletions(-)

diff --git a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/RestTestsFromSnippetsTask.groovy b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/RestTestsFromSnippetsTask.groovy
index 95ec00beca7e0..15a4f21b17543 100644
--- a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/RestTestsFromSnippetsTask.groovy
+++ b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/RestTestsFromSnippetsTask.groovy
@@ -141,9 +141,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
     private static final String SYNTAX = {
         String method = /(?<method>GET|PUT|POST|HEAD|OPTIONS|DELETE)/
         String pathAndQuery = /(?<pathAndQuery>[^\n]+)/
-        String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|#/
+        String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|startyaml|#/
         String body = /(?<body>(?:\n(?!$badBody)[^\n]+)+)/
-        String nonComment = /$method\s+$pathAndQuery$body?/
+        String rawRequest = /(?:$method\s+$pathAndQuery$body?)/
+        String yamlRequest = /(?:startyaml(?s)(?<yaml>.+?)(?-s)endyaml)/
+        String nonComment = /(?:$rawRequest|$yamlRequest)/
         String comment = /(?<comment>#.+)/
         /(?:$comment|$nonComment)\n+/
     }()
@@ -333,6 +335,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
             // Comment
             return
         }
+        String yamlRequest = matcher.group("yaml");
+        if (yamlRequest != null) {
+            current.println(yamlRequest)
+            return
+        }
         String method = matcher.group("method")
         String pathAndQuery = matcher.group("pathAndQuery")
         String body = matcher.group("body")
diff --git a/docs/README.asciidoc b/docs/README.asciidoc
index f76682b4ce346..ef3b520e44e1b 100644
--- a/docs/README.asciidoc
+++ b/docs/README.asciidoc
@@ -68,6 +68,23 @@ for its modifiers:
   but rather than the setup defined in `docs/build.gradle` the setup is defined
   right in the documentation file.
 
+In addition to the standard CONSOLE syntax these snippets can contain blocks
+of yaml surrounded by markers like this:
+
+```
+startyaml
+ - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}
+endyaml
+```
+
+This allows slightly more expressive testing of the snippets.
Since that syntax +is not supported by CONSOLE the usual way to incorporate it is with a +`// TEST[s//]` marker like this: + +``` +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/] +``` + Any place you can use json you can use elements like `$body.path.to.thing` which is replaced on the fly with the contents of the thing at `path.to.thing` in the last response. diff --git a/docs/build.gradle b/docs/build.gradle index dbc112c48176c..c6ded0292bc92 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -60,6 +60,8 @@ buildRestTests.docs = fileTree(projectDir) { exclude 'build.gradle' // That is where the snippets go, not where they come from! exclude 'build' + // Just syntax examples + exclude 'README.asciidoc' } Closure setupTwitter = { String name, int count -> diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc index cb976601fdcbe..d718a0b2da6ff 100644 --- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc @@ -97,10 +97,11 @@ PUT /arabic_example } }, "analyzer": { - "arabic": { + "rebuilt_arabic": { "tokenizer": "standard", "filter": [ "lowercase", + "decimal_digit", "arabic_stop", "arabic_normalization", "arabic_keywords", @@ -113,6 +114,8 @@ PUT /arabic_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"arabic_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: arabic_example, first: arabic, second: rebuilt_arabic}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -144,7 +147,7 @@ PUT /armenian_example } }, "analyzer": { - "armenian": { + "rebuilt_armenian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -159,6 +162,8 @@ PUT /armenian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"armenian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: armenian_example, first: armenian, second: rebuilt_armenian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -190,7 +195,7 @@ PUT /basque_example } }, "analyzer": { - "basque": { + "rebuilt_basque": { "tokenizer": "standard", "filter": [ "lowercase", @@ -205,6 +210,8 @@ PUT /basque_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"basque_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: basque_example, first: basque, second: rebuilt_basque}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should @@ -236,14 +243,15 @@ PUT /bengali_example } }, "analyzer": { - "bengali": { + "rebuilt_bengali": { "tokenizer": "standard", "filter": [ "lowercase", + "decimal_digit", + "bengali_keywords", "indic_normalization", "bengali_normalization", "bengali_stop", - "bengali_keywords", "bengali_stemmer" ] } @@ -253,6 +261,8 @@ PUT /bengali_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"bengali_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: bengali_example, first: bengali, second: rebuilt_bengali}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -284,7 +294,7 @@ PUT /brazilian_example } }, "analyzer": { - "brazilian": { + "rebuilt_brazilian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -299,6 +309,8 @@ PUT /brazilian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"brazilian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: brazilian_example, first: brazilian, second: rebuilt_brazilian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -330,7 +342,7 @@ PUT /bulgarian_example } }, "analyzer": { - "bulgarian": { + "rebuilt_bulgarian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -345,6 +357,8 @@ PUT /bulgarian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"bulgarian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: bulgarian_example, first: bulgarian, second: rebuilt_bulgarian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -380,7 +394,7 @@ PUT /catalan_example } }, "analyzer": { - "catalan": { + "rebuilt_catalan": { "tokenizer": "standard", "filter": [ "catalan_elision", @@ -396,6 +410,8 @@ PUT /catalan_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"catalan_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: catalan_example, first: catalan, second: rebuilt_catalan}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -415,11 +431,17 @@ PUT /cjk_example "filter": { "english_stop": { "type": "stop", - "stopwords": "_english_" <1> + "stopwords": [ <1> + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] } }, "analyzer": { - "cjk": { + "rebuilt_cjk": { "tokenizer": "standard", "filter": [ "cjk_width", @@ -434,8 +456,12 @@ PUT /cjk_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"cjk_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: cjk_example, first: cjk, second: rebuilt_cjk}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` - or `stopwords_path` parameters. + or `stopwords_path` parameters. 
The default stop words are + *almost* the same as the `_english_` set, but not exactly + the same. [[czech-analyzer]] ===== `czech` analyzer @@ -463,7 +489,7 @@ PUT /czech_example } }, "analyzer": { - "czech": { + "rebuilt_czech": { "tokenizer": "standard", "filter": [ "lowercase", @@ -478,6 +504,8 @@ PUT /czech_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"czech_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: czech_example, first: czech, second: rebuilt_czech}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -509,7 +537,7 @@ PUT /danish_example } }, "analyzer": { - "danish": { + "rebuilt_danish": { "tokenizer": "standard", "filter": [ "lowercase", @@ -524,6 +552,8 @@ PUT /danish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"danish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: danish_example, first: danish, second: rebuilt_danish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -564,7 +594,7 @@ PUT /dutch_example } }, "analyzer": { - "dutch": { + "rebuilt_dutch": { "tokenizer": "standard", "filter": [ "lowercase", @@ -580,6 +610,8 @@ PUT /dutch_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"dutch_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: dutch_example, first: dutch, second: rebuilt_dutch}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -615,7 +647,7 @@ PUT /english_example } }, "analyzer": { - "english": { + "rebuilt_english": { "tokenizer": "standard", "filter": [ "english_possessive_stemmer", @@ -631,6 +663,8 @@ PUT /english_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"english_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -662,7 +696,7 @@ PUT /finnish_example } }, "analyzer": { - "finnish": { + "rebuilt_finnish": { "tokenizer": "standard", "filter": [ "lowercase", @@ -677,6 +711,8 @@ PUT /finnish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"finnish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: finnish_example, first: finnish, second: rebuilt_finnish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -717,7 +753,7 @@ PUT /french_example } }, "analyzer": { - "french": { + "rebuilt_french": { "tokenizer": "standard", "filter": [ "french_elision", @@ -733,6 +769,8 @@ PUT /french_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"french_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: french_example, first: french, second: rebuilt_french}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should @@ -764,7 +802,7 @@ PUT /galician_example } }, "analyzer": { - "galician": { + "rebuilt_galician": { "tokenizer": "standard", "filter": [ "lowercase", @@ -779,6 +817,8 @@ PUT /galician_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"galician_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: galician_example, first: galician, second: rebuilt_galician}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -810,7 +850,7 @@ PUT /german_example } }, "analyzer": { - "german": { + "rebuilt_german": { "tokenizer": "standard", "filter": [ "lowercase", @@ -826,6 +866,8 @@ PUT /german_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"german_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: german_example, first: german, second: rebuilt_german}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -861,7 +903,7 @@ PUT /greek_example } }, "analyzer": { - "greek": { + "rebuilt_greek": { "tokenizer": "standard", "filter": [ "greek_lowercase", @@ -876,6 +918,8 @@ PUT /greek_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"greek_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: greek_example, first: greek, second: rebuilt_greek}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -907,14 +951,15 @@ PUT /hindi_example } }, "analyzer": { - "hindi": { + "rebuilt_hindi": { "tokenizer": "standard", "filter": [ "lowercase", + "decimal_digit", + "hindi_keywords", "indic_normalization", "hindi_normalization", "hindi_stop", - "hindi_keywords", "hindi_stemmer" ] } @@ -924,6 +969,8 @@ PUT /hindi_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"hindi_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: hindi_example, first: hindi, second: rebuilt_hindi}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -955,7 +1002,7 @@ PUT /hungarian_example } }, "analyzer": { - "hungarian": { + "rebuilt_hungarian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -970,6 +1017,8 @@ PUT /hungarian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"hungarian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: hungarian_example, first: hungarian, second: rebuilt_hungarian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should @@ -1002,7 +1051,7 @@ PUT /indonesian_example } }, "analyzer": { - "indonesian": { + "rebuilt_indonesian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1017,6 +1066,8 @@ PUT /indonesian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"indonesian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: indonesian_example, first: indonesian, second: rebuilt_indonesian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1034,9 +1085,15 @@ PUT /irish_example "settings": { "analysis": { "filter": { + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, "irish_elision": { "type": "elision", - "articles": [ "h", "n", "t" ] + "articles": [ "d", "m", "b" ], + "articles_case": true }, "irish_stop": { "type": "stop", @@ -1056,12 +1113,13 @@ PUT /irish_example } }, "analyzer": { - "irish": { + "rebuilt_irish": { "tokenizer": "standard", "filter": [ - "irish_stop", + "irish_hyphenation", "irish_elision", "irish_lowercase", + "irish_stop", "irish_keywords", "irish_stemmer" ] @@ -1072,6 +1130,8 @@ PUT /irish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"irish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: irish_example, first: irish, second: rebuilt_irish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1112,7 +1172,7 @@ PUT /italian_example } }, "analyzer": { - "italian": { + "rebuilt_italian": { "tokenizer": "standard", "filter": [ "italian_elision", @@ -1128,6 +1188,8 @@ PUT /italian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"italian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: italian_example, first: italian, second: rebuilt_italian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1159,7 +1221,7 @@ PUT /latvian_example } }, "analyzer": { - "latvian": { + "rebuilt_latvian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1174,6 +1236,8 @@ PUT /latvian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"latvian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: latvian_example, first: latvian, second: rebuilt_latvian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1205,7 +1269,7 @@ PUT /lithuanian_example } }, "analyzer": { - "lithuanian": { + "rebuilt_lithuanian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1220,6 +1284,8 @@ PUT /lithuanian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"lithuanian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: lithuanian_example, first: lithuanian, second: rebuilt_lithuanian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should @@ -1251,7 +1317,7 @@ PUT /norwegian_example } }, "analyzer": { - "norwegian": { + "rebuilt_norwegian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1266,6 +1332,8 @@ PUT /norwegian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"norwegian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: norwegian_example, first: norwegian, second: rebuilt_norwegian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1295,11 +1363,12 @@ PUT /persian_example } }, "analyzer": { - "persian": { + "rebuilt_persian": { "tokenizer": "standard", "char_filter": [ "zero_width_spaces" ], "filter": [ "lowercase", + "decimal_digit", "arabic_normalization", "persian_normalization", "persian_stop" @@ -1311,6 +1380,7 @@ PUT /persian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: persian_example, first: persian, second: rebuilt_persian}\nendyaml\n/] <1> Replaces zero-width non-joiners with an ASCII space. <2> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. @@ -1341,7 +1411,7 @@ PUT /portuguese_example } }, "analyzer": { - "portuguese": { + "rebuilt_portuguese": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1356,6 +1426,8 @@ PUT /portuguese_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"portuguese_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: portuguese_example, first: portuguese, second: rebuilt_portuguese}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1387,7 +1459,7 @@ PUT /romanian_example } }, "analyzer": { - "romanian": { + "rebuilt_romanian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1402,6 +1474,8 @@ PUT /romanian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"romanian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: romanian_example, first: romanian, second: rebuilt_romanian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1434,7 +1508,7 @@ PUT /russian_example } }, "analyzer": { - "russian": { + "rebuilt_russian": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1449,6 +1523,8 @@ PUT /russian_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"russian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: russian_example, first: russian, second: rebuilt_russian}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should @@ -1480,11 +1556,12 @@ PUT /sorani_example } }, "analyzer": { - "sorani": { + "rebuilt_sorani": { "tokenizer": "standard", "filter": [ "sorani_normalization", "lowercase", + "decimal_digit", "sorani_stop", "sorani_keywords", "sorani_stemmer" @@ -1496,6 +1573,8 @@ PUT /sorani_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"sorani_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: sorani_example, first: sorani, second: rebuilt_sorani}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1527,7 +1606,7 @@ PUT /spanish_example } }, "analyzer": { - "spanish": { + "rebuilt_spanish": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1542,6 +1621,8 @@ PUT /spanish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"spanish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: spanish_example, first: spanish, second: rebuilt_spanish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1573,7 +1654,7 @@ PUT /swedish_example } }, "analyzer": { - "swedish": { + "rebuilt_swedish": { "tokenizer": "standard", "filter": [ "lowercase", @@ -1588,6 +1669,8 @@ PUT /swedish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"swedish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: swedish_example, first: swedish, second: rebuilt_swedish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1623,7 +1706,7 @@ PUT /turkish_example } }, "analyzer": { - "turkish": { + "rebuilt_turkish": { "tokenizer": "standard", "filter": [ "apostrophe", @@ -1639,6 +1722,8 @@ PUT /turkish_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"turkish_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: turkish_example, first: turkish, second: rebuilt_turkish}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. <2> This filter should be removed unless there are words which should @@ -1662,10 +1747,11 @@ PUT /thai_example } }, "analyzer": { - "thai": { + "rebuilt_thai": { "tokenizer": "thai", "filter": [ "lowercase", + "decimal_digit", "thai_stop" ] } @@ -1675,5 +1761,7 @@ PUT /thai_example } ---------------------------------------------------- // CONSOLE +// TEST[s/"thai_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/] <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
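
Each of the `compare_analyzers` markers above expands into the same kind of check. For anyone wanting to spot-check one of these rebuilt analyzers by hand, the comparison boils down to running the same text through both analyzers with the `_analyze` API and diffing the token lists; the harness does this with a hundred random unicode strings rather than fixed text. A sketch for the `thai` example (the sample text is arbitrary):

```
GET /thai_example/_analyze
{
  "analyzer": "thai",
  "text": "การทดสอบ 1234"
}

GET /thai_example/_analyze
{
  "analyzer": "rebuilt_thai",
  "text": "การทดสอบ 1234"
}
```

If the two token streams differ, for example because `decimal_digit` was dropped from the rebuilt version, the new test mechanism fails on the first differing token.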
diff --git a/docs/src/test/java/org/elasticsearch/smoketest/DocsClientYamlTestSuiteIT.java b/docs/src/test/java/org/elasticsearch/smoketest/DocsClientYamlTestSuiteIT.java
index e6f6fa7079603..af2377a17ffc2 100644
--- a/docs/src/test/java/org/elasticsearch/smoketest/DocsClientYamlTestSuiteIT.java
+++ b/docs/src/test/java/org/elasticsearch/smoketest/DocsClientYamlTestSuiteIT.java
@@ -19,19 +19,41 @@
 
 package org.elasticsearch.smoketest;
 
+import org.apache.http.HttpHost;
+import org.apache.lucene.util.BytesRef;
+
 import com.carrotsearch.randomizedtesting.annotations.Name;
 import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
 import org.elasticsearch.Version;
 import org.elasticsearch.client.RestClient;
-import org.apache.http.HttpHost;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentLocation;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
 import org.elasticsearch.test.rest.yaml.ClientYamlDocsTestClient;
 import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
 import org.elasticsearch.test.rest.yaml.ClientYamlTestClient;
+import org.elasticsearch.test.rest.yaml.ClientYamlTestExecutionContext;
+import org.elasticsearch.test.rest.yaml.ClientYamlTestResponse;
 import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
+import org.elasticsearch.test.rest.yaml.section.ExecutableSection;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
+
+import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
 
 public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
 
@@ -41,7 +63,12 @@ public DocsClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
 
     @ParametersFactory
     public static Iterable<Object[]> parameters() throws Exception {
-        return ESClientYamlSuiteTestCase.createParameters();
+        List<NamedXContentRegistry.Entry> entries = new ArrayList<>(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS.size() + 1);
+        entries.addAll(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS);
+        entries.add(new NamedXContentRegistry.Entry(ExecutableSection.class,
+                new ParseField("compare_analyzers"), CompareAnalyzers::parse));
+        NamedXContentRegistry executableSectionRegistry = new NamedXContentRegistry(entries);
+        return ESClientYamlSuiteTestCase.createParameters(executableSectionRegistry);
     }
 
     @Override
@@ -64,5 +91,117 @@ protected ClientYamlTestClient initClientYamlTestClient(ClientYamlSuiteRestSpec
                                                             List<HttpHost> hosts, Version esVersion) throws IOException {
         return new ClientYamlDocsTestClient(restSpec, restClient, hosts, esVersion);
     }
-}
+
+    /**
+     * Compares the results of running two analyzers against many random
+     * strings. The goal is to figure out if two analyzers are "the same" by
+     * comparing their results. This is far from perfect but should be fairly
+     * accurate, especially for gross things like missing {@code decimal_digit}
+     * token filters, and should be fairly fast because it compares a fairly
+     * small number of tokens.
+     */
+    private static class CompareAnalyzers implements ExecutableSection {
+        private static ConstructingObjectParser<CompareAnalyzers, XContentLocation> PARSER =
+            new ConstructingObjectParser<>("test_analyzer", false, (a, location) -> {
+                String index = (String) a[0];
+                String first = (String) a[1];
+                String second = (String) a[2];
+                return new CompareAnalyzers(location, index, first, second);
+            });
+        static {
+            PARSER.declareString(constructorArg(), new ParseField("index"));
+            PARSER.declareString(constructorArg(), new ParseField("first"));
+            PARSER.declareString(constructorArg(), new ParseField("second"));
+        }
+        private static CompareAnalyzers parse(XContentParser parser) throws IOException {
+            XContentLocation location = parser.getTokenLocation();
+            CompareAnalyzers section = PARSER.parse(parser, location);
+            assert parser.currentToken() == Token.END_OBJECT : "End of object required";
+            parser.nextToken(); // throw out the END_OBJECT to conform with other ExecutableSections
+            return section;
+        }
+
+        private final XContentLocation location;
+        private final String index;
+        private final String first;
+        private final String second;
+
+        private CompareAnalyzers(XContentLocation location, String index, String first, String second) {
+            this.location = location;
+            this.index = index;
+            this.first = first;
+            this.second = second;
+        }
+
+        @Override
+        public XContentLocation getLocation() {
+            return location;
+        }
+
+        @Override
+        public void execute(ClientYamlTestExecutionContext executionContext) throws IOException {
+            int size = 100;
+            int maxLength = 15;
+            List<String> testText = new ArrayList<>(size);
+            for (int i = 0; i < size; i++) {
+                /**
+                 * Build a string with a few unicode sequences separated by
+                 * spaces. The unicode sequences aren't going to be of the same
+                 * code page which is a shame because it makes the entire
+                 * string less realistic. But this still provides a fairly
+                 * nice string to compare.
+                 */
+                int spaces = between(0, 5);
+                StringBuilder b = new StringBuilder((spaces + 1) * maxLength);
+                b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
+                for (int t = 0; t < spaces; t++) {
+                    b.append(' ');
+                    b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
+                }
+                testText.add(b.toString()
+                    // Don't look up stashed values
+                    .replace("$", "\\$"));
+            }
+            Map<String, Object> body = new HashMap<>(2);
+            body.put("analyzer", first);
+            body.put("text", testText);
+            ClientYamlTestResponse response = executionContext.callApi("indices.analyze", singletonMap("index", index),
+                singletonList(body), emptyMap());
+            Iterator<?> firstTokens = ((List<?>) response.evaluate("tokens")).iterator();
+            body.put("analyzer", second);
+            response = executionContext.callApi("indices.analyze", singletonMap("index", index),
+                singletonList(body), emptyMap());
+            Iterator<?> secondTokens = ((List<?>) response.evaluate("tokens")).iterator();
+
+            Object previousFirst = null;
+            Object previousSecond = null;
+            while (firstTokens.hasNext()) {
+                if (false == secondTokens.hasNext()) {
+                    fail(second + " has fewer tokens than " + first + ". "
+                        + first + " has [" + firstTokens.next() + "] but " + second + " is out of tokens. "
+                        + first + "'s last token was [" + previousFirst + "] and "
+                        + second + "'s last token was [" + previousSecond + "]");
+                }
+                Map<?, ?> firstToken = (Map<?, ?>) firstTokens.next();
+                Map<?, ?> secondToken = (Map<?, ?>) secondTokens.next();
+                String firstText = (String) firstToken.get("token");
+                String secondText = (String) secondToken.get("token");
+                // Check the text and produce an error message with the utf8 sequence if they don't match.
+                if (false == secondText.equals(firstText)) {
+                    fail("text differs: " + first + " was [" + firstText + "] but " + second + " was [" + secondText
+                        + "]. In utf8 those are\n" + new BytesRef(firstText) + " and\n" + new BytesRef(secondText));
+                }
+                // Now check the whole map just in case the text matches but something else differs
+                assertEquals(firstToken, secondToken);
+                previousFirst = firstToken;
+                previousSecond = secondToken;
+            }
+            if (secondTokens.hasNext()) {
+                fail(second + " has more tokens than " + first + ". "
+                    + second + " has [" + secondTokens.next() + "] but " + first + " is out of tokens. "
+                    + first + "'s last token was [" + previousFirst + "] and "
+                    + second + "'s last token was [" + previousSecond + "]");
+            }
+        }
+    }
+}
diff --git a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ClientYamlTestClient.java b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ClientYamlTestClient.java
index 12621d73c32a9..795d99c51ef43 100644
--- a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ClientYamlTestClient.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ClientYamlTestClient.java
@@ -121,7 +121,7 @@ public ClientYamlTestResponse callApi(String apiName, Map<String, String> params
         }
         String contentType = entity.getContentType().getValue();
         //randomly test the GET with source param instead of GET/POST with body
-        if (sendBodyAsSourceParam(supportedMethods, contentType)) {
+        if (sendBodyAsSourceParam(supportedMethods, contentType, entity.getContentLength())) {
             logger.debug("sending the request body as source param with GET method");
             queryStringParams.put("source", EntityUtils.toString(entity));
             queryStringParams.put("source_content_type", contentType);
@@ -177,14 +177,25 @@ public ClientYamlTestResponse callApi(String apiName, Map<String, String> params
         }
     }
 
-    private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType) {
-        if (supportedMethods.contains(HttpGet.METHOD_NAME)) {
-            if (contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType()) ||
-                    contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
-                return RandomizedTest.rarely();
-            }
+    private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType, long contentLength) {
+        if (false == supportedMethods.contains(HttpGet.METHOD_NAME)) {
+            // The API doesn't claim to support GET anyway
+            return false;
+        }
+        if (contentLength < 0) {
+            // Negative length means "unknown" or "huge" in this case. Either way we can't send it as a parameter
+            return false;
+        }
+        if (contentLength > 2000) {
+            // Long bodies won't fit in the parameter and will cause a too_long_frame_exception
+            return false;
+        }
+        if (false == contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType())
+                && false == contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
+            // We can only encode JSON or YAML this way.
+            return false;
         }
-        return false;
+        return RandomizedTest.rarely();
     }
 
     private ClientYamlSuiteRestApi restApi(String apiName) {
diff --git a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ESClientYamlSuiteTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ESClientYamlSuiteTestCase.java
index 927f9b46c966a..950bb14eed9af 100644
--- a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ESClientYamlSuiteTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ESClientYamlSuiteTestCase.java
@@ -28,6 +28,7 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.test.rest.ESRestTestCase;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestApi;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
@@ -143,7 +144,19 @@ protected ClientYamlTestClient initClientYamlTestClient(ClientYamlSuiteRestSpec
         return new ClientYamlTestClient(restSpec, restClient, hosts, esVersion);
     }
 
+    /**
+     * Create parameters for this parameterized test. Uses the
+     * {@link ExecutableSection#XCONTENT_REGISTRY list} of executable sections
+     * defined in {@link ExecutableSection}.
+     */
     public static Iterable<Object[]> createParameters() throws Exception {
+        return createParameters(ExecutableSection.XCONTENT_REGISTRY);
+    }
+
+    /**
+     * Create parameters for this parameterized test.
+     */
+    public static Iterable<Object[]> createParameters(NamedXContentRegistry executableSectionRegistry) throws Exception {
         String[] paths = resolvePathsProperty(REST_TESTS_SUITE, ""); // default to all tests under the test root
         List<Object[]> tests = new ArrayList<>();
         Map<String, Set<Path>> yamlSuites = loadSuites(paths);
@@ -151,7 +164,7 @@ public static Iterable<Object[]> createParameters() throws Exception {
         for (String api : yamlSuites.keySet()) {
             List<Path> yamlFiles = new ArrayList<>(yamlSuites.get(api));
             for (Path yamlFile : yamlFiles) {
-                ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(api, yamlFile);
+                ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(executableSectionRegistry, api, yamlFile);
                 for (ClientYamlTestSection testSection : restTestSuite.getTestSections()) {
                     tests.add(new Object[]{ new ClientYamlTestCandidate(restTestSuite, testSection) });
                 }
diff --git a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ClientYamlTestSuite.java b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ClientYamlTestSuite.java
index 72c83f632efb0..b9988128b02a4 100644
--- a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ClientYamlTestSuite.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ClientYamlTestSuite.java
@@ -21,6 +21,7 @@
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.xcontent.DeprecationHandler;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.yaml.YamlXContent;
 
@@ -40,7 +41,7 @@
  * Supports a setup section and multiple test sections.
  */
 public class ClientYamlTestSuite {
-    public static ClientYamlTestSuite parse(String api, Path file) throws IOException {
+    public static ClientYamlTestSuite parse(NamedXContentRegistry executableSectionRegistry, String api, Path file) throws IOException {
         if (!Files.isRegularFile(file)) {
             throw new IllegalArgumentException(file.toAbsolutePath() + " is not a file");
         }
@@ -64,7 +65,7 @@ public static ClientYamlTestSuite parse(String api, Path file) throws IOException
             }
         }
 
-        try (XContentParser parser = YamlXContent.yamlXContent.createParser(ExecutableSection.XCONTENT_REGISTRY,
+        try (XContentParser parser = YamlXContent.yamlXContent.createParser(executableSectionRegistry,
                 LoggingDeprecationHandler.INSTANCE, Files.newInputStream(file))) {
             return parse(api, filename, parser);
         } catch(Exception e) {
diff --git a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ExecutableSection.java b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ExecutableSection.java
index 827457f4c2ae2..ce5ea1c1cde06 100644
--- a/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ExecutableSection.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ExecutableSection.java
@@ -26,15 +26,18 @@
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
+
+import static java.util.Collections.unmodifiableList;
 
 /**
  * Represents a test fragment that can be executed (e.g. api call, assertion)
  */
 public interface ExecutableSection {
     /**
-     * {@link NamedXContentRegistry} needed in the {@link XContentParser} before calling {@link ExecutableSection#parse(XContentParser)}.
+     * Default list of {@link ExecutableSection}s available for tests.
      */
-    NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(Arrays.asList(
+    List<NamedXContentRegistry.Entry> DEFAULT_EXECUTABLE_CONTEXTS = unmodifiableList(Arrays.asList(
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("do"), DoSection::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("set"), SetSection::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("match"), MatchAssertion::parse),
@@ -46,6 +49,12 @@ public interface ExecutableSection {
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("lte"), LessThanOrEqualToAssertion::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("length"), LengthAssertion::parse)));
 
+    /**
+     * {@link NamedXContentRegistry} that parses the default list of
+     * {@link ExecutableSection}s available for tests.
+     */
+    NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(DEFAULT_EXECUTABLE_CONTEXTS);
+
     static ExecutableSection parse(XContentParser parser) throws IOException {
         ParserUtils.advanceToFieldName(parser);
         String section = parser.currentName();
@@ -60,7 +69,7 @@ static ExecutableSection parse(XContentParser parser) throws IOException {
     }
 
     /**
-     * Get the location in the test that this was defined.
+     * Get the location in the test that this was defined.
      */
     XContentLocation getLocation();
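
For other suites that want to register custom sections on top of the defaults, the pattern established by `DocsClientYamlTestSuiteIT.parameters()` above looks roughly like this; `my_section` and `MySection` are hypothetical stand-ins for a real `ExecutableSection` implementation:

```java
// Sketch: extend the default executable sections with one custom entry and
// hand the resulting registry to the new createParameters overload.
List<NamedXContentRegistry.Entry> entries =
        new ArrayList<>(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS.size() + 1);
entries.addAll(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS);
entries.add(new NamedXContentRegistry.Entry(ExecutableSection.class,
        new ParseField("my_section"), MySection::parse)); // MySection is hypothetical
return ESClientYamlSuiteTestCase.createParameters(new NamedXContentRegistry(entries));
```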