From 74429fe263315251bccbf0a3e2250f4d5abadb80 Mon Sep 17 00:00:00 2001 From: Ralph Ursprung Date: Thu, 12 Sep 2024 18:49:29 +0200 Subject: [PATCH] add `phone` & `phone-search` analyzer + tokenizer this is largely based on [elasticsearch-phone] and internally uses [libphonenumber]. this intentionally only ports a subset of the features: only `phone` and `phone-search` are supported right now, `phone-email` can be added if/when there's a clear need for it. this allows defining the region to be used when analysing a phone number. so far only the generic "unkown" region (`ZZ`) had been used which worked as long as international numbers were prefixed with `+` but did not work when using local numbers (e.g. a number stored as `+4158...` was not matched against a number entered as `004158...` or `058...`). example configuration for an index: ```json { "index": { "analysis": { "analyzer": { "phone": { "type": "phone" }, "phone-search": { "type": "phone-search" }, "phone-ch": { "type": "phone", "phone-region": "CH" }, "phone-search-ch": { "type": "phone-search", "phone-region": "CH" } } } } } ``` this creates four analyzers: `phone` and `phone-search` which do not explicitly specify a region and thus fall back to `ZZ` (unknown region, regional version of international dialing prefix (e.g. `00` instead of `+` in most of europe) will not be recognised) and `phone-ch` and `phone-search-ch` which will try to parse the phone number as a swiss phone number (thus e.g. `00` as a prefix is recognised as the international dialing prefix). closes opensearch-project/OpenSearch#11326 [elasticsearch-phone]: https://github.com/purecloudlabs/elasticsearch-phone [libphonenumber]: https://github.com/google/libphonenumber Signed-off-by: Ralph Ursprung --- modules/analysis-common/build.gradle | 1 + .../common/CommonAnalysisModulePlugin.java | 5 + .../analysis/common/PhoneNumberAnalyzer.java | 52 ++++ .../common/PhoneNumberAnalyzerProvider.java | 31 +++ .../common/PhoneNumberTermTokenizer.java | 147 +++++++++++ .../PhoneNumberTermTokenizerFactory.java | 33 +++ .../common/PhoneNumberAnalyzerTests.java | 238 ++++++++++++++++++ .../analysis/common/phone_analysis.json | 22 ++ 8 files changed, 529 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzer.java create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzerProvider.java create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizer.java create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizerFactory.java create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/PhoneNumberAnalyzerTests.java create mode 100644 modules/analysis-common/src/test/resources/org/opensearch/analysis/common/phone_analysis.json diff --git a/modules/analysis-common/build.gradle b/modules/analysis-common/build.gradle index 58ecf79cda0d7..6205679b26082 100644 --- a/modules/analysis-common/build.gradle +++ b/modules/analysis-common/build.gradle @@ -44,4 +44,5 @@ restResources { dependencies { compileOnly project(':modules:lang-painless') + implementation group: 'com.googlecode.libphonenumber', name: 'libphonenumber', version: '8.13.45' } diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index f14e499081ce9..334c2fb454945 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -205,6 +205,8 @@ public Map>> getAn analyzers.put("fingerprint", FingerprintAnalyzerProvider::new); analyzers.put("pattern", PatternAnalyzerProvider::new); analyzers.put("snowball", SnowballAnalyzerProvider::new); + analyzers.put("phone", (indexSettings, environment, name, settings) -> new PhoneNumberAnalyzerProvider(indexSettings, "phone", settings, true)); + analyzers.put("phone-search", (indexSettings, environment, name, settings) -> new PhoneNumberAnalyzerProvider(indexSettings, "phone-search", settings, true)); // Language analyzers: analyzers.put("arabic", ArabicAnalyzerProvider::new); @@ -411,6 +413,9 @@ public Map> getTokenizers() { tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new); tokenizers.put("whitespace", WhitespaceTokenizerFactory::new); tokenizers.put("keyword", KeywordTokenizerFactory::new); + tokenizers.put("phone", (indexSettings, environment, name, settings) -> new PhoneNumberTermTokenizerFactory(indexSettings, "phone", settings, true)); + tokenizers.put("phone-search", (indexSettings, environment, name, settings) -> new PhoneNumberTermTokenizerFactory(indexSettings, "phone-search", settings, true)); + return tokenizers; } diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzer.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzer.java new file mode 100644 index 0000000000000..e0179545e36eb --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzer.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; +import org.opensearch.common.settings.Settings; + +/** + * Analyzer for phone numbers, using {@link PhoneNumberTermTokenizer}. + * + *

+ * You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. + * Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) + * for the {@code search_analyzer}. + *

+ * + *

+ * You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that + * phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. + *

+ * + *

+ * Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings + * containing phone numbers and not arbitrary text with interspersed phone numbers. + *

+ */ +public class PhoneNumberAnalyzer extends Analyzer { + private final boolean addNgrams; + private final Settings settings; + + /** + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + * @param settings the settings for the analyzer. + */ + public PhoneNumberAnalyzer(final Settings settings, final boolean addNgrams) { + this.addNgrams = addNgrams; + this.settings = settings; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + final var tokenizer = new PhoneNumberTermTokenizer(this.settings, this.addNgrams); + return new Analyzer.TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer)); + } +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzerProvider.java new file mode 100644 index 0000000000000..8881c0a9c13b2 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzerProvider.java @@ -0,0 +1,31 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.opensearch.common.settings.Settings; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider; + +/** + * Provider for {@link PhoneNumberAnalyzer}. + */ +public class PhoneNumberAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final PhoneNumberAnalyzer analyzer; + + public PhoneNumberAnalyzerProvider(final IndexSettings indexSettings, final String name, final Settings settings, final boolean addNgrams) { + super(indexSettings, name, settings); + this.analyzer = new PhoneNumberAnalyzer(settings, addNgrams); + } + + @Override + public PhoneNumberAnalyzer get() { + return this.analyzer; + } +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizer.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizer.java new file mode 100644 index 0000000000000..21b4931d3c6dd --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizer.java @@ -0,0 +1,147 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import com.google.i18n.phonenumbers.NumberParseException; +import com.google.i18n.phonenumbers.PhoneNumberUtil; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.opensearch.common.io.Streams; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.common.Strings; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; + +/** + * This tokenizes a phone number into its individual parts, using {@link PhoneNumberUtil}. + * + *

+ * You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. + * Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) + * for the {@code search_analyzer}. + *

+ * + *

+ * You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that + * phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. + *

+ * + *

+ * Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings + * containing phone numbers and not arbitrary text with interspersed phone numbers. + *

+ */ +public final class PhoneNumberTermTokenizer extends Tokenizer { + private final boolean addNgrams; + private final Settings settings; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private Iterator tokenIterator; + + /** + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + * @param settings the settings for the analyzer. + */ + public PhoneNumberTermTokenizer(final Settings settings, final boolean addNgrams) { + super(); + this.addNgrams = addNgrams; + this.settings = settings; + } + + @Override + public void reset() throws IOException { + super.reset(); + tokenIterator = null; + } + + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (tokenIterator == null) { + tokenIterator = getTokens().iterator(); + } + if (tokenIterator.hasNext()) { + termAtt.append(tokenIterator.next()); + posIncrAtt.setPositionIncrement(0); + return true; + } + return false; + } + + private Set getTokens() throws IOException { + final var tokens = new HashSet(); + + var input = Streams.copyToString(this.input); + + tokens.add(input); + + // Rip off the "tel:" or "sip:" prefix + if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { + tokens.add(input.substring(0, 4)); + input = input.substring(4); + } + + final var startIndex = input.startsWith("+") ? 1 : 0; + // Add the complete input but skip a leading + + tokens.add(input.substring(startIndex)); + + // Drop anything after @. Most likely there's nothing of interest + final var posAt = input.indexOf('@'); + if (posAt != -1) { + input = input.substring(0, posAt); + + // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of telephone + // number so take it as is + tokens.add(input.substring(startIndex)); + } + + // Let google's libphone try to parse it + final var phoneUtil = PhoneNumberUtil.getInstance(); + Optional countryCode = Optional.empty(); + try { + // ZZ is the generic "I don't know the country code" region. Google's libphone library will try to infer it. + final var region = this.settings.get("phone-region", "ZZ"); + final var numberProto = phoneUtil.parse(input, region); + if (numberProto != null) { + // Libphone likes it! + countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); + input = String.valueOf(numberProto.getNationalNumber()); + + // Add Country code, extension, and the number as tokens + tokens.add(countryCode.get()); + tokens.add(countryCode.get() + input); + if (!Strings.isEmpty(numberProto.getExtension())) { + tokens.add(numberProto.getExtension()); + } + + tokens.add(input); + } + } catch (final NumberParseException | StringIndexOutOfBoundsException e) { + // Libphone didn't like it, no biggie. We'll just ngram the number as it is. + } + + // ngram the phone number, e.g. 19198243333 produces 9, 91, 919, etc + if (this.addNgrams && Strings.isDigits(input)) { + for (int count = 1; count <= input.length(); ++count) { + final var token = input.substring(0, count); + tokens.add(token); + // If there was a country code, add more ngrams such that 19198243333 produces 19, 191, 1919, etc + countryCode.ifPresent(s -> tokens.add(s + token)); + } + } + + return tokens; + } + +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizerFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizerFactory.java new file mode 100644 index 0000000000000..59d1058fef7b5 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizerFactory.java @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.Tokenizer; +import org.opensearch.common.settings.Settings; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenizerFactory; + +/** + * Factory for {@link PhoneNumberTermTokenizer}. + */ +public class PhoneNumberTermTokenizerFactory extends AbstractTokenizerFactory { + private final Settings settings; + private final boolean addNgrams; + + public PhoneNumberTermTokenizerFactory(final IndexSettings indexSettings, final String name, final Settings settings, final boolean addNgrams) { + super(indexSettings, settings, name); + this.settings = settings; + this.addNgrams = addNgrams; + } + + @Override + public Tokenizer create() { + return new PhoneNumberTermTokenizer(this.settings, this.addNgrams); + } +} diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PhoneNumberAnalyzerTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PhoneNumberAnalyzerTests.java new file mode 100644 index 0000000000000..b704ec5834e17 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PhoneNumberAnalyzerTests.java @@ -0,0 +1,238 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.junit.BeforeClass; +import org.opensearch.index.analysis.AnalysisTestsHelper; +import org.opensearch.test.OpenSearchTokenStreamTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.hamcrest.Matchers.hasItemInArray; + +public class PhoneNumberAnalyzerTests extends OpenSearchTokenStreamTestCase { + private static final String RESOURCE = "/org/opensearch/analysis/common/phone_analysis.json"; + + private static Analyzer phoneAnalyzer; + private static Analyzer phoneSearchAnalyzer; + private static Analyzer phoneCHAnalyzer; + private static Analyzer phoneSearchCHAnalyzer; + + @BeforeClass + public static void beforeClass() throws IOException { + final var analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath( + createTempDir(), + RESOURCE, + new CommonAnalysisModulePlugin() + ); + phoneAnalyzer = analysis.indexAnalyzers.get("phone"); + assertNotNull(phoneAnalyzer); + phoneSearchAnalyzer = analysis.indexAnalyzers.get("phone-search"); + assertNotNull(phoneSearchAnalyzer); + phoneCHAnalyzer = analysis.indexAnalyzers.get("phone-ch"); + assertNotNull(phoneCHAnalyzer); + phoneSearchCHAnalyzer = analysis.indexAnalyzers.get("phone-search-ch"); + assertNotNull(phoneSearchCHAnalyzer); + } + + /** + * Test for all tokens which are emitted by the "phone" analyzer. + */ + public void testEuropeDetailled() throws IOException { + assertTokenStreamContents(phoneAnalyzer.tokenStream("test", "tel:+441344840400"), new String[]{"44", + "13", + "441344840", + "44134", + "13448404", + "tel:", + "134484040", + "44134484040", + "441344", + "1", + "441", + "134", + "1344", + "1344840400", + "tel:+441344840400", + "4413448404", + "4413", + "134484", + "44134484", + "441344840400", + "13448", + "4413448", + "1344840", + }); + } + + /** + * Test for all tokens which are emitted by the "phone" analyzer. + */ + public void testEuropeDetailledSearch() throws IOException { + assertTokenStreamContents(phoneSearchAnalyzer.tokenStream("test", "tel:+441344840400"), new String[]{ + "44", + "1344840400", + "tel:+441344840400", + "tel:", + "441344840400", + }); + } + + public void testEurope() throws IOException { + assertTokensInclude("tel:+441344840400", Arrays.asList("44", "1344", "1344840400", "441344840400")); + } + + public void testGermanCastle() throws IOException { + assertTokensInclude("tel:+498362930830", Arrays.asList("49", "498362930830", "8362930830")); + } + + public void testBMWofSydney() throws IOException { + assertTokensInclude("tel:+61293344555", Arrays.asList("61", "293344555", "61293344555")); + } + + public void testCoffeeShopInIreland() throws IOException { + assertTokensInclude("tel:+442890319416", Arrays.asList("44", "289", "2890319416", "442890319416")); + } + + public void testTelWithCountryCode() throws IOException { + assertTokensInclude("tel:+17177158163", Arrays.asList("1", "717", "7177", "17177158163")); + } + + public void testTelWithCountryCode2() throws IOException { + assertTokensInclude("tel:+12177148350", Arrays.asList("1", "217", "2177", "2177148350", "12177148350")); + } + + public void testNewTollFreeNumber() throws IOException { + assertTokensInclude("tel:+18337148350", Arrays.asList("1", "833", "8337", "8337148350", "18337148350")); + } + + public void testMissingCountryCode() throws IOException { + assertTokensInclude("tel:8177148350", Arrays.asList("817", "8177", "81771", "817714", "8177148350")); + } + + public void testSipWithNumericUsername() throws IOException { + assertTokensInclude("sip:222@autosbcpc", Arrays.asList("222")); + } + + public void testTruncatedNumber() throws IOException { + assertTokensInclude("tel:5551234", Arrays.asList("5551234")); + } + + public void testSipWithAlphabeticUsername() throws IOException { + assertTokensInclude("sip:abc@autosbcpc", Arrays.asList("abc")); + } + + public void testGarbageInGarbageOut() throws IOException { + assertTokensInclude("test", Arrays.asList("test")); + } + + public void testSipWithCountryCode() throws IOException { + assertTokensInclude("sip:+14177141363@178.97.105.13;isup-oli=0;pstn-params=808481808882", Arrays.asList("417", "4177", "14177")); + } + + public void testSipWithTelephoneExtension() throws IOException { + assertTokensInclude("sip:+13169410766;ext=2233@178.17.10.117:8060", Arrays.asList("316", "2233", "1316")); + } + + public void testSipWithUsername() throws IOException { + assertTokensInclude("sip:JeffSIP@178.12.220.18", Arrays.asList("JeffSIP")); + } + + public void testPhoneNumberWithoutPrefix() throws IOException { + assertTokensInclude("+14177141363", Arrays.asList("14177141363", "417", "4177", "14177")); + } + + public void testSipWithoutDomainPart() throws IOException { + assertTokensInclude("sip:+122882", Arrays.asList("122882", "122", "228", "1228", "2288", "12288")); + } + + public void testTelPrefix() throws IOException { + assertTokensInclude("tel:+1228", Arrays.asList("1228", "122", "228")); + } + + public void testNumberPrefix() throws IOException { + assertTokensInclude("+1228", Arrays.asList("1228", "122", "228")); + } + + public void testInternationalPrefixWithZZ() throws IOException { + assertTokensInclude(phoneAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testInternationalPrefixWithCH() throws IOException { + assertTokensInclude(phoneCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testNationalPrefixWithCH() throws IOException { + // + is equivalent to 00 in Switzerland + assertTokensInclude(phoneCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testLocalNumberWithCH() throws IOException { + // when omitting the international prefix swiss numbers must start with '0' + assertTokensInclude(phoneCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchInternationalPrefixWithZZ() throws IOException { + assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchInternationalPrefixWithCH() throws IOException { + assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchNationalPrefixWithCH() throws IOException { + // + is equivalent to 00 in Switzerland + assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchLocalNumberWithCH() throws IOException { + // when omitting the international prefix swiss numbers must start with '0' + assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + /** + * Unlike {@link #assertTokenStreamContents(TokenStream, String[])} this only asserts whether the generated tokens + * contain the required ones but does not check for completeness or order. + */ + private void assertTokensInclude(final Analyzer analyzer, final String input, final List expectedTokens) throws IOException { + final var ts = analyzer.tokenStream("test", input); + final var allTokens = getAllTokens(ts).toArray(); + for (final var expectedToken : expectedTokens) { + assertThat(allTokens, hasItemInArray(expectedToken)); + } + } + + /** + * Unlike {@link #assertTokenStreamContents(TokenStream, String[])} this only asserts whether the generated tokens + * contain the required ones but does not check for completeness or order. + * This uses {@link #phoneAnalyzer}. + */ + private void assertTokensInclude(final String input, final List expectedTokens) throws IOException { + this.assertTokensInclude(phoneAnalyzer, input, expectedTokens); + } + + private List getAllTokens(final TokenStream ts) throws IOException { + final var tokens = new ArrayList(); + final var termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + tokens.add(termAtt.toString()); + } + ts.end(); + ts.close(); + return tokens; + } + +} diff --git a/modules/analysis-common/src/test/resources/org/opensearch/analysis/common/phone_analysis.json b/modules/analysis-common/src/test/resources/org/opensearch/analysis/common/phone_analysis.json new file mode 100644 index 0000000000000..7e45177c57492 --- /dev/null +++ b/modules/analysis-common/src/test/resources/org/opensearch/analysis/common/phone_analysis.json @@ -0,0 +1,22 @@ +{ + "index": { + "analysis": { + "analyzer": { + "phone": { + "type": "phone" + }, + "phone-search": { + "type": "phone-search" + }, + "phone-ch": { + "type": "phone", + "phone-region": "CH" + }, + "phone-search-ch": { + "type": "phone-search", + "phone-region": "CH" + } + } + } + } +}