Skip to content

Commit

Permalink
Better plural stemmer than minimal_english (opensearch-project#4738) (o…
Browse files Browse the repository at this point in the history
…pensearch-project#4834)

Drops the trailing "e" when stemming plurals such as taxes, dresses, watches and dishes,
which otherwise causes mismatches between the plural and singular forms.

Signed-off-by: Nicholas Walter Knize <[email protected]>

Co-authored-by: Mark Harwood <[email protected]>
Co-authored-by: Nicholas Walter Knize <[email protected]>
(cherry picked from commit c92846d)
  • Loading branch information
nknize authored Oct 19, 2022
1 parent 45fb2e0 commit a44370d
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- [Segment Replication] Update replicas to commit SegmentInfos instead of relying on segments_N from primary shards ([#4450](https://github.com/opensearch-project/OpenSearch/pull/4450))
- [Segment Replication] Adding check to make sure checkpoint is not processed when a shard's shard routing is primary ([#4716](https://github.com/opensearch-project/OpenSearch/pull/4716))
- Disable merge on refresh in DiskThresholdDeciderIT ([#4828](https://github.com/opensearch-project/OpenSearch/pull/4828))
- Better plural stemmer than minimal_english ([#4738](https://github.com/opensearch-project/OpenSearch/pull/4738))

### Security

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

import java.io.IOException;

/**
 * Token filter that rewrites each token's term text using an
 * {@link EnglishPluralStemmer}, so that English plural forms are reduced
 * towards their singular form.
 * <p>
 * Tokens whose {@link KeywordAttribute} reports them as keywords are passed
 * through untouched.
 */
public final class EnglishPluralStemFilter extends TokenFilter {
    private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

    /**
     * Wraps the given stream with plural stemming.
     *
     * @param input the upstream token stream whose tokens will be stemmed
     */
    public EnglishPluralStemFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream by one token and, unless that token is marked
     * as a keyword, truncates its term to the length computed by the stemmer.
     *
     * @return {@code true} if a token was produced, {@code false} once the
     *         wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails to advance
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (keywordAttr.isKeyword()) {
            // Keyword tokens must keep their exact text.
            return true;
        }
        // The stemmer edits the term buffer in place and reports the new length.
        termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
        return true;
    }

    /**
     * Plural stemmer for English, derived from the stemmer behind
     * {@link EnglishMinimalStemFilter}.
     * <p>
     * On top of plain trailing-"s" removal it strips the dangling "e" left
     * behind by these plural suffixes:
     * <ul>
     * <li>xes - "boxes" becomes "box" (words of four letters keep the e, so "axes" becomes "axe")</li>
     * <li>sses - "dresses" becomes "dress"</li>
     * <li>shes - "dishes" becomes "dish"</li>
     * <li>ches - "watches" becomes "watch", except for the words listed in
     * {@link #chesExceptions}, e.g. "caches" becomes "cache"</li>
     * </ul>
     * See https://github.com/elastic/elasticsearch/issues/42892
     * <p>
     * The remaining "es" handling works as follows:
     * <ul>
     * <li>ees-&gt;ee, so that "bees" matches "bee"</li>
     * <li>ies-&gt;y only for words longer than four letters, so that "spies" becomes "spy"
     * while "ties" still matches "tie"</li>
     * <li>oes-&gt;o, so that "tomatoes" matches "tomato", except for the words listed in
     * {@link #oesExceptions}, e.g. "shoes" becomes "shoe"</li>
     * </ul>
     * All rules compare against lower-case letters only.
     */
    public static class EnglishPluralStemmer {

        // Words ending in oes that retain the e when stemmed
        public static final char[][] oesExceptions = { "shoes".toCharArray(), "canoes".toCharArray(), "oboes".toCharArray() };
        // Words ending in ches that retain the e when stemmed
        public static final char[][] chesExceptions = {
            "cliches".toCharArray(),
            "avalanches".toCharArray(),
            "mustaches".toCharArray(),
            "moustaches".toCharArray(),
            "quiches".toCharArray(),
            "headaches".toCharArray(),
            "heartaches".toCharArray(),
            "porsches".toCharArray(),
            "tranches".toCharArray(),
            "caches".toCharArray() };

        /**
         * Computes the stemmed length of the word held in {@code s[0..len)}.
         * <p>
         * The buffer is only modified for the ies-&gt;y rule, where the 'i' is
         * overwritten with 'y'; every other rule simply shortens the reported length.
         *
         * @param s   buffer containing the word to stem
         * @param len number of characters of {@code s} that belong to the word
         * @return the length of the stem, which is at most {@code len}
         */
        public int stem(char s[], int len) {
            // Too short to stem, or not ending in "s": leave untouched.
            if (len < 3 || s[len - 1] != 's') {
                return len;
            }

            final char beforeS = s[len - 2];
            // "...us" and "...ss" endings are never treated as plurals.
            if (beforeS == 'u' || beforeS == 's') {
                return len;
            }
            // Anything that is not a four-plus letter "...es" word just loses its "s".
            if (beforeS != 'e' || len == 3) {
                return len - 1;
            }

            final char beforeEs = s[len - 3];
            final boolean longerThanFour = len > 4;

            // ies -> y, restricted to longer words so spies -> spy but pies -> pie.
            if (longerThanFour && beforeEs == 'i') {
                s[len - 3] = 'y';
                return len - 2;
            }
            // xes, needing more than one prefix letter: boxes -> box but axes -> axe.
            if (longerThanFour && beforeEs == 'x') {
                return len - 2;
            }
            // oes: drop "es" unless the word is one of the exceptions that keeps its e.
            if (beforeEs == 'o') {
                return isException(s, len, oesExceptions) ? len - 1 : len - 2;
            }
            if (longerThanFour) {
                final char twoBeforeEs = s[len - 4];
                // shes / sses
                if (twoBeforeEs == 's' && (beforeEs == 'h' || beforeEs == 's')) {
                    return len - 2;
                }
                // ches: drop "es" unless the word is one of the exceptions that keeps its e.
                if (twoBeforeEs == 'c' && beforeEs == 'h') {
                    return isException(s, len, chesExceptions) ? len - 1 : len - 2;
                }
            }

            // No "es" rule applied (e.g. "ees", "ses"): only remove the trailing "s".
            return len - 1;
        }

        /**
         * Tells whether the word in {@code s[0..len)} ends with one of the given exceptions.
         * <p>
         * Characters are compared from the end of both arrays and the comparison stops as
         * soon as either one is exhausted. A word that is shorter than an exception and
         * equal to that exception's tail (for example "hoes" against "shoes") is therefore
         * reported as a match as well.
         *
         * @param s              buffer containing the word
         * @param len            number of characters of {@code s} that belong to the word
         * @param exceptionsList the exception words to compare against
         * @return {@code true} if any exception matches
         */
        private boolean isException(char[] s, int len, char[][] exceptionsList) {
            for (final char[] candidate : exceptionsList) {
                final int overlap = Math.min(candidate.length, len);
                int offset = 1;
                while (offset <= overlap && candidate[candidate.length - offset] == s[len - offset]) {
                    offset++;
                }
                if (offset > overlap) {
                    // Every overlapping character agreed.
                    return true;
                }
            }
            return false;
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ public TokenStream create(TokenStream tokenStream) {
return new SnowballFilter(tokenStream, new EnglishStemmer());
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
return new EnglishMinimalStemFilter(tokenStream);
} else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
return new EnglishPluralStemFilter(tokenStream);
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,83 @@ public void testPorter2FilterFactory() throws IOException {
}
}

/**
 * Verifies that a "stemmer" filter configured with language "plural_english" is backed by
 * {@link EnglishPluralStemFilter} and produces the expected stems, across randomly chosen
 * index-created versions.
 */
public void testEnglishPluralFilter() throws IOException {
    // Each row is { input token, expected stem }; rows are checked in the order listed.
    final String[][] expectations = {
        // Check old EnglishMinimalStemmer ("S" stemmer) logic
        { "phones", "phone" },
        { "horses", "horse" },
        { "cameras", "camera" },

        // The original s stemmer gives up on stemming oes words because English has no fixed rule for the stem
        // (see https://howtospell.co.uk/making-O-words-plural )
        // This stemmer removes the es but retains e for a small number of exceptions
        { "mosquitoes", "mosquito" },
        { "heroes", "hero" },
        // oes exceptions that retain the e.
        { "shoes", "shoe" },
        { "horseshoes", "horseshoe" },
        { "canoes", "canoe" },
        { "oboes", "oboe" },

        // Check improved EnglishPluralStemFilter logic
        // sses
        { "dresses", "dress" },
        { "possess", "possess" },
        { "possesses", "possess" },
        // xes
        { "boxes", "box" },
        { "axes", "axe" },
        // shes
        { "dishes", "dish" },
        { "washes", "wash" },
        // ees
        { "employees", "employee" },
        { "bees", "bee" },
        // tch
        { "watches", "watch" },
        { "itches", "itch" },
        // ies->y but only for length >4
        { "spies", "spy" },
        { "ties", "tie" },
        { "lies", "lie" },
        { "pies", "pie" },
        { "dies", "die" },

        // ches, including the exceptions that retain the e
        { "lunches", "lunch" },
        { "avalanches", "avalanche" },
        { "headaches", "headache" },
        { "caches", "cache" },
        { "beaches", "beach" },
        { "britches", "britch" },
        { "cockroaches", "cockroach" },
        { "cliches", "cliche" },
        { "quiches", "quiche" } };

    final int iters = scaledRandomIntBetween(20, 100);
    for (int round = 0; round < iters; round++) {
        final Version createdVersion = VersionUtils.randomVersion(random());
        final Settings settings = Settings.builder()
            .put("index.analysis.filter.my_plurals.type", "stemmer")
            .put("index.analysis.filter.my_plurals.language", "plural_english")
            .put("index.analysis.analyzer.my_plurals.tokenizer", "whitespace")
            .put("index.analysis.analyzer.my_plurals.filter", "my_plurals")
            .put(SETTING_VERSION_CREATED, createdVersion)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();

        final OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);

        // The configured filter must be a stemmer factory ...
        final TokenFilterFactory pluralsFactory = analysis.tokenFilter.get("my_plurals");
        assertThat(pluralsFactory, instanceOf(StemmerTokenFilterFactory.class));

        // ... that builds the plural stem filter when applied to a token stream.
        final Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("dresses"));
        final TokenStream stemmed = pluralsFactory.create(tokenizer);
        final IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        final NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals");
        assertThat(stemmed, instanceOf(EnglishPluralStemFilter.class));

        for (final String[] expectation : expectations) {
            assertAnalyzesTo(analyzer, expectation[0], new String[] { expectation[1] });
        }
    }
}

public void testMultipleLanguagesThrowsException() throws IOException {
Version v = VersionUtils.randomVersion(random());
Settings settings = Settings.builder()
Expand Down

0 comments on commit a44370d

Please sign in to comment.