Skip to content

Commit

Permalink
refine boolean analyzer to require values true,false,yes,no,1,0,t,f,y…
Browse files Browse the repository at this point in the history
…,n when indexing as string (case insensitive) and to only match one of those terms, in any letter case, when doing a query. Before this change, unknown values were mapped to false and any terms starting with t,f,y,n were matched.
  • Loading branch information
mdavis95 committed Aug 26, 2020
1 parent e1c5976 commit 843b3d1
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 35 deletions.
Original file line number Diff line number Diff line change
@@ -1,46 +1,39 @@
package io.zulia.server.analysis.analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;

import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

/**
* Created by Matt Davis on 2/10/16.
* Based on org.apache.solr.schema.BoolField from Solr
*
* Lucene {@link Analyzer} that normalizes boolean-like text to the single-character
* terms "T" and "F".
*
* NOTE(review): this listing appears to be a commit diff rendered without +/- markers,
* so it contains both the previous and the replacement implementation interleaved
* (duplicate TRUE_TOKEN/FALSE_TOKEN declarations, two "tokenizer" locals and two
* return statements in createComponents). It is not compilable as shown; confirm
* against the repository which lines belong to the current revision.
*/
public class BooleanAnalyzer extends Analyzer {

// char[] form of the canonical tokens; only referenced by the anonymous Tokenizer's
// copyBuffer(...) call below (presumably the pre-commit declarations).
protected final static char[] TRUE_TOKEN = { 'T' };
protected final static char[] FALSE_TOKEN = { 'F' };
// String form of the canonical tokens; used as the replacement text in the
// PatternReplaceFilter chain below and as the entries of booleanTokens
// (presumably the post-commit declarations).
public static final String TRUE_TOKEN = "T";
public static final String FALSE_TOKEN = "F";

// Unmodifiable set holding exactly TRUE_TOKEN and FALSE_TOKEN, handed to KeepWordFilter below.
// The trailing "false" argument is presumably CharArraySet's ignoreCase flag — TODO confirm.
private static final CharArraySet booleanTokens = CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList(TRUE_TOKEN, FALSE_TOKEN), false));
// Case-insensitive alternatives accepted as "true": true, t, yes, y, 1.
public static final Pattern truePattern = Pattern.compile("true|t|yes|y|1", Pattern.CASE_INSENSITIVE);
// Case-insensitive alternatives accepted as "false": false, f, no, n, 0.
public static final Pattern falsePattern = Pattern.compile("false|f|no|n|0", Pattern.CASE_INSENSITIVE);

/**
* Builds the token stream components for a boolean field.
*
* Two variants are visible in this listing (see the class-level note):
* a hand-written single-character tokenizer, and a filter chain of
* KeywordTokenizer -> PatternReplaceFilter (true) -> PatternReplaceFilter (false) -> KeepWordFilter.
*
* @param fieldName name of the field being analyzed; not read by either variant shown here
* @return the components wrapping the tokenizer (and, in the filter-chain variant, the final filter)
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
// --- Variant 1: anonymous tokenizer that inspects only the first character of the input. ---
// This matches the "before" behavior described in the commit message: any input not
// starting with t/T/y/Y/1 (including unknown values) becomes FALSE_TOKEN.
Tokenizer tokenizer = new Tokenizer() {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
// Set once the single token has been produced; cleared again in reset().
boolean done = false;

@Override
public void reset() throws IOException {
super.reset();
done = false;
}

@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// At most one token is emitted per reset().
if (done)
return false;
done = true;
// Only the first character is read; the rest of the input is ignored.
int ch = input.read();
// Empty input yields no token at all.
if (ch == -1)
return false;
// Emit the one-character TRUE_TOKEN or FALSE_TOKEN buffer (offset 0, length 1).
termAtt.copyBuffer(((ch == 't' || ch == 'T' || ch == 'y' || ch == 'Y' || ch == '1') ? TRUE_TOKEN : FALSE_TOKEN), 0, 1);
return true;
}
};

return new TokenStreamComponents(tokenizer);

// --- Variant 2: filter chain built from stock Lucene components. ---
// NOTE(review): unreachable as listed because of the return above; presumably this is the
// post-commit body that replaced variant 1.
// Presumably KeywordTokenizer emits the entire input as a single token — confirm in Lucene docs.
final Tokenizer tokenizer = new KeywordTokenizer();
// Replace a truePattern match with "T"; the final "false" argument is presumably
// PatternReplaceFilter's replaceAll flag (replace the first match only) — TODO confirm.
TokenFilter result = new PatternReplaceFilter(tokenizer, truePattern, BooleanAnalyzer.TRUE_TOKEN, false);
// Then replace a falsePattern match with "F", using the same flag.
result = new PatternReplaceFilter(result, falsePattern, FALSE_TOKEN, false);
// Keep only tokens that are in booleanTokens ("T" or "F"); anything else is dropped.
result = new KeepWordFilter(result, booleanTokens);

return new TokenStreamComponents(tokenizer, result);

}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package io.zulia.server.index.field;

import io.zulia.server.analysis.analyzer.BooleanAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
Expand Down Expand Up @@ -33,14 +34,16 @@ protected void handleValue(Document d, String storedFieldName, Object value, Str
}
else if (value instanceof String) {
String v = (String) value;
if (v.startsWith("T") || v.startsWith("F") || v.startsWith("Y") || v.startsWith("N") || v.startsWith("t") || v.startsWith("f") || v
.startsWith("y") || v.startsWith("n") || v.startsWith("0") || v.startsWith("1")) {
d.add((new Field(indexedFieldName, v, notStoredTextField)));
if (BooleanAnalyzer.truePattern.matcher(v).matches()) {
d.add((new Field(indexedFieldName, BooleanAnalyzer.TRUE_TOKEN, notStoredTextField)));
}
else if (BooleanAnalyzer.falsePattern.matcher(v).matches()) {
d.add((new Field(indexedFieldName, BooleanAnalyzer.FALSE_TOKEN, notStoredTextField)));
}
else {
throw new Exception(
"String for Boolean field must start with 'Y','y','N','n','T','t','F','f','0', or '1' for <" + storedFieldName + "> and found <" + v
+ ">");
"String for Boolean field be 'Yes', 'No', 'Y', 'N', '1', '0', 'True', 'False', 'T', 'F' (case insensitive) for <" + storedFieldName
+ "> and found <" + v + ">");
}
}
else if (value instanceof Number) {
Expand Down

0 comments on commit 843b3d1

Please sign in to comment.