-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refine boolean analyzer to require values true, false, yes, no, 1, 0, t, f, y, n when indexing as string (case insensitive) and to only match one of those terms, in any case, when doing a query. Before, unknown values were mapped to false and any terms starting with t, f, y, n were matched.
- Loading branch information
Showing
2 changed files
with
31 additions
and
35 deletions.
There are no files selected for viewing
53 changes: 23 additions & 30 deletions
53
zulia-analyzer/src/main/java/io/zulia/server/analysis/analyzer/BooleanAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,46 +1,39 @@ | ||
package io.zulia.server.analysis.analyzer; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.CharArraySet; | ||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.core.KeywordTokenizer; | ||
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; | ||
import org.apache.lucene.analysis.pattern.PatternReplaceFilter; | ||
|
||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.regex.Pattern; | ||
|
||
/** | ||
* Created by Matt Davis on 2/10/16. | ||
* Based on org.apache.solr.schema.BoolField from Solr | ||
*/ | ||
// NOTE(review): this span is a rendered commit diff. Rows removed and rows added by the commit are | ||
// interleaved without +/- markers, so two versions of the class body appear together below. | ||
// Analyzer that reduces a boolean-like field value to the single token "T" or "F". | ||
public class BooleanAnalyzer extends Analyzer { | ||
|
||
// char[] form of the tokens; passed to termAtt.copyBuffer(...) in the anonymous Tokenizer below. | ||
protected final static char[] TRUE_TOKEN = { 'T' }; | ||
protected final static char[] FALSE_TOKEN = { 'F' }; | ||
// String form of the same tokens; passed to PatternReplaceFilter and collected into booleanTokens below. | ||
public static final String TRUE_TOKEN = "T"; | ||
public static final String FALSE_TOKEN = "F"; | ||
|
||
// Set holding only TRUE_TOKEN and FALSE_TOKEN, built with ignoreCase = false; handed to KeepWordFilter. | ||
private static final CharArraySet booleanTokens = CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList(TRUE_TOKEN, FALSE_TOKEN), false)); | ||
// Case-insensitive alternations of the accepted spellings for true and for false. | ||
public static final Pattern truePattern = Pattern.compile("true|t|yes|y|1", Pattern.CASE_INSENSITIVE); | ||
public static final Pattern falsePattern = Pattern.compile("false|f|no|n|0", Pattern.CASE_INSENSITIVE); | ||
|
||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
// Variant 1 (char[] tokens): hand-written tokenizer that emits at most one one-character token per stream. | ||
Tokenizer tokenizer = new Tokenizer() { | ||
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
boolean done = false; | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
done = false; | ||
} | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
if (done) | ||
return false; | ||
done = true; | ||
// Only the first character of the input is read; an empty input produces no token. | ||
int ch = input.read(); | ||
if (ch == -1) | ||
return false; | ||
// First char t/T/y/Y/1 -> TRUE_TOKEN; every other first char -> FALSE_TOKEN. | ||
termAtt.copyBuffer(((ch == 't' || ch == 'T' || ch == 'y' || ch == 'Y' || ch == '1') ? TRUE_TOKEN : FALSE_TOKEN), 0, 1); | ||
return true; | ||
} | ||
}; | ||
|
||
return new TokenStreamComponents(tokenizer); | ||
|
||
// Variant 2 (String tokens): KeywordTokenizer, then truePattern -> TRUE_TOKEN, falsePattern -> FALSE_TOKEN, | ||
// then KeepWordFilter(booleanTokens). Both replace filters are created with replaceAll = false. | ||
// NOTE(review): presumably a token that is not exactly "T" or "F" after the replacements is dropped by | ||
// KeepWordFilter - confirm against the Lucene documentation. | ||
final Tokenizer tokenizer = new KeywordTokenizer(); | ||
TokenFilter result = new PatternReplaceFilter(tokenizer, truePattern, BooleanAnalyzer.TRUE_TOKEN, false); | ||
result = new PatternReplaceFilter(result, falsePattern, FALSE_TOKEN, false); | ||
result = new KeepWordFilter(result, booleanTokens); | ||
|
||
return new TokenStreamComponents(tokenizer, result); | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters