From b12f65c1de48572efd68cb4e8b2416e16cb61366 Mon Sep 17 00:00:00 2001 From: Loay Ghreeb <52158423+LoayGhreeb@users.noreply.github.com> Date: Fri, 17 May 2024 23:45:20 +0300 Subject: [PATCH] Remove EnglishStemAnalyzer and use EnglishAnalyzer (#11301) --- .../jabref/logic/pdf/search/PdfIndexer.java | 4 +-- .../jabref/logic/pdf/search/PdfSearcher.java | 7 +++--- .../model/pdf/search/EnglishStemAnalyzer.java | 25 ------------------- .../jabref/model/pdf/search/SearchResult.java | 10 +++++--- 4 files changed, 13 insertions(+), 33 deletions(-) delete mode 100644 src/main/java/org/jabref/model/pdf/search/EnglishStemAnalyzer.java diff --git a/src/main/java/org/jabref/logic/pdf/search/PdfIndexer.java b/src/main/java/org/jabref/logic/pdf/search/PdfIndexer.java index be3d5e8faf5..45750ac5b84 100644 --- a/src/main/java/org/jabref/logic/pdf/search/PdfIndexer.java +++ b/src/main/java/org/jabref/logic/pdf/search/PdfIndexer.java @@ -16,11 +16,11 @@ import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; -import org.jabref.model.pdf.search.EnglishStemAnalyzer; import org.jabref.model.pdf.search.SearchFieldConstants; import org.jabref.preferences.FilePreferences; import com.google.common.annotations.VisibleForTesting; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexNotFoundException; @@ -130,7 +130,7 @@ private void initializeIndexWriterAndReader(IndexWriterConfig.OpenMode mode) { indexWriter = new IndexWriter( indexDirectory, new IndexWriterConfig( - new EnglishStemAnalyzer()).setOpenMode(mode)); + new EnglishAnalyzer()).setOpenMode(mode)); } catch (IOException e) { LOGGER.error("Could not initialize the IndexWriter", e); // FIXME: This can also happen if another instance of JabRef is launched in parallel. diff --git a/src/main/java/org/jabref/logic/pdf/search/PdfSearcher.java b/src/main/java/org/jabref/logic/pdf/search/PdfSearcher.java index 40acc97f8af..fb6afccc29b 100644 --- a/src/main/java/org/jabref/logic/pdf/search/PdfSearcher.java +++ b/src/main/java/org/jabref/logic/pdf/search/PdfSearcher.java @@ -7,11 +7,12 @@ import java.util.Optional; import org.jabref.gui.LibraryTab; -import org.jabref.model.pdf.search.EnglishStemAnalyzer; import org.jabref.model.pdf.search.PdfSearchResults; import org.jabref.model.pdf.search.SearchResult; import org.jabref.model.strings.StringUtil; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; @@ -31,7 +32,7 @@ public final class PdfSearcher { private static final Logger LOGGER = LoggerFactory.getLogger(LibraryTab.class); private final PdfIndexer indexer; - private EnglishStemAnalyzer englishStemAnalyzer = new EnglishStemAnalyzer(); + private final Analyzer englishAnalyzer = new EnglishAnalyzer(); private PdfSearcher(PdfIndexer indexer) { this.indexer = indexer; @@ -65,7 +66,7 @@ public PdfSearchResults search(final String searchString, final int maxHits) thr return new PdfSearchResults(); } try (IndexReader reader = DirectoryReader.open(optionalIndexWriter.get())) { - Query query = new MultiFieldQueryParser(PDF_FIELDS, englishStemAnalyzer).parse(searchString); + Query query = new MultiFieldQueryParser(PDF_FIELDS, englishAnalyzer).parse(searchString); IndexSearcher searcher = new IndexSearcher(reader); TopDocs results = searcher.search(query, maxHits); for (ScoreDoc scoreDoc : results.scoreDocs) { diff --git a/src/main/java/org/jabref/model/pdf/search/EnglishStemAnalyzer.java b/src/main/java/org/jabref/model/pdf/search/EnglishStemAnalyzer.java deleted file mode 100644 index 1dcccbb6583..00000000000 --- a/src/main/java/org/jabref/model/pdf/search/EnglishStemAnalyzer.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.jabref.model.pdf.search; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.DecimalDigitFilter; -import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; - -public class EnglishStemAnalyzer extends Analyzer { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer source = new StandardTokenizer(); - TokenStream filter = new LowerCaseFilter(source); - filter = new StopFilter(filter, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); - filter = new DecimalDigitFilter(filter); - filter = new PorterStemFilter(filter); - return new TokenStreamComponents(source, filter); - } -} - diff --git a/src/main/java/org/jabref/model/pdf/search/SearchResult.java b/src/main/java/org/jabref/model/pdf/search/SearchResult.java index 7a79191e94c..aeb70ff6779 100644 --- a/src/main/java/org/jabref/model/pdf/search/SearchResult.java +++ b/src/main/java/org/jabref/model/pdf/search/SearchResult.java @@ -7,7 +7,9 @@ import org.jabref.model.entry.BibEntry; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -46,14 +48,16 @@ public SearchResult(IndexSearcher searcher, Query query, ScoreDoc scoreDoc) thro Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""), new QueryScorer(query)); - try (TokenStream contentStream = new EnglishStemAnalyzer().tokenStream(CONTENT, content)) { + try (Analyzer analyzer = new EnglishAnalyzer(); + TokenStream contentStream = analyzer.tokenStream(CONTENT, content)) { TextFragment[] frags = highlighter.getBestTextFragments(contentStream, content, true, 10); this.contentResultStringsHtml = Arrays.stream(frags).map(TextFragment::toString).collect(Collectors.toList()); } catch (InvalidTokenOffsetsException e) { this.contentResultStringsHtml = List.of(); } - try (TokenStream annotationStream = new EnglishStemAnalyzer().tokenStream(ANNOTATIONS, annotations)) { + try (Analyzer analyzer = new EnglishAnalyzer(); + TokenStream annotationStream = analyzer.tokenStream(ANNOTATIONS, annotations)) { TextFragment[] frags = highlighter.getBestTextFragments(annotationStream, annotations, true, 10); this.annotationsResultStringsHtml = Arrays.stream(frags).map(TextFragment::toString).collect(Collectors.toList()); } catch (InvalidTokenOffsetsException e) { @@ -62,7 +66,7 @@ public SearchResult(IndexSearcher searcher, Query query, ScoreDoc scoreDoc) thro } private String getFieldContents(IndexSearcher searcher, ScoreDoc scoreDoc, String field) throws IOException { - IndexableField indexableField = searcher.doc(scoreDoc.doc).getField(field); + IndexableField indexableField = searcher.storedFields().document(scoreDoc.doc).getField(field); if (indexableField == null) { return ""; }