diff --git a/src/main/java/io/anserini/search/SimpleImpactSearcher.java b/src/main/java/io/anserini/search/SimpleImpactSearcher.java index 9cfe24edd9..2098363562 100644 --- a/src/main/java/io/anserini/search/SimpleImpactSearcher.java +++ b/src/main/java/io/anserini/search/SimpleImpactSearcher.java @@ -50,326 +50,313 @@ import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; /** * Class that exposes basic search functionality, designed specifically to provide the bridge between Java and Python - * via pyjnius. + * via pyjnius. Note that methods are named according to Python conventions (e.g., snake case instead of camel case). */ public class SimpleImpactSearcher implements Closeable { - public static final Sort BREAK_SCORE_TIES_BY_DOCID = - new Sort(SortField.FIELD_SCORE, new SortField(IndexArgs.ID, SortField.Type.STRING_VAL)); - private static final Logger LOG = LogManager.getLogger(SimpleImpactSearcher.class); - - protected IndexReader reader; - protected Similarity similarity; - protected BagOfWordsQueryGenerator generator; - protected RerankerCascade cascade; - protected IndexSearcher searcher = null; - - /** - * This class is meant to serve as the bridge between Anserini and Pyserini. - * Note that we are adopting Python naming conventions here on purpose. - */ - public class Result { - public String docid; - public int lucene_docid; - public float score; - public String contents; - public String raw; - public Document lucene_document; // Since this is for Python access, we're using Python naming conventions. - - public Result(String docid, int lucene_docid, float score, String contents, String raw, Document lucene_document) { - this.docid = docid; - this.lucene_docid = lucene_docid; - this.score = score; - this.contents = contents; - this.raw = raw; - this.lucene_document = lucene_document; - } - } - - protected SimpleImpactSearcher() { + private static final Sort BREAK_SCORE_TIES_BY_DOCID = + new Sort(SortField.FIELD_SCORE, new SortField(IndexArgs.ID, SortField.Type.STRING_VAL)); + private static final Logger LOG = LogManager.getLogger(SimpleImpactSearcher.class); + + protected IndexReader reader; + protected Similarity similarity; + protected BagOfWordsQueryGenerator generator; + protected RerankerCascade cascade; + protected IndexSearcher searcher = null; + protected boolean backwardsCompatibilityLucene8; + + /** + * This class is meant to serve as the bridge between Anserini and Pyserini. + * Note that we are adopting Python naming conventions here on purpose. + */ + public static class Result { + public String docid; + public int lucene_docid; + public float score; + public String contents; + public String raw; + public Document lucene_document; // Since this is for Python access, we're using Python naming conventions. + + public Result(String docid, int lucene_docid, float score, String contents, String raw, Document lucene_document) { + this.docid = docid; + this.lucene_docid = lucene_docid; + this.score = score; + this.contents = contents; + this.raw = raw; + this.lucene_document = lucene_document; } - - /** - * Creates a {@code SimpleImpactSearcher}. - * - * @param indexDir index directory - * @throws IOException if errors encountered during initialization - */ - public SimpleImpactSearcher(String indexDir) throws IOException { - Path indexPath = Paths.get(indexDir); - - if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { - throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); - } - - this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); - // Default to using ImpactSimilarity. - this.similarity = new ImpactSimilarity(); - this.generator = new BagOfWordsQueryGenerator(); - cascade = new RerankerCascade(); - cascade.add(new ScoreTiesAdjusterReranker()); + } + + protected SimpleImpactSearcher() { + } + + /** + * Creates a {@code SimpleImpactSearcher}. + * + * @param indexDir index directory + * @throws IOException if errors encountered during initialization + */ + public SimpleImpactSearcher(String indexDir) throws IOException { + Path indexPath = Paths.get(indexDir); + + if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { + throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); } - - - /** - * Returns the number of documents in the index. - * - * @return the number of documents in the index - */ - public int getTotalNumDocuments(){ - // Create an IndexSearch only once. Note that the object is thread safe. - if (searcher == null) { - searcher = new IndexSearcher(reader); - searcher.setSimilarity(similarity); - } - - return searcher.getIndexReader().maxDoc(); - } - - /** - * Closes this searcher. - */ - @Override - public void close() throws IOException { - try { - reader.close(); - } catch (Exception e) { - // Eat any exceptions. - return; - } + + this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); + + // Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952 + // If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues, + // which is the source of the incompatibility. + this.backwardsCompatibilityLucene8 = !reader.toString().contains("Lucene9"); + + // Default to using ImpactSimilarity. + this.similarity = new ImpactSimilarity(); + this.generator = new BagOfWordsQueryGenerator(); + cascade = new RerankerCascade(); + cascade.add(new ScoreTiesAdjusterReranker()); + } + + /** + * Returns the number of documents in the index. + * + * @return the number of documents in the index + */ + public int get_total_num_docs() { + // Create an IndexSearch only once. Note that the object is thread safe. + if (searcher == null) { + searcher = new IndexSearcher(reader); + searcher.setSimilarity(similarity); } - - /** - * Searches in batch - * - * @param queries list of queries - * @param qids list of unique query ids - * @param k number of hits - * @param threads number of threads - * @return a map of query id to search results - */ - public Map batchSearch(List> queries, List qids, int k, int threads) { - // Create the IndexSearcher here, if needed. We do it here because if we leave the creation to the search - // method, we might end up with a race condition as multiple threads try to concurrently create the IndexSearcher. - if (searcher == null) { - searcher = new IndexSearcher(reader); - searcher.setSimilarity(similarity); - } - - ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads); - ConcurrentHashMap results = new ConcurrentHashMap<>(); - - long startTime = System.nanoTime(); - AtomicLong index = new AtomicLong(); - int queryCnt = queries.size(); - for (int q = 0; q < queryCnt; ++q) { - Map query = queries.get(q); - String qid = qids.get(q); - executor.execute(() -> { - try { - results.put(qid, search(query, k)); - } catch (IOException e) { - throw new CompletionException(e); - } - // Logging to track query latency. - // Note that this is potentially noisy because it might interfere with tqdm on the Python side; logging - // every 500 queries seems like a reasonable comprise between offering helpful info and not being too noisy. - Long lineNumber = index.incrementAndGet(); - if (lineNumber % 500 == 0) { - double timePerQuery = (double) (System.nanoTime() - startTime) / (lineNumber + 1) / 1e9; - LOG.info(String.format("Retrieving query " + lineNumber + " (%.3f s/query)", timePerQuery)); - } - }); - } - - executor.shutdown(); - - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { - LOG.info(String.format("%.2f percent completed", - (double) executor.getCompletedTaskCount() / queries.size() * 100.0d)); - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); - } - - if (queryCnt != executor.getCompletedTaskCount()) { - throw new RuntimeException("queryCount = " + queryCnt + - " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); - } - - return results; + + return searcher.getIndexReader().maxDoc(); + } + + /** + * Closes this searcher. + */ + @Override + public void close() throws IOException { + try { + reader.close(); + } catch (Exception e) { + // Eat any exceptions. } - - /** - * Searches the collection, returning 10 hits by default. - * - * @param q query - * @return array of search results - * @throws IOException if error encountered during search - */ - public Result[] search(Map q) throws IOException { - return search(q, 10); + } + + /** + * Searches in batch using multiple threads. + * + * @param queries list of queries + * @param qids list of unique query ids + * @param k number of hits + * @param threads number of threads + * @return a map of query id to search results + */ + public Map batch_search(List> queries, + List qids, + int k, + int threads) { + // Create the IndexSearcher here, if needed. We do it here because if we leave the creation to the search + // method, we might end up with a race condition as multiple threads try to concurrently create the IndexSearcher. + if (searcher == null) { + searcher = new IndexSearcher(reader); + searcher.setSimilarity(similarity); } - - /** - * Searches the collection. - * - * @param q query - * @param k number of hits - * @return array of search results - * @throws IOException if error encountered during search - */ - public Result[] search(Map q, int k) throws IOException { - Query query = generator.buildQuery(IndexArgs.CONTENTS, q); - - return _search(query, k); + + ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads); + ConcurrentHashMap results = new ConcurrentHashMap<>(); + + int queryCnt = queries.size(); + for (int q = 0; q < queryCnt; ++q) { + Map query = queries.get(q); + String qid = qids.get(q); + executor.execute(() -> { + try { + results.put(qid, search(query, k)); + } catch (IOException e) { + throw new CompletionException(e); + } + }); } - - // internal implementation - protected Result[] _search(Query query, int k) throws IOException { - // Create an IndexSearch only once. Note that the object is thread safe. - if (searcher == null) { - searcher = new IndexSearcher(reader); - searcher.setSimilarity(similarity); - } - - SearchArgs searchArgs = new SearchArgs(); - searchArgs.arbitraryScoreTieBreak = false; - searchArgs.hits = k; - - TopDocs rs; - RerankerContext context; - rs = searcher.search(query, k, BREAK_SCORE_TIES_BY_DOCID, true); - context = new RerankerContext<>(searcher, null, query, null, - null, null, null, searchArgs); - - ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context); - - Result[] results = new Result[hits.ids.length]; - for (int i = 0; i < hits.ids.length; i++) { - Document doc = hits.documents[i]; - String docid = doc.getField(IndexArgs.ID).stringValue(); - - IndexableField field; - field = doc.getField(IndexArgs.CONTENTS); - String contents = field == null ? null : field.stringValue(); - - field = doc.getField(IndexArgs.RAW); - String raw = field == null ? null : field.stringValue(); - - results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc); + + executor.shutdown(); + + try { + // Wait for existing tasks to terminate + while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { + LOG.info(String.format("%.2f percent completed", + (double) executor.getCompletedTaskCount() / queries.size() * 100.0d)); } - - return results; + } catch (InterruptedException ie) { + // (Re-)Cancel if current thread also interrupted + executor.shutdownNow(); + // Preserve interrupt status + Thread.currentThread().interrupt(); } - - /** - * Fetches the Lucene {@link Document} based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param ldocid internal Lucene docid - * @return corresponding Lucene {@link Document} - */ - public Document document(int ldocid) { - try { - return reader.document(ldocid); - } catch (Exception e) { - // Eat any exceptions and just return null. - return null; - } + + if (queryCnt != executor.getCompletedTaskCount()) { + throw new RuntimeException("queryCount = " + queryCnt + + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } - - /** - * Returns the Lucene {@link Document} based on a collection docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param docid collection docid - * @return corresponding Lucene {@link Document} - */ - public Document document(String docid) { - return IndexReaderUtils.document(reader, docid); + + return results; + } + + /** + * Searches the collection, returning 10 hits by default. + * + * @param q query + * @return array of search results + * @throws IOException if error encountered during search + */ + public Result[] search(Map q) throws IOException { + return search(q, 10); + } + + /** + * Searches the collection. + * + * @param q query + * @param k number of hits + * @return array of search results + * @throws IOException if error encountered during search + */ + public Result[] search(Map q, int k) throws IOException { + Query query = generator.buildQuery(IndexArgs.CONTENTS, q); + + return _search(query, k); + } + + // internal implementation + protected Result[] _search(Query query, int k) throws IOException { + // Create an IndexSearch only once. Note that the object is thread safe. + if (searcher == null) { + searcher = new IndexSearcher(reader); + searcher.setSimilarity(similarity); } - - /** - * Fetches the Lucene {@link Document} based on some field other than its unique collection docid. - * For example, scientific articles might have DOIs. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param field field - * @param id unique id - * @return corresponding Lucene {@link Document} based on the value of a specific field - */ - public Document documentByField(String field, String id) { - return IndexReaderUtils.documentByField(reader, field, id); + + SearchArgs searchArgs = new SearchArgs(); + searchArgs.arbitraryScoreTieBreak = this.backwardsCompatibilityLucene8; + searchArgs.hits = k; + + TopDocs rs; + RerankerContext context; + if (this.backwardsCompatibilityLucene8) { + rs = searcher.search(query, k); + } else { + rs = searcher.search(query, k, BREAK_SCORE_TIES_BY_DOCID, true); } - - /** - * Returns the "contents" field of a document based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param ldocid internal Lucene docid - * @return the "contents" field the document - */ - public String documentContents(int ldocid) { - try { - return reader.document(ldocid).get(IndexArgs.CONTENTS); - } catch (Exception e) { - // Eat any exceptions and just return null. - return null; - } + context = new RerankerContext<>(searcher, null, query, null, + null, null, null, searchArgs); + + ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context); + + Result[] results = new Result[hits.ids.length]; + for (int i = 0; i < hits.ids.length; i++) { + Document doc = hits.documents[i]; + String docid = doc.getField(IndexArgs.ID).stringValue(); + + IndexableField field; + field = doc.getField(IndexArgs.CONTENTS); + String contents = field == null ? null : field.stringValue(); + + field = doc.getField(IndexArgs.RAW); + String raw = field == null ? null : field.stringValue(); + + results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc); } - - /** - * Returns the "contents" field of a document based on a collection docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param docid collection docid - * @return the "contents" field the document - */ - public String documentContents(String docid) { - return IndexReaderUtils.documentContents(reader, docid); + + return results; + } + + /** + * Fetches the Lucene {@link Document} based on an internal Lucene docid. + * + * @param lucene_docid internal Lucene docid + * @return corresponding Lucene {@link Document} + */ + public Document doc(int lucene_docid) { + try { + return reader.document(lucene_docid); + } catch (Exception e) { + // Eat any exceptions and just return null. + return null; } - - /** - * Returns the "raw" field of a document based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param ldocid internal Lucene docid - * @return the "raw" field the document - */ - public String documentRaw(int ldocid) { - try { - return reader.document(ldocid).get(IndexArgs.RAW); - } catch (Exception e) { - // Eat any exceptions and just return null. - return null; - } + } + + /** + * Returns the Lucene {@link Document} based on a collection docid. + * + * @param docid collection docid + * @return corresponding Lucene {@link Document} + */ + public Document doc(String docid) { + return IndexReaderUtils.document(reader, docid); + } + + /** + * Fetches the Lucene {@link Document} based on some field other than its unique collection docid. + * For example, scientific articles might have DOIs. + * + * @param field field + * @param id unique id + * @return corresponding Lucene {@link Document} based on the value of a specific field + */ + public Document doc_by_field(String field, String id) { + return IndexReaderUtils.documentByField(reader, field, id); + } + + /** + * Returns the "contents" field of a document based on an internal Lucene docid. + * + * @param lucene_docid internal Lucene docid + * @return the "contents" field the document + */ + public String doc_contents(int lucene_docid) { + try { + return reader.document(lucene_docid).get(IndexArgs.CONTENTS); + } catch (Exception e) { + // Eat any exceptions and just return null. + return null; } - - /** - * Returns the "raw" field of a document based on a collection docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. - * - * @param docid collection docid - * @return the "raw" field the document - */ - public String documentRaw(String docid) { - return IndexReaderUtils.documentRaw(reader, docid); + } + + /** + * Returns the "contents" field of a document based on a collection docid. + * + * @param docid collection docid + * @return the "contents" field the document + */ + public String doc_contents(String docid) { + return IndexReaderUtils.documentContents(reader, docid); + } + + /** + * Returns the "raw" field of a document based on an internal Lucene docid. + * + * @param lucene_docid internal Lucene docid + * @return the "raw" field the document + */ + public String doc_raw(int lucene_docid) { + try { + return reader.document(lucene_docid).get(IndexArgs.RAW); + } catch (Exception e) { + // Eat any exceptions and just return null. + return null; } } + + /** + * Returns the "raw" field of a document based on a collection docid. + * + * @param docid collection docid + * @return the "raw" field the document + */ + public String doc_raw(String docid) { + return IndexReaderUtils.documentRaw(reader, docid); + } +} \ No newline at end of file diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java index f779533c50..1d7d9e65f0 100644 --- a/src/main/java/io/anserini/search/SimpleSearcher.java +++ b/src/main/java/io/anserini/search/SimpleSearcher.java @@ -28,14 +28,13 @@ import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; import io.anserini.search.query.BagOfWordsQueryGenerator; import io.anserini.search.query.QueryGenerator; -import io.anserini.search.topicreader.TopicReader; -import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.bn.BengaliAnalyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.da.DanishAnalyzer; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.es.SpanishAnalyzer; @@ -56,8 +55,6 @@ import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; - import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -71,119 +68,38 @@ import org.apache.lucene.search.similarities.LMDirichletSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.FSDirectory; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; import java.io.Closeable; import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.Locale; import java.util.Map; -import java.util.SortedMap; import java.util.concurrent.CompletionException; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; /** * Class that exposes basic search functionality, designed specifically to provide the bridge between Java and Python - * via pyjnius. + * via pyjnius. Note that methods are named according to Python conventions (e.g., snake case instead of camel case). */ public class SimpleSearcher implements Closeable { - public static final Sort BREAK_SCORE_TIES_BY_DOCID = + private static final Sort BREAK_SCORE_TIES_BY_DOCID = new Sort(SortField.FIELD_SCORE, new SortField(IndexArgs.ID, SortField.Type.STRING_VAL)); private static final Logger LOG = LogManager.getLogger(SimpleSearcher.class); - public static final class Args { - @Option(name = "-index", metaVar = "[path]", required = true, usage = "Path to Lucene index.") - public String index; - - @Option(name = "-topics", metaVar = "[file]", required = true, usage = "Topics file.") - public String topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "Output run file.") - public String output; - - @Option(name = "-bm25", usage = "Flag to use BM25.", forbids = {"-ql"}) - public Boolean useBM25 = true; - - @Option(name = "-bm25.k1", usage = "BM25 k1 value.", forbids = {"-ql"}) - public float bm25_k1 = 0.9f; - - @Option(name = "-bm25.b", usage = "BM25 b value.", forbids = {"-ql"}) - public float bm25_b = 0.4f; - - @Option(name = "-qld", usage = "Flag to use query-likelihood with Dirichlet smoothing.", forbids={"-bm25"}) - public Boolean useQL = false; - - @Option(name = "-qld.mu", usage = "Dirichlet smoothing parameter value for query-likelihood.", forbids={"-bm25"}) - public float ql_mu = 1000.0f; - - @Option(name = "-rm3", usage = "Flag to use RM3.") - public Boolean useRM3 = false; - - @Option(name = "-rm3.fbTerms", usage = "RM3 parameter: number of expansion terms") - public int rm3_fbTerms = 10; - - @Option(name = "-rm3.fbDocs", usage = "RM3 parameter: number of documents") - public int rm3_fbDocs = 10; - - @Option(name = "-rm3.originalQueryWeight", usage = "RM3 parameter: weight to assign to the original query") - public float rm3_originalQueryWeight = 0.5f; - - @Option(name = "-rocchio", usage = "Flag to use Rocchio.") - public Boolean useRocchio = false; - - @Option(name = "-rocchio.topFbTerms", usage = "Rocchio parameter: number of relevant expansion terms") - public int rocchio_topFbTerms = 10; - - @Option(name = "-rocchio.topFbDocs", usage = "Rocchio parameter: number of relevant documents") - public int rocchio_topFbDocs = 10; - - @Option(name = "-rocchio.bottomFbTerms", usage = "Rocchio parameter: number of nonrelevant expansion terms") - public int rocchio_bottomFbTerms = 10; - - @Option(name = "-rocchio.bottomFbDocs", usage = "Rocchio parameter: number of nonrelevant documents") - public int rocchio_bottomFbDocs = 10; - - @Option(name = "-rocchio.alpha", usage = "Rocchio parameter: weight to assign to the original query") - public float rocchio_alpha = 1.0f; - - @Option(name = "-rocchio.beta", usage = "Rocchio parameter: weight to assign to the relevant document vectors") - public float rocchio_beta = 0.75f; - - @Option(name = "-rocchio.gamma", usage = "Rocchio parameter: weight to assign to the nonrelevant document vectors") - public float rocchio_gamma = 0.0f; - - @Option(name = "-hits", metaVar = "[number]", usage = "Max number of hits to return.") - public int hits = 1000; - - @Option(name = "-threads", metaVar = "[number]", usage = "Number of threads to use.") - public int threads = 1; - - @Option(name = "-language", usage = "Analyzer Language") - public String language = "en"; - } - protected IndexReader reader; protected Similarity similarity; protected Analyzer analyzer; protected RerankerCascade cascade; + protected QueryGenerator generator = new BagOfWordsQueryGenerator(); protected boolean useRM3; protected boolean useRocchio; + protected boolean backwardsCompatibilityLucene8; protected IndexSearcher searcher = null; @@ -191,7 +107,7 @@ public static final class Args { * This class is meant to serve as the bridge between Anserini and Pyserini. * Note that we are adopting Python naming conventions here on purpose. */ - public class Result { + public static class Result { public String docid; public int lucene_docid; public float score; @@ -230,15 +146,20 @@ public SimpleSearcher(String indexDir) throws IOException { * @throws IOException if errors encountered during initialization */ public SimpleSearcher(String indexDir, Analyzer analyzer) throws IOException { + SearchArgs defaults = new SearchArgs(); Path indexPath = Paths.get(indexDir); if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); } - SearchArgs defaults = new SearchArgs(); - this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); + + // Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952 + // If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues, + // which is the source of the incompatibility. + this.backwardsCompatibilityLucene8 = !reader.toString().contains("Lucene9"); + // Default to using BM25. this.similarity = new BM25Similarity(Float.parseFloat(defaults.bm25_k1[0]), Float.parseFloat(defaults.bm25_b[0])); this.analyzer = analyzer; @@ -253,7 +174,7 @@ public SimpleSearcher(String indexDir, Analyzer analyzer) throws IOException { * * @param analyzer analyzer to use */ - public void setAnalyzer(Analyzer analyzer) { + public void set_analyzer(Analyzer analyzer) { this.analyzer = analyzer; } @@ -262,7 +183,7 @@ public void setAnalyzer(Analyzer analyzer) { * * @return analyzed used */ - public Analyzer getAnalyzer(){ + public Analyzer get_analyzer(){ return this.analyzer; } @@ -271,7 +192,7 @@ public Analyzer getAnalyzer(){ * * @param language language */ - public void setLanguage(String language) { + public void set_language(String language) { if (language.equals("ar")) { this.analyzer = new ArabicAnalyzer(); } else if (language.equals("bn")) { @@ -325,22 +246,18 @@ public void setLanguage(String language) { } /** - * Returns whether or not RM3 query expansion is being performed. + * Determines if RM3 query expansion is enabled. * - * @return whether or not RM3 query expansion is being performed + * @return true if RM query expansion is enabled; false otherwise. */ - public boolean useRM3() { + public boolean use_rm3() { return useRM3; } - public boolean useRocchio() { - return useRocchio; - } - /** * Disables RM3 query expansion. */ - public void unsetRM3() { + public void unset_rm3() { this.useRM3 = false; cascade = new RerankerCascade(); cascade.add(new ScoreTiesAdjusterReranker()); @@ -349,25 +266,25 @@ public void unsetRM3() { /** * Enables RM3 query expansion with default parameters. */ - public void setRM3() { + public void set_rm3() { SearchArgs defaults = new SearchArgs(); - setRM3(Integer.parseInt(defaults.rm3_fbTerms[0]), Integer.parseInt(defaults.rm3_fbDocs[0]), + set_rm3(Integer.parseInt(defaults.rm3_fbTerms[0]), Integer.parseInt(defaults.rm3_fbDocs[0]), Float.parseFloat(defaults.rm3_originalQueryWeight[0])); } /** - * Enables RM3 query expansion with default parameters. + * Enables RM3 query expansion with specified parameters. * * @param fbTerms number of expansion terms * @param fbDocs number of expansion documents * @param originalQueryWeight weight to assign to the original query */ - public void setRM3(int fbTerms, int fbDocs, float originalQueryWeight) { - setRM3(fbTerms, fbDocs, originalQueryWeight, false, true); + public void set_rm3(int fbTerms, int fbDocs, float originalQueryWeight) { + set_rm3(fbTerms, fbDocs, originalQueryWeight, false, true); } /** - * Enables RM3 query expansion with default parameters. + * Enables RM3 query expansion with specified parameters. * * @param fbTerms number of expansion terms * @param fbDocs number of expansion documents @@ -375,7 +292,7 @@ public void setRM3(int fbTerms, int fbDocs, float originalQueryWeight) { * @param outputQuery flag to print original and expanded queries * @param filterTerms whether to filter terms to be English only */ - public void setRM3(int fbTerms, int fbDocs, float originalQueryWeight, boolean outputQuery, boolean filterTerms) { + public void set_rm3(int fbTerms, int fbDocs, float originalQueryWeight, boolean outputQuery, boolean filterTerms) { useRM3 = true; cascade = new RerankerCascade("rm3"); cascade.add(new Rm3Reranker(this.analyzer, IndexArgs.CONTENTS, @@ -383,10 +300,19 @@ public void setRM3(int fbTerms, int fbDocs, float originalQueryWeight, boolean o cascade.add(new ScoreTiesAdjusterReranker()); } + /** + * Determines if Rocchio query expansion is enabled. + * + * @return true if Rocchio query expansion is enabled; false otherwise. + */ + public boolean use_rocchio() { + return useRocchio; + } + /** * Disables Rocchio query expansion. */ - public void unsetRocchio() { + public void unset_rocchio() { this.useRocchio = false; cascade = new RerankerCascade(); cascade.add(new ScoreTiesAdjusterReranker()); @@ -395,15 +321,16 @@ public void unsetRocchio() { /** * Enables Rocchio query expansion with default parameters. */ - public void setRocchio() { + public void set_rocchio() { SearchArgs defaults = new SearchArgs(); - setRocchio(Integer.parseInt(defaults.rocchio_topFbTerms[0]), Integer.parseInt(defaults.rocchio_topFbDocs[0]), + set_rocchio(Integer.parseInt(defaults.rocchio_topFbTerms[0]), Integer.parseInt(defaults.rocchio_topFbDocs[0]), Integer.parseInt(defaults.rocchio_bottomFbTerms[0]), Integer.parseInt(defaults.rocchio_bottomFbDocs[0]), - Float.parseFloat(defaults.rocchio_alpha[0]), Float.parseFloat(defaults.rocchio_beta[0]), Float.parseFloat(defaults.rocchio_gamma[0]), false, false); + Float.parseFloat(defaults.rocchio_alpha[0]), Float.parseFloat(defaults.rocchio_beta[0]), + Float.parseFloat(defaults.rocchio_gamma[0]), false, false); } /** - * Enables Rocchio query expansion with default parameters. + * Enables Rocchio query expansion with specified parameters. * * @param topFbTerms number of relevant expansion terms * @param topFbDocs number of relevant expansion documents @@ -414,7 +341,7 @@ public void setRocchio() { * @param gamma weight to assign to the nonrelevant document vectors * @param outputQuery flag to print original and expanded queries */ - public void setRocchio(int topFbTerms, int topFbDocs, int bottomFbTerms, int bottomFbDocs, float alpha, float beta, float gamma, boolean outputQuery, boolean useNegative) { + public void set_rocchio(int topFbTerms, int topFbDocs, int bottomFbTerms, int bottomFbDocs, float alpha, float beta, float gamma, boolean outputQuery, boolean useNegative) { useRocchio = true; cascade = new RerankerCascade("rocchio"); cascade.add(new RocchioReranker(this.analyzer, IndexArgs.CONTENTS, @@ -427,7 +354,7 @@ public void setRocchio(int topFbTerms, int topFbDocs, int bottomFbTerms, int bot * * @param mu mu smoothing parameter */ - public void setQLD(float mu) { + public void set_qld(float mu) { this.similarity = new LMDirichletSimilarity(mu); // We need to re-initialize the searcher @@ -441,7 +368,7 @@ public void setQLD(float mu) { * @param k1 k1 parameter * @param b b parameter */ - public void setBM25(float k1, float b) { + public void set_bm25(float k1, float b) { this.similarity = new BM25Similarity(k1, b); // We need to re-initialize the searcher @@ -454,7 +381,7 @@ public void setBM25(float k1, float b) { * * @return the {@link Similarity} currently being used */ - public Similarity getSimilarity() { + public Similarity get_similarity() { return similarity; } @@ -463,7 +390,7 @@ public Similarity getSimilarity() { * * @return the number of documents in the index */ - public int getTotalNumDocuments(){ + public int get_total_num_docs(){ // Create an IndexSearch only once. Note that the object is thread safe. if (searcher == null) { searcher = new IndexSearcher(reader); @@ -482,12 +409,11 @@ public void close() throws IOException { reader.close(); } catch (Exception e) { // Eat any exceptions. - return; } } /** - * Searches the collection using multiple threads. + * Searches the collection in batch using multiple threads. * * @param queries list of queries * @param qids list of unique query ids @@ -495,13 +421,15 @@ public void close() throws IOException { * @param threads number of threads * @return a map of query id to search results */ - public Map batchSearch(List queries, List qids, int k, int threads) { - QueryGenerator generator = new BagOfWordsQueryGenerator(); - return batchSearchFields(generator, queries, qids, k, threads, new HashMap<>()); + public Map batch_search(List queries, + List qids, + int k, + int threads) { + return batch_search_fields(this.generator, queries, qids, k, threads, new HashMap<>()); } /** - * Searches the collection using multiple threads. + * Searches the collection in batch using multiple threads. * * @param generator the method for generating queries * @param queries list of queries @@ -510,13 +438,16 @@ public Map batchSearch(List queries, List qids * @param threads number of threads * @return a map of query id to search results */ - public Map batchSearch(QueryGenerator generator, List queries, List qids, int k, int threads) { - return batchSearchFields(generator, queries, qids, k, threads, new HashMap<>()); + public Map batch_search(QueryGenerator generator, + List queries, + List qids, + int k, + int threads) { + return batch_search_fields(generator, queries, qids, k, threads, new HashMap<>()); } /** - * Searches the provided fields weighted by their boosts, using multiple threads. - * Batch version of {@link #searchFields(String, Map, int)}. + * Searches the provided fields weighted by their boosts, in batch using multiple threads. * * @param queries list of queries * @param qids list of unique query ids @@ -525,15 +456,16 @@ public Map batchSearch(QueryGenerator generator, List * @param fields map of fields to search with weights * @return a map of query id to search results */ - public Map batchSearchFields(List queries, List qids, int k, int threads, - Map fields) { - QueryGenerator generator = new BagOfWordsQueryGenerator(); - return batchSearchFields(generator, queries, qids, k, threads, fields); + public Map batch_search_fields(List queries, + List qids, + int k, + int threads, + Map fields) { + return batch_search_fields(this.generator, queries, qids, k, threads, fields); } /** - * Searches the provided fields weighted by their boosts, using multiple threads. - * Batch version of {@link #searchFields(String, Map, int)}. + * Searches the provided fields weighted by their boosts, in batch using multiple threads. * * @param generator the method for generating queries * @param queries list of queries @@ -543,8 +475,12 @@ public Map batchSearchFields(List queries, List batchSearchFields(QueryGenerator generator, List queries, List qids, int k, int threads, - Map fields) { + public Map batch_search_fields(QueryGenerator generator, + List queries, + List qids, + int k, + int threads, + Map fields) { // Create the IndexSearcher here, if needed. We do it here because if we leave the creation to the search // method, we might end up with a race condition as multiple threads try to concurrently create the IndexSearcher. if (searcher == null) { @@ -555,8 +491,6 @@ public Map batchSearchFields(QueryGenerator generator, List results = new ConcurrentHashMap<>(); - long startTime = System.nanoTime(); - AtomicLong index = new AtomicLong(); int queryCnt = queries.size(); for (int q = 0; q < queryCnt; ++q) { String query = queries.get(q); @@ -564,21 +498,13 @@ public Map batchSearchFields(QueryGenerator generator, List { try { if (fields.size() > 0) { - results.put(qid, searchFields(generator, query, fields, k)); + results.put(qid, search_fields(generator, query, fields, k)); } else { results.put(qid, search(generator, query, k)); } } catch (IOException e) { throw new CompletionException(e); } - // Logging to track query latency. - // Note that this is potentially noisy because it might interfere with tqdm on the Python side; logging - // every 500 queries seems like a reasonable comprise between offering helpful info and not being too noisy. - Long lineNumber = index.incrementAndGet(); - if (lineNumber % 500 == 0) { - double timePerQuery = (double) (System.nanoTime() - startTime) / (lineNumber + 1) / 1e9; - LOG.info(String.format("Retrieving query " + lineNumber + " (%.3f s/query)", timePerQuery)); - } }); } @@ -628,7 +554,7 @@ public Result[] search(String q, int k) throws IOException { Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q); List queryTokens = AnalyzerUtils.analyze(analyzer, q); - return search(query, queryTokens, q, k); + return _search(query, queryTokens, q, k); } /** @@ -640,7 +566,7 @@ public Result[] search(String q, int k) throws IOException { * @throws IOException if error encountered during search */ public Result[] search(Query query, int k) throws IOException { - return search(query, null, null, k); + return _search(query, null, null, k); } /** @@ -655,11 +581,11 @@ public Result[] search(Query query, int k) throws IOException { public Result[] search(QueryGenerator generator, String q, int k) throws IOException { Query query = generator.buildQuery(IndexArgs.CONTENTS, analyzer, q); - return search(query, null, null, k); + return _search(query, null, null, k); } // internal implementation - protected Result[] search(Query query, List queryTokens, String queryString, int k) throws IOException { + protected Result[] _search(Query query, List queryTokens, String queryString, int k) throws IOException { // Create an IndexSearch only once. Note that the object is thread safe. if (searcher == null) { searcher = new IndexSearcher(reader); @@ -667,12 +593,16 @@ protected Result[] search(Query query, List queryTokens, String queryStr } SearchArgs searchArgs = new SearchArgs(); - searchArgs.arbitraryScoreTieBreak = false; + searchArgs.arbitraryScoreTieBreak = this.backwardsCompatibilityLucene8; searchArgs.hits = k; TopDocs rs; RerankerContext context; - rs = searcher.search(query, useRM3 ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true); + if (this.backwardsCompatibilityLucene8) { + rs = searcher.search(query, useRM3 ? searchArgs.rerankcutoff : k); + } else { + rs = searcher.search(query, useRM3 ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true); + } context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs); @@ -705,10 +635,12 @@ protected Result[] search(Query query, List queryTokens, String queryStr * @return array of search results * @throws IOException if error encountered during search */ - public Result[] searchFields(String q, Map fields, int k) throws IOException { + public Result[] search_fields(String q, + Map fields, + int k) throws IOException { // Note that this is used for MS MARCO experiments with document expansion. QueryGenerator queryGenerator = new BagOfWordsQueryGenerator(); - return searchFields(queryGenerator, q, fields, k); + return search_fields(queryGenerator, q, fields, k); } /** @@ -721,27 +653,28 @@ public Result[] searchFields(String q, Map fields, int k) throws * @return array of search results * @throws IOException if error encountered during search */ - public Result[] searchFields(QueryGenerator generator, String q, Map fields, int k) throws IOException { + public Result[] search_fields(QueryGenerator generator, + String q, + Map fields, + int k) throws IOException { IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(similarity); Query query = generator.buildQuery(fields, analyzer, q); List queryTokens = AnalyzerUtils.analyze(analyzer, q); - return search(query, queryTokens, q, k); + return _search(query, queryTokens, q, k); } /** * Fetches the Lucene {@link Document} based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * - * @param ldocid internal Lucene docid + * @param lucene_docid internal Lucene docid * @return corresponding Lucene {@link Document} */ - public Document document(int ldocid) { + public Document doc(int lucene_docid) { try { - return reader.document(ldocid); + return reader.document(lucene_docid); } catch (Exception e) { // Eat any exceptions and just return null. return null; @@ -750,33 +683,32 @@ public Document document(int ldocid) { /** * Returns the Lucene {@link Document} based on a collection docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * * @param docid collection docid * @return corresponding Lucene {@link Document} */ - public Document document(String docid) { + public Document doc(String docid) { return IndexReaderUtils.document(reader, docid); } /** * Returns a map of collection docid to Lucene {@link Document}. - * Batch version of {@link #document(String)}. + * Batch version of {@link #doc(String)}. * * @param docids list of docids * @return a map of docid to corresponding Lucene {@link Document} */ - public Map batchGetDocument(List docids, int threads) { + public Map batch_get_docs(List docids, int threads) { ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads); ConcurrentHashMap results = new ConcurrentHashMap<>(); for (String docid: docids) { executor.execute(() -> { try { - Document result = IndexReaderUtils.document(reader, docid); - results.put(docid, result); - } catch (Exception e){} + results.put(docid, IndexReaderUtils.document(reader, docid)); + } catch (Exception e) { + // Do nothing, just eat the exception. + } }); } @@ -801,28 +733,24 @@ public Map batchGetDocument(List docids, int threads) /** * Fetches the Lucene {@link Document} based on some field other than its unique collection docid. * For example, scientific articles might have DOIs. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * * @param field field * @param id unique id * @return corresponding Lucene {@link Document} based on the value of a specific field */ - public Document documentByField(String field, String id) { + public Document doc_by_field(String field, String id) { return IndexReaderUtils.documentByField(reader, field, id); } /** * Returns the "contents" field of a document based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * - * @param ldocid internal Lucene docid + * @param lucene_docid internal Lucene docid * @return the "contents" field the document */ - public String documentContents(int ldocid) { + public String doc_contents(int lucene_docid) { try { - return reader.document(ldocid).get(IndexArgs.CONTENTS); + return reader.document(lucene_docid).get(IndexArgs.CONTENTS); } catch (Exception e) { // Eat any exceptions and just return null. return null; @@ -831,27 +759,23 @@ public String documentContents(int ldocid) { /** * Returns the "contents" field of a document based on a collection docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * * @param docid collection docid * @return the "contents" field the document */ - public String documentContents(String docid) { + public String doc_contents(String docid) { return IndexReaderUtils.documentContents(reader, docid); } /** * Returns the "raw" field of a document based on an internal Lucene docid. - * The method is named to be consistent with Lucene's {@link IndexReader#document(int)}, contra Java's standard - * method naming conventions. * - * @param ldocid internal Lucene docid + * @param lucene_docid internal Lucene docid * @return the "raw" field the document */ - public String documentRaw(int ldocid) { + public String doc_raw(int lucene_docid) { try { - return reader.document(ldocid).get(IndexArgs.RAW); + return reader.document(lucene_docid).get(IndexArgs.RAW); } catch (Exception e) { // Eat any exceptions and just return null. return null; @@ -866,103 +790,7 @@ public String documentRaw(int ldocid) { * @param docid collection docid * @return the "raw" field the document */ - public String documentRaw(String docid) { + public String doc_raw(String docid) { return IndexReaderUtils.documentRaw(reader, docid); } - - // Note that this class is primarily meant to be used by automated regression scripts, not humans! - // tl;dr - Do not use this class for running experiments. Use SearchCollection instead! - // - // SimpleSearcher is the main class that exposes search functionality for Pyserini (in Python). - // As such, it has a different code path than SearchCollection, the preferred entry point for running experiments - // from Java. The main method here exposes only barebone options, primarily designed to verify that results from - // SimpleSearcher are *exactly* the same as SearchCollection (e.g., via automated regression scripts). - public static void main(String[] args) throws Exception { - Args searchArgs = new Args(); - CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(100)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: SimpleSearcher" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - final long start = System.nanoTime(); - SimpleSearcher searcher = new SimpleSearcher(searchArgs.index); - searcher.setLanguage(searchArgs.language); - SortedMap> topics = TopicReader.getTopicsByFile(searchArgs.topics); - - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(searchArgs.output), StandardCharsets.US_ASCII)); - List argsAsList = Arrays.asList(args); - - // Test a separate code path, where we specify BM25 explicitly, which is different from not specifying it at all. - if (argsAsList.contains("-bm25")) { - LOG.info("Testing code path of explicitly setting BM25."); - searcher.setBM25(searchArgs.bm25_k1, searchArgs.bm25_b); - } else if (searchArgs.useQL){ - LOG.info("Testing code path of explicitly setting QL."); - searcher.setQLD(searchArgs.ql_mu); - } - - if (searchArgs.useRM3) { - if (argsAsList.contains("-rm3.fbTerms") || argsAsList.contains("-rm3.fbTerms") || - argsAsList.contains("-rm3.originalQueryWeight")) { - LOG.info("Testing code path of explicitly setting RM3 parameters."); - searcher.setRM3(searchArgs.rm3_fbTerms, searchArgs.rm3_fbDocs, searchArgs.rm3_originalQueryWeight); - } else { - LOG.info("Testing code path of default RM3 parameters."); - searcher.setRM3(); - } - } else if (searchArgs.useRocchio) { - if (argsAsList.contains("-rocchio.topFbTerms") || argsAsList.contains("-rocchio.topFbDocs") || - argsAsList.contains("-rocchio.bottomFbTerms") || argsAsList.contains("-rocchio.bottomFbDocs") || - argsAsList.contains("-rocchio.alpha") || argsAsList.contains("-rocchio.beta") || argsAsList.contains("-rocchio.gamma")) { - LOG.info("Testing code path of explicitly setting Rocchio parameters."); - searcher.setRocchio(searchArgs.rocchio_topFbTerms, searchArgs.rocchio_topFbDocs, searchArgs.rocchio_bottomFbTerms, searchArgs.rocchio_bottomFbDocs, - searchArgs.rocchio_alpha, searchArgs.rocchio_beta, searchArgs.rocchio_gamma, false, false); - } else { - LOG.info("Testing code path of default Rocchio parameters."); - searcher.setRocchio(); - } - } - - if (searchArgs.threads == 1) { - for (Object id : topics.keySet()) { - Result[] results = searcher.search(topics.get(id).get("title"), searchArgs.hits); - - for (int i = 0; i < results.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f Anserini", - id, results[i].docid, (i + 1), results[i].score)); - } - } - } else { - List qids = new ArrayList<>(); - List queries = new ArrayList<>(); - - for (Object id : topics.keySet()) { - qids.add(id.toString()); - queries.add(topics.get(id).get("title")); - } - - Map allResults = searcher.batchSearch(queries, qids, searchArgs.hits, searchArgs.threads); - - // We iterate through, in natural object order. - for (Object id : topics.keySet()) { - Result[] results = allResults.get(id.toString()); - - for (int i = 0; i < results.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f Anserini", - id, results[i].docid, (i + 1), results[i].score)); - } - } - } - - out.close(); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } } diff --git a/src/main/java/io/anserini/search/SimpleTweetSearcher.java b/src/main/java/io/anserini/search/SimpleTweetSearcher.java index 0ff0d13590..af386e9703 100644 --- a/src/main/java/io/anserini/search/SimpleTweetSearcher.java +++ b/src/main/java/io/anserini/search/SimpleTweetSearcher.java @@ -177,7 +177,7 @@ public static void main(String[] args) throws Exception { PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(searchArgs.output), StandardCharsets.US_ASCII)); if (searchArgs.useRM3) { - searcher.setRM3(); + searcher.set_rm3(); } for (Object id : topics.keySet()) { diff --git a/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene8Test.java b/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene8Test.java new file mode 100644 index 0000000000..3a48453eb1 --- /dev/null +++ b/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene8Test.java @@ -0,0 +1,54 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.search; + +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class SimpleImpactSearcherPrebuiltLucene8Test { + + @Test + public void testSearch1() throws Exception { + SimpleImpactSearcher searcher = + new SimpleImpactSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized"); + assertEquals(2, searcher.get_total_num_docs()); + + SimpleImpactSearcher.Result[] hits; + + Map query = new HashMap<>(); + query.put("##ing", 1.0f); + + hits = searcher.search(query, 10); + assertEquals(1, hits.length); + assertEquals("2000001", hits[0].docid); + assertEquals(2, (int) hits[0].score); + + query = new HashMap<>(); + query.put("test", 1.0f); + hits = searcher.search(query, 10); + assertEquals(1, hits.length); + assertEquals("2000000", hits[0].docid); + assertEquals(1, (int) hits[0].score); + + searcher.close(); + } + +} diff --git a/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene9Test.java b/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene9Test.java new file mode 100644 index 0000000000..c30d26eaad --- /dev/null +++ b/src/test/java/io/anserini/search/SimpleImpactSearcherPrebuiltLucene9Test.java @@ -0,0 +1,54 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.search; + +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class SimpleImpactSearcherPrebuiltLucene9Test { + + @Test + public void testSearch1() throws Exception { + SimpleImpactSearcher searcher = + new SimpleImpactSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized"); + assertEquals(2, searcher.get_total_num_docs()); + + SimpleImpactSearcher.Result[] hits; + + Map query = new HashMap<>(); + query.put("##ing", 1.0f); + + hits = searcher.search(query, 10); + assertEquals(1, hits.length); + assertEquals("2000001", hits[0].docid); + assertEquals(2, (int) hits[0].score); + + query = new HashMap<>(); + query.put("test", 1.0f); + hits = searcher.search(query, 10); + assertEquals(1, hits.length); + assertEquals("2000000", hits[0].docid); + assertEquals(1, (int) hits[0].score); + + searcher.close(); + } + +} diff --git a/src/test/java/io/anserini/search/SimpleImpactSearcherTest.java b/src/test/java/io/anserini/search/SimpleImpactSearcherTest.java index aa4b2c3822..980be23037 100644 --- a/src/test/java/io/anserini/search/SimpleImpactSearcherTest.java +++ b/src/test/java/io/anserini/search/SimpleImpactSearcherTest.java @@ -25,22 +25,21 @@ import java.util.Map; public class SimpleImpactSearcherTest extends IndexerTestBase { - @Test public void testGetDoc() throws Exception { SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString()); assertEquals("here is some text here is some more text. city.", - searcher.document(0).get("contents")); - assertEquals("more texts", searcher.document(1).get("contents")); - assertEquals("here is a test", searcher.document(2).get("contents")); - assertNull(searcher.document(3)); + searcher.doc(0).get("contents")); + assertEquals("more texts", searcher.doc(1).get("contents")); + assertEquals("here is a test", searcher.doc(2).get("contents")); + assertNull(searcher.doc(3)); assertEquals("here is some text here is some more text. city.", - searcher.document("doc1").get("contents")); - assertEquals("more texts", searcher.document("doc2").get("contents")); - assertEquals("here is a test", searcher.document("doc3").get("contents")); - assertNull(searcher.document(3)); + searcher.doc("doc1").get("contents")); + assertEquals("more texts", searcher.doc("doc2").get("contents")); + assertEquals("here is a test", searcher.doc("doc3").get("contents")); + assertNull(searcher.doc(3)); searcher.close(); } @@ -50,9 +49,9 @@ public void testGetDocByField() throws Exception { SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString()); assertEquals("here is some text here is some more text. city.", - searcher.documentByField("id", "doc1").get("contents")); - assertEquals("more texts", searcher.documentByField("id", "doc2").get("contents")); - assertEquals("here is a test", searcher.documentByField("id", "doc3").get("contents")); + searcher.doc_by_field("id", "doc1").get("contents")); + assertEquals("more texts", searcher.doc_by_field("id", "doc2").get("contents")); + assertEquals("here is a test", searcher.doc_by_field("id", "doc3").get("contents")); searcher.close(); } @@ -61,21 +60,15 @@ public void testGetDocByField() throws Exception { public void testGetContents() throws Exception { SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString()); - assertEquals("here is some text here is some more text. city.", - searcher.documentContents(0)); - assertEquals("more texts", - searcher.documentContents(1)); - assertEquals("here is a test", - searcher.documentContents(2)); - assertNull(searcher.document(3)); + assertEquals("here is some text here is some more text. city.", searcher.doc_contents(0)); + assertEquals("more texts", searcher.doc_contents(1)); + assertEquals("here is a test", searcher.doc_contents(2)); + assertNull(searcher.doc(3)); - assertEquals("here is some text here is some more text. city.", - searcher.documentContents("doc1")); - assertEquals("more texts", - searcher.documentContents("doc2")); - assertEquals("here is a test", - searcher.documentContents("doc3")); - assertNull(searcher.documentContents("doc42")); + assertEquals("here is some text here is some more text. city.", searcher.doc_contents("doc1")); + assertEquals("more texts", searcher.doc_contents("doc2")); + assertEquals("here is a test", searcher.doc_contents("doc3")); + assertNull(searcher.doc_contents("doc42")); searcher.close(); } @@ -85,20 +78,20 @@ public void testGetRaw() throws Exception { SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString()); assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", - searcher.documentRaw(0)); + searcher.doc_raw(0)); assertEquals("{\"contents\": \"more texts\"}", - searcher.documentRaw(1)); + searcher.doc_raw(1)); assertEquals("{\"contents\": \"here is a test\"}", - searcher.documentRaw(2)); - assertNull(searcher.document(3)); + searcher.doc_raw(2)); + assertNull(searcher.doc(3)); assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", - searcher.documentRaw("doc1")); + searcher.doc_raw("doc1")); assertEquals("{\"contents\": \"more texts\"}", - searcher.documentRaw("doc2")); + searcher.doc_raw("doc2")); assertEquals("{\"contents\": \"here is a test\"}", - searcher.documentRaw("doc3")); - assertNull(searcher.documentContents("doc42")); + searcher.doc_raw("doc3")); + assertNull(searcher.doc_contents("doc42")); searcher.close(); } @@ -184,7 +177,7 @@ public void testBatchSearch() throws Exception { qids.add("query_test"); qids.add("query_more"); - Map hits = searcher.batchSearch(queries, qids, 10, 2); + Map hits = searcher.batch_search(queries, qids, 10, 2); assertEquals(2, hits.size()); assertEquals(1, hits.get("query_test").length); @@ -200,6 +193,6 @@ public void testBatchSearch() throws Exception { @Test public void testTotalNumDocuments() throws Exception { SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString()); - assertEquals(3 ,searcher.getTotalNumDocuments()); + assertEquals(3 ,searcher.get_total_num_docs()); } } diff --git a/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene8Test.java b/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene8Test.java new file mode 100644 index 0000000000..09cf422416 --- /dev/null +++ b/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene8Test.java @@ -0,0 +1,52 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.search; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class SimpleSearcherPrebuiltLucene8Test { + + @Test + public void testSearch1() throws Exception { + SimpleSearcher searcher = + new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2"); + assertEquals(3, searcher.get_total_num_docs()); + + SimpleSearcher.Result[] hits; + + hits = searcher.search("text", 10); + assertEquals(3, hits.length); + assertEquals("DOC222", hits[0].docid); + assertEquals(0.1015f, hits[0].score, 10e-4); + assertEquals("TREC_DOC_1", hits[1].docid); + assertEquals(0.0738f, hits[1].score, 10e-4); + assertEquals("WSJ_1", hits[2].docid); + assertEquals(0.0687f, hits[2].score, 10e-4); + + hits = searcher.search("simple", 10); + assertEquals(2, hits.length); + assertEquals("TREC_DOC_1", hits[0].docid); + assertEquals(0.2597f, hits[0].score, 10e-4); + assertEquals("DOC222", hits[1].docid); + assertEquals(0.2416f, hits[1].score, 10e-4); + + searcher.close(); + } + +} diff --git a/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene9Test.java b/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene9Test.java new file mode 100644 index 0000000000..1c0877c9d0 --- /dev/null +++ b/src/test/java/io/anserini/search/SimpleSearcherPrebuiltLucene9Test.java @@ -0,0 +1,52 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.search; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class SimpleSearcherPrebuiltLucene9Test { + + @Test + public void testSearch1() throws Exception { + SimpleSearcher searcher = + new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2"); + assertEquals(3, searcher.get_total_num_docs()); + + SimpleSearcher.Result[] hits; + + hits = searcher.search("text", 10); + assertEquals(3, hits.length); + assertEquals("DOC222", hits[0].docid); + assertEquals(0.1015f, hits[0].score, 10e-4); + assertEquals("TREC_DOC_1", hits[1].docid); + assertEquals(0.0738f, hits[1].score, 10e-4); + assertEquals("WSJ_1", hits[2].docid); + assertEquals(0.0687f, hits[2].score, 10e-4); + + hits = searcher.search("simple", 10); + assertEquals(2, hits.length); + assertEquals("TREC_DOC_1", hits[0].docid); + assertEquals(0.2597f, hits[0].score, 10e-4); + assertEquals("DOC222", hits[1].docid); + assertEquals(0.2416f, hits[1].score, 10e-4); + + searcher.close(); + } + +} diff --git a/src/test/java/io/anserini/search/SimpleSearcherTest.java b/src/test/java/io/anserini/search/SimpleSearcherTest.java index 7c108c02af..106b69cb68 100644 --- a/src/test/java/io/anserini/search/SimpleSearcherTest.java +++ b/src/test/java/io/anserini/search/SimpleSearcherTest.java @@ -17,40 +17,54 @@ package io.anserini.search; import io.anserini.IndexerTestBase; +import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.index.IndexArgs; import io.anserini.search.SimpleSearcher.Result; -import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.LMDirichletSimilarity; import org.junit.Test; -import java.io.File; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Random; public class SimpleSearcherTest extends IndexerTestBase { + @Test + public void testGettersAndSetters() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); + assertTrue(searcher.get_analyzer() instanceof DefaultEnglishAnalyzer); + + searcher.set_language("ar"); + assertTrue(searcher.get_analyzer() instanceof ArabicAnalyzer); + + assertTrue(searcher.get_similarity() instanceof BM25Similarity); + + searcher.set_qld(100.0f); + assertTrue(searcher.get_similarity() instanceof LMDirichletSimilarity); + + searcher.close(); + } @Test public void testGetDoc() throws Exception { SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); assertEquals("here is some text here is some more text. city.", - searcher.document(0).get("contents")); - assertEquals("more texts", searcher.document(1).get("contents")); - assertEquals("here is a test", searcher.document(2).get("contents")); - assertNull(searcher.document(3)); + searcher.doc(0).get("contents")); + assertEquals("more texts", searcher.doc(1).get("contents")); + assertEquals("here is a test", searcher.doc(2).get("contents")); + assertNull(searcher.doc(3)); assertEquals("here is some text here is some more text. city.", - searcher.document("doc1").get("contents")); - assertEquals("more texts", searcher.document("doc2").get("contents")); - assertEquals("here is a test", searcher.document("doc3").get("contents")); - assertNull(searcher.document(3)); + searcher.doc("doc1").get("contents")); + assertEquals("more texts", searcher.doc("doc2").get("contents")); + assertEquals("here is a test", searcher.doc("doc3").get("contents")); + assertNull(searcher.doc(3)); searcher.close(); } @@ -65,7 +79,7 @@ public void testBatchGetDoc() throws Exception { docIds.add("doc3"); docIds.add("fake_doc"); - Map results = searcher.batchGetDocument(docIds, 2); + Map results = searcher.batch_get_docs(docIds, 2); assertEquals("here is some text here is some more text. city.", results.get("doc1").get("contents")); assertEquals("more texts", results.get("doc2").get("contents")); assertEquals("here is a test", results.get("doc3").get("contents")); @@ -79,9 +93,9 @@ public void testGetDocByField() throws Exception { SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); assertEquals("here is some text here is some more text. city.", - searcher.documentByField("id", "doc1").get("contents")); - assertEquals("more texts", searcher.documentByField("id", "doc2").get("contents")); - assertEquals("here is a test", searcher.documentByField("id", "doc3").get("contents")); + searcher.doc_by_field("id", "doc1").get("contents")); + assertEquals("more texts", searcher.doc_by_field("id", "doc2").get("contents")); + assertEquals("here is a test", searcher.doc_by_field("id", "doc3").get("contents")); searcher.close(); } @@ -90,21 +104,15 @@ public void testGetDocByField() throws Exception { public void testGetContents() throws Exception { SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); - assertEquals("here is some text here is some more text. city.", - searcher.documentContents(0)); - assertEquals("more texts", - searcher.documentContents(1)); - assertEquals("here is a test", - searcher.documentContents(2)); - assertNull(searcher.document(3)); + assertEquals("here is some text here is some more text. city.", searcher.doc_contents(0)); + assertEquals("more texts", searcher.doc_contents(1)); + assertEquals("here is a test", searcher.doc_contents(2)); + assertNull(searcher.doc(3)); - assertEquals("here is some text here is some more text. city.", - searcher.documentContents("doc1")); - assertEquals("more texts", - searcher.documentContents("doc2")); - assertEquals("here is a test", - searcher.documentContents("doc3")); - assertNull(searcher.documentContents("doc42")); + assertEquals("here is some text here is some more text. city.", searcher.doc_contents("doc1")); + assertEquals("more texts", searcher.doc_contents("doc2")); + assertEquals("here is a test", searcher.doc_contents("doc3")); + assertNull(searcher.doc_contents("doc42")); searcher.close(); } @@ -114,20 +122,20 @@ public void testGetRaw() throws Exception { SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", - searcher.documentRaw(0)); + searcher.doc_raw(0)); assertEquals("{\"contents\": \"more texts\"}", - searcher.documentRaw(1)); + searcher.doc_raw(1)); assertEquals("{\"contents\": \"here is a test\"}", - searcher.documentRaw(2)); - assertNull(searcher.document(3)); + searcher.doc_raw(2)); + assertNull(searcher.doc(3)); assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", - searcher.documentRaw("doc1")); + searcher.doc_raw("doc1")); assertEquals("{\"contents\": \"more texts\"}", - searcher.documentRaw("doc2")); + searcher.doc_raw("doc2")); assertEquals("{\"contents\": \"here is a test\"}", - searcher.documentRaw("doc3")); - assertNull(searcher.documentContents("doc42")); + searcher.doc_raw("doc3")); + assertNull(searcher.doc_contents("doc42")); searcher.close(); } @@ -145,10 +153,8 @@ public void testSearch1() throws Exception { assertEquals("{\"contents\": \"here is a test\"}", hits[0].raw); // We can fetch the exact same information from the raw Lucene document also. - assertEquals("doc3", - hits[0].lucene_document.getField(IndexArgs.ID).stringValue()); - assertEquals("here is a test", - hits[0].lucene_document.getField(IndexArgs.CONTENTS).stringValue()); + assertEquals("doc3", hits[0].lucene_document.getField(IndexArgs.ID).stringValue()); + assertEquals("here is a test", hits[0].lucene_document.getField(IndexArgs.CONTENTS).stringValue()); assertEquals("{\"contents\": \"here is a test\"}", hits[0].lucene_document.getField(IndexArgs.RAW).stringValue()); @@ -172,9 +178,9 @@ public void testSearch2() throws Exception { assertEquals(2, results.length); assertEquals("doc1", results[0].docid); assertEquals(0, results[0].lucene_docid); + assertEquals(0.28830000f, results[0].score, 10e-6); assertEquals("doc2", results[1].docid); assertEquals(1, results[1].lucene_docid); - assertEquals(0.28830000f, results[0].score, 10e-6); assertEquals(0.27329999f, results[1].score, 10e-6); results = searcher.search("test"); @@ -186,6 +192,118 @@ public void testSearch2() throws Exception { searcher.close(); } + @Test + public void testSearch3() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); + searcher.set_bm25(3.5f, 0.9f); + Result[] results; + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc2", results[0].docid); + assertEquals(1, results[0].lucene_docid); + assertEquals(0.16070f, results[0].score, 10e-5); + + results = searcher.search("text"); + assertEquals(2, results.length); + assertEquals("doc2", results[0].docid); + assertEquals(1, results[0].lucene_docid); + assertEquals(0.16070f, results[0].score, 10e-5); + assertEquals("doc1", results[1].docid); + assertEquals(0, results[1].lucene_docid); + assertEquals(0.10870f, results[1].score, 10e-5); + + results = searcher.search("test"); + assertEquals(1, results.length); + assertEquals("doc3", results[0].docid); + assertEquals(2, results[0].lucene_docid); + assertEquals(0.33530f, results[0].score, 10e-5); + + searcher.close(); + } + + @Test + public void testSearch4() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); + searcher.set_qld(10); + Result[] results; + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc2", results[0].docid); + assertEquals(1, results[0].lucene_docid); + assertEquals(0.09910f, results[0].score, 10e-5); + + results = searcher.search("text"); + assertEquals(2, results.length); + assertEquals("doc2", results[0].docid); + assertEquals(1, results[0].lucene_docid); + assertEquals(0.09910f, results[0].score, 10e-5); + assertEquals("doc1", results[1].docid); + assertEquals(0, results[1].lucene_docid); + assertEquals(0.0f, results[1].score, 10e-5); + + results = searcher.search("test"); + assertEquals(1, results.length); + assertEquals("doc3", results[0].docid); + assertEquals(2, results[0].lucene_docid); + assertEquals(0.31850f, results[0].score, 10e-5); + + searcher.close(); + } + + @Test + public void testSearch5() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); + searcher.set_rm3(); + assertTrue(searcher.use_rm3()); + + Result[] results; + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc1", results[0].docid); + assertEquals(0, results[0].lucene_docid); + assertEquals(0.14417f, results[0].score, 10e-5); + + searcher.unset_rm3(); + assertFalse(searcher.use_rm3()); + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc1", results[0].docid); + assertEquals(0, results[0].lucene_docid); + assertEquals(0.28830f, results[0].score, 10e-5); + + searcher.close(); + } + + @Test + public void testSearch6() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); + searcher.set_rocchio(); + assertTrue(searcher.use_rocchio()); + + Result[] results; + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc1", results[0].docid); + assertEquals(0, results[0].lucene_docid); + assertEquals(0.28830f, results[0].score, 10e-5); + + searcher.unset_rocchio(); + assertFalse(searcher.use_rocchio()); + + results = searcher.search("text", 1); + assertEquals(1, results.length); + assertEquals("doc1", results[0].docid); + assertEquals(0, results[0].lucene_docid); + assertEquals(0.28830f, results[0].score, 10e-5); + + searcher.close(); + } + @Test public void testSearchCustomQuery() throws Exception { // Test the ability to pass in an arbitrary Lucene query. @@ -213,7 +331,7 @@ public void testBatchSearch() throws Exception { qids.add("query_test"); qids.add("query_more"); - Map hits = searcher.batchSearch(queries, qids, 10, 2); + Map hits = searcher.batch_search(queries, qids, 10, 2); assertEquals(2, hits.size()); assertEquals(1, hits.get("query_test").length); @@ -234,15 +352,15 @@ public void testFieldedSearch() throws Exception { fields.put("id", 1.0f); fields.put("contents", 1.0f); - SimpleSearcher.Result[] hits = searcher.searchFields("doc1", fields, 10); + SimpleSearcher.Result[] hits = searcher.search_fields("doc1", fields, 10); assertEquals(1, hits.length); assertEquals("doc1", hits[0].docid); - hits = searcher.searchFields("test", fields, 10); + hits = searcher.search_fields("test", fields, 10); assertEquals(1, hits.length); assertEquals("doc3", hits[0].docid); - hits = searcher.searchFields("test", Map.of("id", 1.0f), 10); + hits = searcher.search_fields("test", Map.of("id", 1.0f), 10); assertEquals(0, hits.length); searcher.close(); @@ -264,7 +382,7 @@ public void testFieldedBatchSearch() throws Exception { fields.put("id", 1.0f); fields.put("contents", 1.0f); - Map hits = searcher.batchSearchFields(queries, qids, 10, 2, fields); + Map hits = searcher.batch_search_fields(queries, qids, 10, 2, fields); assertEquals(2, hits.size()); assertEquals(1, hits.get("query_id").length); @@ -276,53 +394,9 @@ public void testFieldedBatchSearch() throws Exception { searcher.close(); } - @Test - public void testMain() throws Exception { - Random random = new Random(); - String tmpFile = "tmp" + random.nextInt() + ".txt"; - String contents; - - SimpleSearcher.main(new String[] {"-index", super.tempDir1.toString(), - "-topics", "src/main/resources/topics-and-qrels/topics.robust04.txt", - "-output", tmpFile}); - - contents = Files.readString(Paths.get(tmpFile), StandardCharsets.US_ASCII); - assertEquals("620 Q0 doc3 1 0.570200 Anserini\n", contents); - - SimpleSearcher.main(new String[] {"-index", super.tempDir1.toString(), "-threads", "2", - "-topics", "src/main/resources/topics-and-qrels/topics.robust04.txt", - "-output", tmpFile}); - - contents = Files.readString(Paths.get(tmpFile), StandardCharsets.US_ASCII); - assertEquals("620 Q0 doc3 1 0.570200 Anserini\n", contents); - - SimpleSearcher.main(new String[] {"-index", super.tempDir1.toString(), "-rm3", - "-topics", "src/main/resources/topics-and-qrels/topics.robust04.txt", - "-output", tmpFile}); - - contents = Files.readString(Paths.get(tmpFile), StandardCharsets.US_ASCII); - assertEquals("620 Q0 doc3 1 0.095000 Anserini\n", contents); - - SimpleSearcher.main(new String[] {"-index", super.tempDir1.toString(), "-rocchio", - "-topics", "src/main/resources/topics-and-qrels/topics.robust04.txt", - "-output", tmpFile}); - - contents = Files.readString(Paths.get(tmpFile), StandardCharsets.US_ASCII); - assertEquals("620 Q0 doc3 1 0.329200 Anserini\n", contents); - - SimpleSearcher.main(new String[] {"-index", super.tempDir1.toString(), "-qld", - "-topics", "src/main/resources/topics-and-qrels/topics.robust04.txt", - "-output", tmpFile}); - - contents = Files.readString(Paths.get(tmpFile), StandardCharsets.US_ASCII); - assertEquals("620 Q0 doc3 1 0.004500 Anserini\n", contents); - - new File(tmpFile).delete(); - } - @Test public void testTotalNumDocuments() throws Exception { SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString()); - assertEquals(3 ,searcher.getTotalNumDocuments()); + assertEquals(3 ,searcher.get_total_num_docs()); } } diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdm new file mode 100644 index 0000000000..3d51c34397 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdt b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdt new file mode 100644 index 0000000000..61c418185b Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdt differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdx b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdx new file mode 100644 index 0000000000..3a45ffa58d Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fdx differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fnm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fnm new file mode 100644 index 0000000000..03370c73b3 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.fnm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvd new file mode 100644 index 0000000000..3ff72fdbe9 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvm new file mode 100644 index 0000000000..acb3303343 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.nvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.si b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.si new file mode 100644 index 0000000000..fb0100b114 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0.si differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvd new file mode 100644 index 0000000000..efc25a13f8 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvm new file mode 100644 index 0000000000..5661e2965a Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene80_0.dvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.doc b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.doc new file mode 100644 index 0000000000..866f2f6d78 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.doc differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tim b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tim new file mode 100644 index 0000000000..1478c073c1 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tim differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tip b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tip new file mode 100644 index 0000000000..b69675682d Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tip differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tmd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tmd new file mode 100644 index 0000000000..481c1f236c Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/_0_Lucene84_0.tmd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/segments_1 b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/segments_1 new file mode 100644 index 0000000000..fcea113757 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/segments_1 differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/write.lock b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized/write.lock new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdm new file mode 100644 index 0000000000..9a74be457e Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdt b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdt new file mode 100644 index 0000000000..49abc6a4df Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdt differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdx b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdx new file mode 100644 index 0000000000..9df5ed721d Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fdx differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fnm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fnm new file mode 100644 index 0000000000..944fe3ad27 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.fnm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvd new file mode 100644 index 0000000000..dca4cae955 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvm new file mode 100644 index 0000000000..e054aab049 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.nvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.si b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.si new file mode 100644 index 0000000000..d373301835 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.si differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvd new file mode 100644 index 0000000000..d76929790e Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvm new file mode 100644 index 0000000000..b1a8b4863a Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvx b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvx new file mode 100644 index 0000000000..03722aada1 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0.tvx differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvd new file mode 100644 index 0000000000..761df155a5 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvm b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvm new file mode 100644 index 0000000000..5aa3fe8bac Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene80_0.dvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.doc b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.doc new file mode 100644 index 0000000000..72c2ef1444 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.doc differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.pos b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.pos new file mode 100644 index 0000000000..e1916d5b55 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.pos differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tim b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tim new file mode 100644 index 0000000000..b37f4acc03 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tim differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tip b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tip new file mode 100644 index 0000000000..6c60ae824b Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tip differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tmd b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tmd new file mode 100644 index 0000000000..a5b8902e2b Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/_0_Lucene84_0.tmd differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/segments_1 b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/segments_1 new file mode 100644 index 0000000000..5a06c8b70e Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/segments_1 differ diff --git a/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/write.lock b/src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2/write.lock new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdm new file mode 100644 index 0000000000..927ded4942 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdt b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdt new file mode 100644 index 0000000000..8146ceeb02 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdt differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdx b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdx new file mode 100644 index 0000000000..6f9a4b5b87 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fdx differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fnm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fnm new file mode 100644 index 0000000000..80c22724c7 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.fnm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvd new file mode 100644 index 0000000000..f30d35de10 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvm new file mode 100644 index 0000000000..6773ceee1a Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.nvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.si b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.si new file mode 100644 index 0000000000..9a1039fd99 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0.si differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.doc b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.doc new file mode 100644 index 0000000000..f0307a86ef Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.doc differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvd new file mode 100644 index 0000000000..bdda7586d6 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvm new file mode 100644 index 0000000000..d02fa015b5 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.dvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tim b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tim new file mode 100644 index 0000000000..7f29027f8e Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tim differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tip b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tip new file mode 100644 index 0000000000..0de88e3a09 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tip differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tmd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tmd new file mode 100644 index 0000000000..0c14e45746 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/_0_Lucene90_0.tmd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/segments_1 b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/segments_1 new file mode 100644 index 0000000000..49ee62499f Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/segments_1 differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/write.lock b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized/write.lock new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdm new file mode 100644 index 0000000000..d44d50a854 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdt b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdt new file mode 100644 index 0000000000..e5e4bb5e84 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdt differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdx b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdx new file mode 100644 index 0000000000..dec1a90fc3 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fdx differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fnm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fnm new file mode 100644 index 0000000000..f61f9ab41f Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.fnm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvd new file mode 100644 index 0000000000..1421bbf24b Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvm new file mode 100644 index 0000000000..42b259e3b4 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.nvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.si b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.si new file mode 100644 index 0000000000..f88435075c Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.si differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvd new file mode 100644 index 0000000000..7b50137da6 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvm new file mode 100644 index 0000000000..694981f112 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvx b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvx new file mode 100644 index 0000000000..91beb202f8 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0.tvx differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.doc b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.doc new file mode 100644 index 0000000000..2576004502 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.doc differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvd new file mode 100644 index 0000000000..3d1fdb9c22 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvm b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvm new file mode 100644 index 0000000000..2b24ade7b6 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.dvm differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.pos b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.pos new file mode 100644 index 0000000000..386c1ec372 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.pos differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tim b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tim new file mode 100644 index 0000000000..6585cda1de Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tim differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tip b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tip new file mode 100644 index 0000000000..8067922450 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tip differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tmd b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tmd new file mode 100644 index 0000000000..37f4ff0626 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/_0_Lucene90_0.tmd differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/segments_1 b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/segments_1 new file mode 100644 index 0000000000..87840eb4e3 Binary files /dev/null and b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/segments_1 differ diff --git a/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/write.lock b/src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2/write.lock new file mode 100644 index 0000000000..e69de29bb2