diff --git a/README.md b/README.md
index 26e73ab62..49ff46633 100644
--- a/README.md
+++ b/README.md
@@ -254,6 +254,7 @@ With Pyserini, it's easy to replicate runs on a number of standard IR test colle
 + The easiest way, start here: [Replicating runs directly from the Python package](docs/pypi-replication.md)
 + [Guide to replicating the BM25 baseline for MS MARCO Passage Ranking](docs/experiments-msmarco-passage.md)
 + [Guide to replicating the BM25 baseline for MS MARCO Document Ranking](docs/experiments-msmarco-doc.md)
++ [Guide to replicating the multi-field BM25 baseline for MS MARCO Document Ranking from Elasticsearch](docs/experiments-elastic.md)
 + [Guide to replicating Robust04 baselines for ad hoc retrieval](docs/experiments-robust04.md)
 + [Guide to replicating TCT-ColBERT experiments for MS MARCO Passage/Document Ranking](docs/experiments-tct_colbert.md)
 + [Guide to replicating DPR experiments for Open-Domain QA](docs/experiments-dpr.md)
diff --git a/docs/experiments-elastic.md b/docs/experiments-elastic.md
index 51af93324..a00668993 100644
--- a/docs/experiments-elastic.md
+++ b/docs/experiments-elastic.md
@@ -1,17 +1,18 @@
-# Pyserini: Elastic Multi-field Baseline for MS MARCO Document Ranking
+# Pyserini: Multi-field Baseline for MS MARCO Document Ranking
 
-This page contains instructions for reproducing the Elasticsearch optimized
-multi_match best_fields" entry on the the [MS MARCO Document Ranking Leaderboard](https://microsoft.github.io/MSMARCO-Document-Ranking-Submissions/leaderboard/).
+This page contains instructions for reproducing the "Elasticsearch optimized
+multi_match best_fields" entry (2020/11/25) on the [MS MARCO Document Ranking Leaderboard](https://microsoft.github.io/MSMARCO-Document-Ranking-Submissions/leaderboard/) using Pyserini.
+Details behind this run are described in this [blog post](https://www.elastic.co/blog/improving-search-relevance-with-data-driven-query-optimization);
+the official leaderboard submission corresponds to the run denoted "multi_match best_fields tuned (all-in-one): all
+params" in the blog post.
 
 This run makes sure to preserve the distinction between document fields when
 preparing and indexing documents. For ranking, we use a disjunction max query to
-combine score contributions across fields. The weights for the disjunction max
-query are taken from the "multi_match best_fields tuned (all-in-one): all
-params" entry in the [blog post](https://www.elastic.co/blog/improving-search-relevance-with-data-driven-query-optimization)
-that describes the leaderboard submission.
+combine score contributions across fields; the weights for the disjunction max
+query are taken from the blog post referenced above.
 
 To match the leaderboard results, this run makes use of a custom stopwords file
-'elastic-msmarco-stopwords.txt'. The file contains the default English stopwords
+[`elastic-msmarco-stopwords.txt`](elastic-msmarco-stopwords.txt). The file contains the default English stopwords
 from Lucene, plus some additional words targeted at question-style queries.
 
 ## Data Prep
@@ -69,9 +70,7 @@ python -m pyserini.search --msmarco --hits 100 \
 After the run completes, we can evaluate the results:
 
 ```bash
-python tools/scripts/msmarco/msmarco_doc_eval.py \
-    --judgments tools/topics-and-qrels/qrels.msmarco-doc.dev.txt \
-    --run runs/run.msmarco-doc.leaderboard-dev.elastic.txt
+$ python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run runs/run.msmarco-doc.leaderboard-dev.elastic.txt
 #####################
 MRR @100: 0.3071421845448626
 QueriesRanked: 5193
diff --git a/pyserini/search/_searcher.py b/pyserini/search/_searcher.py
index c0f255ea7..70afb7da0 100644
--- a/pyserini/search/_searcher.py
+++ b/pyserini/search/_searcher.py
@@ -111,9 +111,9 @@ def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGene
         hits = None
         if query_generator:
             if not fields:
-                hits = self.object.search(query_generator, JString(q), k)
+                hits = self.object.search(query_generator, JString(q.encode('utf8')), k)
             else:
-                hits = self.object.searchFields(query_generator, JString(q), jfields, k)
+                hits = self.object.searchFields(query_generator, JString(q.encode('utf8')), jfields, k)
         elif isinstance(q, JQuery):
             # Note that RM3 requires the notion of a query (string) to estimate the appropriate models. If we're just
             # given a Lucene query, it's unclear what the "query" is for this estimation. One possibility is to extract
@@ -126,11 +126,10 @@ def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGene
                 raise NotImplementedError('Cannot specify fields to search when using a Lucene query.')
             hits = self.object.search(q, k)
         else:
-            query_string = JString(q.encode('utf8'))
             if not fields:
-                hits = self.object.search(query_string, k)
-            else :
-                hits = self.object.searchFields(query_string, jfields, k)
+                hits = self.object.search(JString(q.encode('utf8')), k)
+            else:
+                hits = self.object.searchFields(JString(q.encode('utf8')), jfields, k)
 
         docids = set()
         filtered_hits = []
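
As a quick sanity check of the `_searcher.py` change above, here is a minimal usage sketch. The index path and field boosts below are illustrative placeholders (the tuned per-field weights come from the blog post, not from here), assuming an MS MARCO document index with separate `title`, `url`, and `body` fields as built in the guide:

```python
from pyserini.search import SimpleSearcher

# Illustrative index path, not part of this diff; any Lucene index with
# per-field contents ('title', 'url', 'body') would work here.
searcher = SimpleSearcher('indexes/msmarco-doc-per-field')

# Plain keyword search: every branch now wraps the query as
# JString(q.encode('utf8')), so non-ASCII queries like this one reach
# the JVM without mangling.
hits = searcher.search('café au lait', k=10)

# Multi-field search with per-field boosts, exercising the searchFields
# branch; these weights are made up for illustration only.
hits = searcher.search('what is a lobster roll', k=100,
                       fields={'title': 10.0, 'url': 5.0, 'body': 1.0})

for hit in hits[:3]:
    print(f'{hit.docid} {hit.score:.4f}')
```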