From f5496b905246084070f959e59626c6323210c3f2 Mon Sep 17 00:00:00 2001 From: HangCui0510 <64120158+HangCui0510@users.noreply.github.com> Date: Fri, 24 Apr 2020 18:56:25 -0400 Subject: [PATCH] Update replication log for MS MARCO passage + doc and doc2query (#1112) --- docs/experiments-doc2query.md | 7 ++++--- docs/experiments-msmarco-doc.md | 7 ++++--- docs/experiments-msmarco-passage.md | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/experiments-doc2query.md b/docs/experiments-doc2query.md index 10d52e7320..39ebb1026b 100644 --- a/docs/experiments-doc2query.md +++ b/docs/experiments-doc2query.md @@ -48,8 +48,8 @@ We can then reindex the collection: ``` sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \ - -generator LuceneDocumentGenerator -threads 9 -input msmarco-passage/collection_jsonl_expanded_topk10 \ - -index msmarco-passage/lucene-index-msmarco-expanded-topk10 -storePositions -storeDocvectors -storeRawDocs + -generator DefaultLuceneDocumentGenerator -threads 9 -input msmarco-passage/collection_jsonl_expanded_topk10 \ + -index msmarco-passage/lucene-index-msmarco-expanded-topk10 -storePositions -storeDocvectors -storeRaw ``` And run retrieval (same as above): @@ -134,7 +134,7 @@ We can then index the expanded documents: ``` sh target/appassembler/bin/IndexCollection -collection JsonCollection \ - -generator LuceneDocumentGenerator -threads 30 -input trec_car/collection_jsonl_expanded_topk10 \ + -generator DefaultLuceneDocumentGenerator -threads 30 -input trec_car/collection_jsonl_expanded_topk10 \ -index trec_car/lucene-index.car17v2.0 ``` @@ -168,3 +168,4 @@ TREC CAR corpus v2.0 in this experiment instead of corpus v1.5 used in the paper + Results replicated by [@justram](https://github.com/justram) on 2019-08-09 (commit [`5f098f`](https://github.com/justram/Anserini/commit/5f098f23527611bca1224149bc2d155adce1e48)) + Results replicated by [@ronakice](https://github.com/ronakice) on 2019-08-13 (commit [`5b29d16`](https://github.com/castorini/anserini/commit/5b29d1654abc5e8a014c2230da990ab2f91fb340)) + Results replicated by [@edwinzhng](https://github.com/edwinzhng) on 2020-01-08 (commit [`5cc923d`](https://github.com/castorini/anserini/commit/5cc923d5c02777d8b25df32ff2e2a59be5badfdd)) ++ Results replicated by [@HangCui0510](https://github.com/HangCui0510) on 2020-04-23 (commit [`0ae567d`](https://github.com/castorini/anserini/commit/0ae567df5c8a70ac211efd958c9ca1ff609ff782)) diff --git a/docs/experiments-msmarco-doc.md b/docs/experiments-msmarco-doc.md index 5471822015..955bfe5a30 100644 --- a/docs/experiments-msmarco-doc.md +++ b/docs/experiments-msmarco-doc.md @@ -22,8 +22,8 @@ Build the index with the following command: ``` nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \ - -generator LuceneDocumentGenerator -threads 1 -input msmarco-doc/collection \ - -index lucene-index.msmarco-doc.pos+docvectors+rawdocs -storePositions -storeDocvectors -storeRawDocs \ + -generator DefaultLuceneDocumentGenerator -threads 1 -input msmarco-doc/collection \ + -index lucene-index.msmarco-doc.pos+docvectors+rawdocs -storePositions -storeDocvectors -storeRaw \ >& log.msmarco-doc.pos+docvectors+rawdocs & ``` @@ -128,4 +128,5 @@ As expected, BM25 tuning makes a big difference! + Results replicated by [@edwinzhng](https://github.com/edwinzhng) on 2020-01-14 (commit [`3964169`](https://github.com/castorini/anserini/commit/3964169bf82a3783f9298907d9794f0bddf306f0)) + Results replicated by [@nikhilro](https://github.com/nikhilro) on 2020-01-21 (commit [`631589e`](https://github.com/castorini/anserini/commit/631589e9e08326373f46555e007e6c302c19126d)) -+ Results replicated by [@yuki617](https://github.com/yuki617) on 2020-03-29 (commit [`074723c`](https://github.com/castorini/anserini/commit/074723cbb10660fb9be2bfe6325739ab5fe0dd8d)) \ No newline at end of file ++ Results replicated by [@yuki617](https://github.com/yuki617) on 2020-03-29 (commit [`074723c`](https://github.com/castorini/anserini/commit/074723cbb10660fb9be2bfe6325739ab5fe0dd8d)) ++ Results replicated by [@HangCui0510](https://github.com/HangCui0510) on 2020-04-23 (commit [`0ae567d`](https://github.com/castorini/anserini/commit/0ae567df5c8a70ac211efd958c9ca1ff609ff782)) \ No newline at end of file diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 34c41f8222..f026f54ccc 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -32,7 +32,7 @@ We can now index these docs as a `JsonCollection` using Anserini: ``` sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \ -generator DefaultLuceneDocumentGenerator -threads 9 -input msmarco-passage/collection_jsonl \ - -index msmarco-passage/lucene-index-msmarco -storePositions -storeDocvectors -storeRawDocs + -index msmarco-passage/lucene-index-msmarco -storePositions -storeDocvectors -storeRaw ``` Upon completion, we should have an index with 8,841,823 documents. @@ -157,3 +157,4 @@ Tuned (`k1=0.82`, `b=0.72`) | 0.1875 | 0.1956 | 0.8578 + Results replicated by [@nikhilro](https://github.com/nikhilro) on 2020-01-21 (commit [`631589e`](https://github.com/castorini/anserini/commit/631589e9e08326373f46555e007e6c302c19126d)) + Results replicated by [@yuki617](https://github.com/yuki617) on 2020-03-29 (commit [`074723c`](https://github.com/castorini/anserini/commit/074723cbb10660fb9be2bfe6325739ab5fe0dd8d)) + Results replicated by [@weipang142857](https://github.com/weipang142857) on 2020-04-20 (commit [`074723c`](https://github.com/castorini/anserini/commit/074723cbb10660fb9be2bfe6325739ab5fe0dd8d)) ++ Results replicated by [@HangCui0510](https://github.com/HangCui0510) on 2020-04-23 (commit [`0ae567d`](https://github.com/castorini/anserini/commit/0ae567df5c8a70ac211efd958c9ca1ff609ff782))