Skip to content

Commit

Permalink
Various updates and cleanup (#2359)
Browse files Browse the repository at this point in the history
+ Added Enum topic bindings for SPLADE++ ED and BGE for the MS MARCO v1 passage corpus
+ Updated regression documentation
+ Rebalanced regression batches across batch03, batch04, and batch05
  • Loading branch information
lintool authored Jan 28, 2024
1 parent ee37e00 commit f02e6f1
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 241 deletions.
55 changes: 46 additions & 9 deletions docs/regressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil >& logs/log.dl20-doc-segmented-unicoil &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil-noexp >& logs/log.dl20-doc-segmented-unicoil-noexp &
```
</details>

</details>
<details>
<summary>MS MARCO V2 + DL21 regressions</summary>

Expand Down Expand Up @@ -202,8 +202,44 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-splade-pp-ed >& logs/log.dl22-passage-splade-pp-ed &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-splade-pp-sd >& logs/log.dl22-passage-splade-pp-sd &
```

</details>
<details>
<summary>BEIR (v1.0.0): BGE-base-en-v1.5</summary>

```bash
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-covid-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-trec-covid-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-bioasq-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-bioasq-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nfcorpus-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-nfcorpus-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nq-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-nq-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-hotpotqa-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-hotpotqa-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fiqa-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-fiqa-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-signal1m-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-signal1m-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-news-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-trec-news-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-robust04-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-robust04-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-arguana-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-arguana-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-webis-touche2020-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-webis-touche2020-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-android-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-android-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-english-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-english-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gaming-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-gaming-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gis-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-gis-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-mathematica-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-mathematica-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-physics-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-physics-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-programmers-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-programmers-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-stats-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-stats-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-tex-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-tex-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-unix-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-unix-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-webmasters-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-webmasters-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-wordpress-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-cqadupstack-wordpress-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-quora-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-quora-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-dbpedia-entity-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-dbpedia-entity-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scidocs-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-scidocs-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fever-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-fever-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-climate-fever-bge-base-en-v1.5-hnsw &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-bge-base-en-v1.5-hnsw >& logs/log.beir-v1.0.0-scifact-bge-base-en-v1.5-hnsw &
```

</details>
<details>
<summary>BEIR (v1.0.0): SPLADE++ CoCondenser-EnsembleDistil</summary>

Expand Down Expand Up @@ -238,8 +274,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-splade-pp-ed >& logs/log.beir-v1.0.0-climate-fever-splade-pp-ed &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-splade-pp-ed >& logs/log.beir-v1.0.0-scifact-splade-pp-ed &
```
</details>

</details>
<details>
<summary>BEIR (v1.0.0): uniCOIL (noexp)</summary>

Expand Down Expand Up @@ -274,8 +310,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-unicoil-noexp >& logs/log.beir-v1.0.0-climate-fever-unicoil-noexp &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-unicoil-noexp >& logs/log.beir-v1.0.0-scifact-unicoil-noexp &
```
</details>

</details>
<details>
<summary>BEIR (v1.0.0): "flat" baseline</summary>

Expand Down Expand Up @@ -310,8 +346,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-flat >& logs/log.beir-v1.0.0-climate-fever-flat &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-flat >& logs/log.beir-v1.0.0-scifact-flat &
```
</details>

</details>
<details>
<summary>BEIR (v1.0.0): "flat" baseline with WordPiece tokenization</summary>

Expand Down Expand Up @@ -346,8 +382,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-flat-wp >& logs/log.beir-v1.0.0-climate-fever-flat-wp &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-flat-wp >& logs/log.beir-v1.0.0-scifact-flat-wp &
```
</details>

</details>
<details>
<summary>BEIR (v1.0.0): "multifield" baseline</summary>

Expand Down Expand Up @@ -382,8 +418,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever-multifield >& logs/log.beir-v1.0.0-climate-fever-multifield &
nohup python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact-multifield >& logs/log.beir-v1.0.0-scifact-multifield &
```
</details>

</details>
<details>
<summary>Mr.TyDi (v1.1): BM25 regressions</summary>

Expand Down Expand Up @@ -412,8 +448,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-te-aca >& logs/log.mrtydi-v1.1-te-aca &
nohup python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-th-aca >& logs/log.mrtydi-v1.1-th-aca &
```
</details>

</details>
<details>
<summary>MIRACL (v1.0): BM25 regressions</summary>

Expand Down Expand Up @@ -452,8 +488,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-th-aca >& logs/log.miracl-v1.0-th-aca &
nohup python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-zh-aca >& logs/log.miracl-v1.0-zh-aca &
```
</details>

</details>
<details>
<summary>Other cross-lingual and multi-lingual regressions</summary>

Expand Down Expand Up @@ -500,8 +536,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression ciral-v1.0-sw >& logs/log.ciral-v1.0-sw &
nohup python src/main/python/run_regression.py --index --verify --search --regression ciral-v1.0-yo >& logs/log.ciral-v1.0-yo &
```
</details>

</details>
<details>
<summary>Other regressions</summary>

Expand Down Expand Up @@ -534,6 +570,7 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression wikipedia-dpr-100w-bm25 >& logs/log.wikipedia-dpr-100w-bm25 &
nohup python src/main/python/run_regression.py --index --verify --search --regression wiki-all-6-3-tamber-bm25 >& logs/log.wiki-all-6-3-tamber-bm25 &
```

</details>

The `--regression` option specifies the regression to run, corresponding to the YAML configuration file in [`src/main/resources/regression/`](../src/main/resources/regression/).
Expand Down
20 changes: 5 additions & 15 deletions src/main/java/io/anserini/search/topicreader/TopicReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -121,27 +121,17 @@ public SortedMap<K, Map<String, String>> read(String str) throws IOException {
*/
@SuppressWarnings("unchecked")
public static <K> SortedMap<K, Map<String, String>> getTopics(Topics topics) throws IOException{
try {
String raw;
InputStream inputStream;
Path topicPath = getTopicPath(Path.of(topics.path));

if (topicPath.toString().endsWith(".gz")) {
inputStream = new GZIPInputStream(Files.newInputStream(topicPath, StandardOpenOption.READ));
} else {
inputStream = Files.newInputStream(topicPath, StandardOpenOption.READ);
}
raw = new String(inputStream.readAllBytes());
inputStream.close();
Path topicPath = getTopicPath(Path.of(topics.path));

try(InputStream inputStream = topicPath.toString().endsWith(".gz") ?
new GZIPInputStream(Files.newInputStream(topicPath, StandardOpenOption.READ)) :
Files.newInputStream(topicPath, StandardOpenOption.READ)) {
// Get the constructor
Constructor[] ctors = topics.readerClass.getDeclaredConstructors();
// The one we want is always the zero-th one; pass in a dummy Path.
TopicReader<K> reader = (TopicReader<K>) ctors[0].newInstance(Paths.get("."));
return reader.read(raw);

return reader.read(new BufferedReader(new InputStreamReader(inputStream)));
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/io/anserini/search/topicreader/Topics.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,19 @@ public enum Topics {
TREC2019_DL_PASSAGE_UNICOIL(TsvIntTopicReader.class,"topics.dl19-passage.unicoil.0shot.tsv.gz"),
TREC2019_DL_PASSAGE_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl19-passage.unicoil-noexp.0shot.tsv.gz"),
TREC2019_DL_PASSAGE_SPLADE_DISTILL_COCODENSER_MEDIUM(TsvIntTopicReader.class,"topics.dl19-passage.splade_distil_cocodenser_medium.tsv.gz"),
TREC2019_DL_PASSAGE_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl19-passage.splade-pp-ed.tsv.gz"),
TREC2019_DL_PASSAGE_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl19-passage.splade-pp-sd.tsv.gz"),
TREC2019_DL_PASSAGE_COS_DPR_DISTIL(JsonIntVectorTopicReader.class, "topics.dl19-passage.cos-dpr-distil.jsonl.gz"),
TREC2019_DL_PASSAGE_BGE_BASE_EN_15(JsonIntVectorTopicReader.class, "topics.dl19-passage.bge-base-en-v1.5.jsonl.gz"),
TREC2020_DL(TsvIntTopicReader.class,"topics.dl20.txt"),
TREC2020_DL_WP(TsvIntTopicReader.class,"topics.dl20.wp.tsv.gz"),
TREC2020_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl20.unicoil.0shot.tsv.gz"),
TREC2020_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl20.unicoil-noexp.0shot.tsv.gz"),
TREC2020_DL_SPLADE_DISTILL_COCODENSER_MEDIUM(TsvIntTopicReader.class,"topics.dl20.splade_distil_cocodenser_medium.tsv.gz"),
TREC2020_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl20.splade-pp-ed.tsv.gz"),
TREC2020_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl20.splade-pp-sd.tsv.gz"),
TREC2020_DL_COS_DPR_DISTIL(JsonIntVectorTopicReader.class, "topics.dl20.cos-dpr-distil.jsonl.gz"),
TREC2020_DL_BGE_BASE_EN_15(JsonIntVectorTopicReader.class, "topics.dl20.bge-base-en-v1.5.jsonl.gz"),
TREC2021_DL(TsvIntTopicReader.class,"topics.dl21.txt"),
TREC2021_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl21.unicoil.0shot.tsv.gz"),
TREC2021_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl21.unicoil-noexp.0shot.tsv.gz"),
Expand All @@ -89,7 +95,10 @@ public enum Topics {
MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE(TsvIntTopicReader.class, "topics.msmarco-passage.dev-subset.unicoil-tilde-expansion.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX(TsvIntTopicReader.class, "topics.msmarco-passage.dev-subset.distill-splade-max.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_SPLADE_DISTILL_COCODENSER_MEDIUM(TsvIntTopicReader.class, "topics.msmarco-passage.dev-subset.splade_distil_cocodenser_medium.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_SPLADE_PP_ED(TsvIntTopicReader.class, "topics.msmarco-passage.dev-subset.splade-pp-ed.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_SPLADE_PP_SD(TsvIntTopicReader.class, "topics.msmarco-passage.dev-subset.splade-pp-sd.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_COS_DPR_DISTIL(JsonIntVectorTopicReader.class, "topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_BGE_BASE_EN_15(JsonIntVectorTopicReader.class, "topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz"),
MSMARCO_PASSAGE_TEST_SUBSET(TsvIntTopicReader.class, "topics.msmarco-passage.test-subset.txt"),

// MS MARCO V2 topics
Expand Down
Loading

0 comments on commit f02e6f1

Please sign in to comment.