OrConvQA passages datasets and LZ4 store (thanks irds)
bpiwowar committed Dec 12, 2023
1 parent 3b21b15 commit c254efb
Showing 6 changed files with 224 additions and 9 deletions.
2 changes: 2 additions & 0 deletions docs/source/datasets/conversation.rst
@@ -6,3 +6,5 @@
Contextualized Query Rewriting
==============================

.. dm:datasets:: com.github.aagohary.canard text
.. dm:datasets:: com.github.prdwb.orconvqa text
57 changes: 50 additions & 7 deletions src/datamaestro_text/config/com/github/prdwb/orconvqa.py
@@ -1,5 +1,11 @@
# See documentation on https://datamaestro.readthedocs.io

from collections import namedtuple
import gzip
import json
from pathlib import Path
from typing import Iterator, NamedTuple
import attrs
from datamaestro.definitions import datatasks, datatags, dataset
from datamaestro.download.single import filedownloader
from datamaestro.utils import HashCheck
@@ -8,6 +14,12 @@
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
from datamaestro.data.ml import Supervised

from datamaestro_text.data.ir import DocumentStore
from datamaestro_text.data.ir.formats import OrConvQADocument
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader


@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@@ -31,14 +43,13 @@
url="https://github.com/prdwb/orconvqa-release",
)
def preprocessed(train, dev, test):
"""Question-in-context rewriting
"""Open-Retrieval Conversational Question Answering datasets
OrConvQA is an aggregation of three existing datasets:
CANARD is a dataset for question-in-context rewriting that consists of
questions each given in a dialog context together with a context-independent
rewriting of the question. The context of each question is the dialog
utterances that precede the question. CANARD can be used to evaluate
question rewriting models that handle important linguistic phenomena such as
co-reference and ellipsis resolution.
1. the QuAC dataset, which offers information-seeking conversations,
2. the CANARD dataset, which consists of context-independent rewrites of QuAC questions, and
3. the Wikipedia corpus, which serves as the knowledge source for answering questions.
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
"""
@@ -47,3 +58,35 @@ def preprocessed(train, dev, test):
"validation": OrConvQADataset(path=dev),
"test": OrConvQADataset(path=test),
}


def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
with gzip.open(source, "rt") as fp:
for line in fp:
yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))


@lz4docstore_downloader(
"all_blocks",
"https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
orConvQADocumentReader,
OrConvQADocumentStore.NAMED_TUPLE,
"id",
checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
size=5_086_902_800,
count_hint=11_377_951,
)
@dataset(
OrConvQADocumentStore,
url="https://github.com/prdwb/orconvqa-release",
)
def passages(all_blocks):
"""orConvQA wikipedia files
OrConvQA is an aggregation of three existing datasets:
1. the QuAC dataset, which offers information-seeking conversations,
2. the CANARD dataset, which consists of context-independent rewrites of QuAC questions, and
3. the Wikipedia corpus, which serves as the knowledge source for answering questions.
"""
return {"path": all_blocks, "count": 11_377_951}
14 changes: 14 additions & 0 deletions src/datamaestro_text/data/ir/formats.py
@@ -130,6 +130,20 @@ def get_text(self):
return f"{self.text}"


@define
class OrConvQADocument(IDHolder, Document):
id: str
title: str
text: str
aid: str
bid: int

has_text: ClassVar[bool] = True

def get_text(self):
return f"{self.title} {self.text}"


@define
class TrecTopic(GenericTopic):
text: str
22 changes: 22 additions & 0 deletions src/datamaestro_text/data/ir/stores.py
@@ -0,0 +1,22 @@
from collections import namedtuple
from typing import List
from experimaestro import Constant
import attrs

from datamaestro_text.datasets.irds.data import LZ4DocumentStore
from datamaestro_text.data.ir.formats import OrConvQADocument


class OrConvQADocumentStore(LZ4DocumentStore):
NAMED_TUPLE = namedtuple(
"OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
)

lookup_field: Constant[str] = "id"
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
index_fields: Constant[List[str]] = ["id"]

data_cls = NAMED_TUPLE

def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
return OrConvQADocument(**data._asdict())
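The namedtuple above mirrors the attrs fields of OrConvQADocument, so converter() is a plain keyword expansion of _asdict(). A self-contained sketch with made-up values (not actual corpus content):

    from datamaestro_text.data.ir.formats import OrConvQADocument
    from datamaestro_text.data.ir.stores import OrConvQADocumentStore

    raw = OrConvQADocumentStore.NAMED_TUPLE(
        id="0", title="Anarchism", text="Anarchism is a political philosophy.",
        aid="a0", bid=0,
    )
    doc = OrConvQADocument(**raw._asdict())
    assert doc.get_text() == "Anarchism Anarchism is a political philosophy."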
67 changes: 65 additions & 2 deletions src/datamaestro_text/datasets/irds/data.py
@@ -1,7 +1,9 @@
import logging
from typing import Any, Iterator, Tuple, Type, List
from pathlib import Path
from typing import Any, Iterator, NamedTuple, Tuple, Type, List
import attrs
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore
from ir_datasets.formats import (
GenericDoc,
GenericQuery,
@@ -10,7 +12,7 @@
TrecQuery,
)
import ir_datasets.datasets as _irds
from experimaestro import Config
from experimaestro import Config, Param
from experimaestro.compat import cached_property
from experimaestro import Option
import datamaestro_text.data.ir as ir
@@ -208,6 +210,67 @@ def converter(self):
)


# Fix until PR https://github.com/allenai/ir_datasets/pull/252
# is merged.
class DMPickleLz4FullStore(PickleLz4FullStore):
def get_many(self, doc_ids, field=None):
result = {}
field_idx = self._doc_cls._fields.index(field) if field is not None else None
for doc in self.get_many_iter(doc_ids):
if field is not None:
result[getattr(doc, self._id_field)] = doc[field_idx]
else:
result[getattr(doc, self._id_field)] = doc
return result


class LZ4DocumentStore(ir.DocumentStore):
"""A LZ4-based document store"""

path: Param[Path]

#: Lookup field
lookup_field: Param[str]

#: Extra indexed fields (e.g. URLs)
index_fields: List[str]

@cached_property
def store(self):
return DMPickleLz4FullStore(
self.path, None, self.data_cls, self.lookup_field, self.index_fields
)

@cached_property
def _docs(self):
return self.store.__iter__()

def docid_internal2external(self, ix: int):
return getattr(self._docs[ix], self.store._id_field)

def document_ext(self, docid: str) -> Document:
return self.converter(self.store.get(docid))

def documents_ext(self, docids: List[str]) -> List[Document]:
"""Returns documents given their external IDs (optimized for batch)"""
retrieved = self.store.get_many(docids)
return [self.converter(retrieved[docid]) for docid in docids]

def converter(self, data):
"""Converts a document from LZ4 tuples to any other format"""
# By default, use identity
return data

def iter(self) -> Iterator[Document]:
"""Returns an iterator over documents"""
return map(self.converter, self.store.__iter__())

def documentcount(self):
if self.count:
return self.count
return self.store.count()


@attrs.define()
class IRDSQueryWrapper(ir.Topic):
query: Any
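A note on the get_many patch above: ir_datasets document classes are namedtuples, so a single field can be fetched by position once its index is known. A self-contained sketch of that mechanism (generic namedtuple, not the real document class):

    from collections import namedtuple

    Doc = namedtuple("Doc", ["doc_id", "title", "text"])
    doc = Doc("0", "Anarchism", "A political philosophy.")

    # what get_many(..., field="title") yields for each looked-up id
    field_idx = Doc._fields.index("title")
    assert doc[field_idx] == "Anarchism"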
71 changes: 71 additions & 0 deletions src/datamaestro_text/datasets/irds/helpers.py
@@ -0,0 +1,71 @@
import logging
from typing import Optional, Type, Callable, Iterator
from ir_datasets.indices import PickleLz4FullStore
from datamaestro.download import Download
from datamaestro.utils import FileChecker
from pathlib import Path
import urllib3


class lz4docstore_downloader(Download):
"""Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""

def __init__(
self,
varname: str,
url: str,
iter_factory: Callable[[Path], Iterator],
doc_cls: Type,
lookup_field: str,
*,
count_hint: Optional[int] = None,
size: Optional[int] = None,
checker: Optional[FileChecker] = None,
):
super().__init__(varname)
self.iter_factory = iter_factory
self.url = url
self.doc_cls = doc_cls
self.size = size
self.lookup_field = lookup_field
self.count_hint = count_hint
self.checker = checker

p = urllib3.util.parse_url(self.url)
assert p is not None
self.name = Path(p.path).with_suffix("").name

def prepare(self):
return self.definition.datapath / self.name

def download(self, force=False):
# Creates directory if needed
destination = self.definition.datapath / self.name
destination.mkdir(exist_ok=True)

# Early exit
if (destination / "done").is_file() and not force:
return True

# Download (cache)
logging.info("Building the document index")
with self.context.downloadURL(self.url, size=self.size) as file:
# Checks the file
if self.checker:
self.checker.check(file.path)

# Builds the LZ4 store
store = PickleLz4FullStore(
destination,
lambda: self.iter_factory(Path(file.path)),
self.doc_cls,
lookup_field=self.lookup_field,
index_fields=[self.lookup_field],
key_field_prefix=None,
size_hint=None,
count_hint=self.count_hint,
)
store.build()

# All good!
(destination / "done").touch()
