OrConvQA passages datasets and LZ4 store (thanks irds)
bpiwowar committed Dec 12, 2023
1 parent 3b21b15 commit c254efb
Showing 6 changed files with 224 additions and 9 deletions.
2 changes: 2 additions & 0 deletions docs/source/datasets/conversation.rst
@@ -6,3 +6,5 @@
Contextualized Query Rewriting
==============================

.. dm:datasets:: com.github.aagohary.canard text
.. dm:datasets:: com.github.prdwb.orconvqa text
57 changes: 50 additions & 7 deletions src/datamaestro_text/config/com/github/prdwb/orconvqa.py
@@ -1,5 +1,11 @@
# See documentation on https://datamaestro.readthedocs.io

from collections import namedtuple
import gzip
import json
from pathlib import Path
from typing import Iterator, NamedTuple
import attrs
from datamaestro.definitions import datatasks, datatags, dataset
from datamaestro.download.single import filedownloader
from datamaestro.utils import HashCheck
@@ -8,6 +14,12 @@
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
from datamaestro.data.ml import Supervised

from datamaestro_text.data.ir import DocumentStore
from datamaestro_text.data.ir.formats import OrConvQADocument
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader


@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@@ -31,14 +43,13 @@
url="https://github.com/prdwb/orconvqa-release",
)
def preprocessed(train, dev, test):
"""Question-in-context rewriting
"""Open-Retrieval Conversational Question Answering datasets
OrConvQA is an aggregation of three existing datasets:
CANARD is a dataset for question-in-context rewriting that consists of
questions each given in a dialog context together with a context-independent
rewriting of the question. The context of each question is the dialog
utterances that precede the question. CANARD can be used to evaluate
question rewriting models that handle important linguistic phenomena such as
co-reference and ellipsis resolution.
1. the QuAC dataset, which offers information-seeking conversations,
2. the CANARD dataset, which consists of context-independent rewrites of QuAC questions, and
3. the Wikipedia corpus, which serves as the knowledge source for answering questions.
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
"""
@@ -47,3 +58,35 @@ def preprocessed(train, dev, test):
"validation": OrConvQADataset(path=dev),
"test": OrConvQADataset(path=test),
}


def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
with gzip.open(source, "rt") as fp:
for line in fp:
yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))


@lz4docstore_downloader(
"all_blocks",
"https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
orConvQADocumentReader,
OrConvQADocumentStore.NAMED_TUPLE,
"id",
checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
size=5_086_902_800,
count_hint=11_377_951,
)
@dataset(
OrConvQADocumentStore,
url="https://github.com/prdwb/orconvqa-release",
)
def passages(all_blocks):
"""orConvQA wikipedia files
OrConvQA is an aggregation of three existing datasets:
1. the QuAC dataset, which offers information-seeking conversations,
2. the CANARD dataset, which consists of context-independent rewrites of QuAC questions, and
3. the Wikipedia corpus, which serves as the knowledge source for answering questions.
"""
return {"path": all_blocks, "count": 11_377_951}
14 changes: 14 additions & 0 deletions src/datamaestro_text/data/ir/formats.py
@@ -130,6 +130,20 @@ def get_text(self):
return f"{self.text}"


@define
class OrConvQADocument(IDHolder, Document):
id: str
title: str
text: str
aid: str
bid: int

has_text: ClassVar[bool] = True

def get_text(self):
return f"{self.title} {self.text}"


@define
class TrecTopic(GenericTopic):
text: str
22 changes: 22 additions & 0 deletions src/datamaestro_text/data/ir/stores.py
@@ -0,0 +1,22 @@
from collections import namedtuple
from typing import List
from experimaestro import Constant
import attrs

from datamaestro_text.datasets.irds.data import LZ4DocumentStore
from datamaestro_text.data.ir.formats import OrConvQADocument


class OrConvQADocumentStore(LZ4DocumentStore):
NAMED_TUPLE = namedtuple(
"OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
)

lookup_field: Constant[str] = "id"
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
index_fields: Constant[List[str]] = ["id"]

data_cls = NAMED_TUPLE

def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
return OrConvQADocument(**data._asdict())
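The namedtuple above mirrors the attrs fields of OrConvQADocument, so converter() is a plain keyword expansion of _asdict(). A self-contained sketch with made-up values (not actual corpus content):

    from datamaestro_text.data.ir.formats import OrConvQADocument
    from datamaestro_text.data.ir.stores import OrConvQADocumentStore

    raw = OrConvQADocumentStore.NAMED_TUPLE(
        id="0", title="Anarchism", text="Anarchism is a political philosophy.",
        aid="a0", bid=0,
    )
    doc = OrConvQADocument(**raw._asdict())
    assert doc.get_text() == "Anarchism Anarchism is a political philosophy."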
67 changes: 65 additions & 2 deletions src/datamaestro_text/datasets/irds/data.py
@@ -1,7 +1,9 @@
import logging
from typing import Any, Iterator, Tuple, Type, List
from pathlib import Path
from typing import Any, Iterator, NamedTuple, Tuple, Type, List
import attrs
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore
from ir_datasets.formats import (
GenericDoc,
GenericQuery,
@@ -10,7 +12,7 @@
TrecQuery,
)
import ir_datasets.datasets as _irds
from experimaestro import Config
from experimaestro import Config, Param
from experimaestro.compat import cached_property
from experimaestro import Option
import datamaestro_text.data.ir as ir
@@ -208,6 +210,67 @@ def converter(self):
)


# Fix until PR https://github.com/allenai/ir_datasets/pull/252
# is merged.
class DMPickleLz4FullStore(PickleLz4FullStore):
def get_many(self, doc_ids, field=None):
result = {}
field_idx = self._doc_cls._fields.index(field) if field is not None else None
for doc in self.get_many_iter(doc_ids):
if field is not None:
result[getattr(doc, self._id_field)] = doc[field_idx]
else:
result[getattr(doc, self._id_field)] = doc
return result


class LZ4DocumentStore(ir.DocumentStore):
"""A LZ4-based document store"""

path: Param[Path]

#: Lookup field
lookup_field: Param[str]

#: Extra indexed fields (e.g. URLs)
index_fields: List[str]

@cached_property
def store(self):
return DMPickleLz4FullStore(
self.path, None, self.data_cls, self.lookup_field, self.index_fields
)

@cached_property
def _docs(self):
return self.store.__iter__()

def docid_internal2external(self, ix: int):
return getattr(self._docs[ix], self.store._id_field)

def document_ext(self, docid: str) -> Document:
return self.converter(self.store.get(docid))

def documents_ext(self, docids: List[str]) -> List[Document]:
"""Returns documents given their external IDs (optimized for batch)"""
retrieved = self.store.get_many(docids)
return [self.converter(retrieved[docid]) for docid in docids]

def converter(self, data):
"""Converts a document from LZ4 tuples to any other format"""
# By default, use identity
return data

def iter(self) -> Iterator[Document]:
"""Returns an iterator over documents"""
return map(self.converter, self.store.__iter__())

def documentcount(self):
if self.count:
return self.count
return self.store.count()


@attrs.define()
class IRDSQueryWrapper(ir.Topic):
query: Any
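A note on the get_many patch above: ir_datasets document classes are namedtuples, so a single field can be fetched by position once its index is known. A self-contained sketch of that mechanism (generic namedtuple, not the real document class):

    from collections import namedtuple

    Doc = namedtuple("Doc", ["doc_id", "title", "text"])
    doc = Doc("0", "Anarchism", "A political philosophy.")

    # what get_many(..., field="title") yields for each looked-up id
    field_idx = Doc._fields.index("title")
    assert doc[field_idx] == "Anarchism"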
71 changes: 71 additions & 0 deletions src/datamaestro_text/datasets/irds/helpers.py
@@ -0,0 +1,71 @@
import logging
from typing import Optional, Type, Callable, Iterator
from ir_datasets.indices import PickleLz4FullStore
from datamaestro.download import Download
from datamaestro.utils import FileChecker
from pathlib import Path
import urllib3


class lz4docstore_downloader(Download):
"""Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""

def __init__(
self,
varname: str,
url: str,
iter_factory: Callable[[Path], Iterator],
doc_cls: Type,
lookup_field: str,
*,
count_hint: Optional[int] = None,
size: Optional[int] = None,
checker: Optional[FileChecker] = None,
):
super().__init__(varname)
self.iter_factory = iter_factory
self.url = url
self.doc_cls = doc_cls
self.size = size
self.lookup_field = lookup_field
self.count_hint = count_hint
self.checker = checker

p = urllib3.util.parse_url(self.url)
assert p is not None
self.name = Path(p.path).with_suffix("").name

def prepare(self):
return self.definition.datapath / self.name

def download(self, force=False):
# Creates directory if needed
destination = self.definition.datapath / self.name
destination.mkdir(exist_ok=True)

# Early exit
if (destination / "done").is_file() and not force:
return True

# Download (cache)
logging.info("Building the document index")
with self.context.downloadURL(self.url, size=self.size) as file:
# Checks the file
if self.checker:
self.checker.check(file.path)

# Builds the LZ4 store
store = PickleLz4FullStore(
destination,
lambda: self.iter_factory(Path(file.path)),
self.doc_cls,
lookup_field=self.lookup_field,
index_fields=[self.lookup_field],
key_field_prefix=None,
size_hint=None,
count_hint=self.count_hint,
)
store.build()

# All good!
(destination / "done").touch()
