@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@zipdownloader(
    "data",
    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
)
@dataset(
    Supervised[QReCCDataset, None, QReCCDataset],
    url="https://github.com/apple/ml-qrecc",
    doi="https://doi.org/10.48550/arXiv.2010.04898",
    id="",
)
def main(data: Path):
    """Open-Domain Question Answering Goes Conversational via Question Rewriting

    We introduce QReCC (Question Rewriting in Conversational Context), an
    end-to-end open-domain question answering dataset comprising 14K
    conversations with 81K question-answer pairs. The goal of this dataset is to
    provide a challenging benchmark for end-to-end conversational question
    answering that includes the individual subtasks of question rewriting,
    passage retrieval and reading comprehension.
    """
    # `data` is the folder handed over for the "data" zip resource declared
    # above; both splits are read from JSON files directly inside it
    train_path = data / "qrecc_train.json"
    test_path = data / "qrecc_test.json"

    splits = {}
    splits["train"] = QReCCDataset(path=train_path)
    splits["test"] = QReCCDataset(path=test_path)
    return splits
@define(kw_only=True)
class QReCCDatasetEntry:
    """A query with past history"""

    conversation_no: int
    """Conversation ID"""

    turn_no: int
    """The turn in the conversation"""

    conversation_source: str
    """Conversation source"""

    question: str
    """The last issued query"""

    rewrite: str
    """Manually rewritten query"""

    context: List[str]
    """The list of queries asked by the user"""

    answer: str
    """The answer"""

    answer_url: str
    """The URL containing the answer"""


class QReCCDataset(ConversationDataset, File):
    """QReCC conversations read from a single JSON file

    The file at ``self.path`` is parsed as a JSON sequence of objects; each
    object is converted into a :class:`QReCCDatasetEntry` (one per turn).
    """

    def entries(self) -> Iterator[QReCCDatasetEntry]:
        """Iterates over re-written query with their context

        The whole file is parsed when the method is called (not lazily).

        :return: An iterator over the entries, in file order
        :raises TypeError: If the keys of a JSON object, once lower-cased, do
            not match the fields of :class:`QReCCDatasetEntry`
        """
        # JSON text is UTF-8: be explicit rather than relying on the locale's
        # default encoding, which differs between platforms
        with self.path.open("rt", encoding="utf-8") as fp:
            data = json.load(fp)

        # Keys are lower-cased so that they can be used as field names
        data = [
            QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
            for entry in data
        ]
        return iter(data)

    def __iter__(self) -> Iterator[ConversationTree]:
        """Iterates over conversations

        Consecutive entries sharing the same ``conversation_no`` are grouped
        into one :class:`SingleConversationTree`. Each entry contributes a user
        query record followed by a system answer record, and the resulting
        list is reversed before being handed to the tree (last record first).

        Grouping only compares an entry with the previous one: a conversation
        number that shows up again later in the file starts a new tree.

        Nothing is yielded when the dataset has no entry.
        """
        history: List[Record] = []
        # Conversation numbers are integers (see QReCCDatasetEntry)
        current_id: Optional[int] = None

        for entry in self.entries():
            # Creates a new conversation if needed
            if entry.conversation_no != current_id:
                if current_id is not None:
                    history.reverse()
                    yield SingleConversationTree(current_id, history)

                current_id = entry.conversation_no
                history = []

            # Add to current: first the user query...
            history.append(
                Record(
                    IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
                    SimpleTextItem(entry.question),
                    AnswerDocumentURL(entry.answer_url),
                    SimpleDecontextualizedItem(entry.rewrite),
                    EntryType.USER_QUERY,
                )
            )

            # ... then the system answer to this query
            history.append(
                Record(
                    AnswerEntry(entry.answer),
                    EntryType.SYSTEM_ANSWER,
                )
            )

        # Yields the last one, if any: without this guard an empty dataset
        # would produce a conversation with a `None` identifier and no record
        if current_id is not None:
            history.reverse()
            yield SingleConversationTree(current_id, history)