-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
139 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
datamaestro>=1.1.0 | ||
datamaestro>=1.1.1 | ||
ir_datasets | ||
attrs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# See documentation on https://datamaestro.readthedocs.io | ||
|
||
from pathlib import Path | ||
from datamaestro.definitions import datatasks, datatags, dataset | ||
from datamaestro.data.ml import Supervised | ||
from datamaestro.download.archive import zipdownloader | ||
from datamaestro.utils import HashCheck | ||
from datamaestro_text.data.conversation.qrecc import QReCCDataset | ||
|
||
|
||
@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@zipdownloader(
    "data",
    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
)
@dataset(
    Supervised[QReCCDataset, None, QReCCDataset],
    url="https://github.com/apple/ml-qrecc",
    doi="https://doi.org/10.48550/arXiv.2010.04898",
    id="",
)
def main(data: Path):
    """Open-Domain Question Answering Goes Conversational via Question Rewriting
    We introduce QReCC (Question Rewriting in Conversational Context), an
    end-to-end open-domain question answering dataset comprising of 14K
    conversations with 81K question-answer pairs. The goal of this dataset is to
    provide a challenging benchmark for end-to-end conversational question
    answering that includes the individual subtasks of question rewriting,
    passage retrieval and reading comprehension
    """
    # NOTE(review): the docstring above is kept verbatim; the dataset framework
    # may read it as the dataset title/description -- confirm before rewording.

    # `data` is the folder handed over by the "data" zip downloader declared
    # above; both splits are JSON files located directly inside it.
    train_split = QReCCDataset(path=data / "qrecc_train.json")
    test_split = QReCCDataset(path=data / "qrecc_test.json")

    return {"train": train_split, "test": test_split}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
from functools import cached_property | ||
from typing import Iterator, List, Optional | ||
from attr import define | ||
import json | ||
from datamaestro.data import File | ||
from datamaestro.record import Record | ||
|
||
from datamaestro_text.data.ir.base import ( | ||
IDItem, | ||
SimpleTextItem, | ||
) | ||
|
||
|
||
from .base import ( | ||
AnswerDocumentURL, | ||
AnswerEntry, | ||
ConversationTree, | ||
EntryType, | ||
RetrievedEntry, | ||
SimpleDecontextualizedItem, | ||
SingleConversationTree, | ||
) | ||
from . import ConversationDataset | ||
|
||
|
||
@define(kw_only=True)
class QReCCDatasetEntry:
    """One turn of a QReCC conversation: a query together with its history"""

    conversation_no: int
    """Identifier of the conversation this turn belongs to"""

    turn_no: int
    """Position of this turn within the conversation"""

    conversation_source: str
    """Source of the conversation"""

    question: str
    """The query issued at this turn (the most recent one)"""

    rewrite: str
    """Manual rewrite of the query"""

    context: List[str]
    """The queries previously asked by the user in this conversation"""

    answer: str
    """Answer to the query"""

    answer_url: str
    """URL of the document containing the answer"""
|
||
|
||
class QReCCDataset(ConversationDataset, File):
    """QReCC conversations read from the JSON file located at ``self.path``"""

    def entries(self) -> Iterator[QReCCDatasetEntry]:
        """Iterates over re-written query with their context

        The JSON document is parsed in one go; each of its objects is turned
        into a :class:`QReCCDatasetEntry` after lower-casing its keys so that
        they match the entry field names.
        """
        # JSON text is UTF-8 (RFC 8259): do not depend on the locale encoding,
        # which is not UTF-8 on every platform.
        with self.path.open("rt", encoding="utf-8") as fp:
            raw_entries = json.load(fp)

        parsed = [
            QReCCDatasetEntry(**{key.lower(): value for key, value in raw.items()})
            for raw in raw_entries
        ]
        return iter(parsed)

    def __iter__(self) -> Iterator[ConversationTree]:
        """Groups consecutive entries into one conversation tree each

        Entries are grouped by runs of identical ``conversation_no``, in file
        order. Every turn contributes two records: the user query followed by
        the system answer. Nothing is yielded when the file has no entry.
        """
        history: List[Record] = []
        # `conversation_no` is an int; None means "no conversation started yet"
        current_id: Optional[int] = None

        for entry in self.entries():
            # Creates a new conversation if needed
            if entry.conversation_no != current_id:
                if current_id is not None:
                    # Records were appended in file order: reversing puts the
                    # last appended record first (presumably the order expected
                    # by SingleConversationTree -- TODO confirm).
                    history.reverse()
                    yield SingleConversationTree(current_id, history)

                current_id = entry.conversation_no
                history = []

            # Add to current: first the user query...
            history.append(
                Record(
                    IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
                    SimpleTextItem(entry.question),
                    AnswerDocumentURL(entry.answer_url),
                    SimpleDecontextualizedItem(entry.rewrite),
                    EntryType.USER_QUERY,
                )
            )

            # ... then the system answer for that query
            history.append(
                Record(
                    AnswerEntry(entry.answer),
                    EntryType.SYSTEM_ANSWER,
                )
            )

        # Yields the last one, unless there was no entry at all (in which case
        # no conversation was ever started and there is nothing to report)
        if current_id is not None:
            history.reverse()
            yield SingleConversationTree(current_id, history)