Skip to content

Commit

Permalink
QReCC dataset (conversations only)
Browse files Browse the repository at this point in the history
  • Loading branch information
bpiwowar committed May 31, 2024
1 parent d123aaa commit ed2f0ef
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 1 deletion.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
-datamaestro>=1.1.0
+datamaestro>=1.1.1
ir_datasets
attrs
37 changes: 37 additions & 0 deletions src/datamaestro_text/config/com/github/apple/ml-qrecc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# See documentation on https://datamaestro.readthedocs.io

from pathlib import Path
from datamaestro.definitions import datatasks, datatags, dataset
from datamaestro.data.ml import Supervised
from datamaestro.download.archive import zipdownloader
from datamaestro.utils import HashCheck
from datamaestro_text.data.conversation.qrecc import QReCCDataset


@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@zipdownloader(
    "data",
    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
)
@dataset(
    Supervised[QReCCDataset, None, QReCCDataset],
    url="https://github.com/apple/ml-qrecc",
    doi="https://doi.org/10.48550/arXiv.2010.04898",
    id="",
)
def main(data: Path):
    """Open-Domain Question Answering Goes Conversational via Question Rewriting
    We introduce QReCC (Question Rewriting in Conversational Context), an
    end-to-end open-domain question answering dataset comprising of 14K
    conversations with 81K question-answer pairs. The goal of this dataset is to
    provide a challenging benchmark for end-to-end conversational question
    answering that includes the individual subtasks of question rewriting,
    passage retrieval and reading comprehension
    """
    # `data` is the resource named "data" in the zipdownloader decorator above;
    # the two splits are read from the JSON files directly under that path.
    train_split = QReCCDataset(path=data / "qrecc_train.json")
    test_split = QReCCDataset(path=data / "qrecc_test.json")

    return {"train": train_split, "test": test_split}
101 changes: 101 additions & 0 deletions src/datamaestro_text/data/conversation/qrecc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from functools import cached_property
from typing import Iterator, List, Optional
from attr import define
import json
from datamaestro.data import File
from datamaestro.record import Record

from datamaestro_text.data.ir.base import (
IDItem,
SimpleTextItem,
)


from .base import (
AnswerDocumentURL,
AnswerEntry,
ConversationTree,
EntryType,
RetrievedEntry,
SimpleDecontextualizedItem,
SingleConversationTree,
)
from . import ConversationDataset


@define(kw_only=True)
class QReCCDatasetEntry:
    """A query with past history

    One record of a QReCC JSON file. `QReCCDataset.entries` builds an instance
    from each JSON object by lower-casing its keys and passing them as keyword
    arguments, so the attribute names below must match the lower-cased JSON
    keys. Every field is keyword-only and has no default, hence is required.
    """

    conversation_no: int
    """Conversation ID"""

    turn_no: int
    """The turn in the conversation"""

    conversation_source: str
    """Conversation source"""

    question: str
    """The last issued query"""

    rewrite: str
    """Manually rewritten query"""

    # NOTE(review): the upstream QReCC files appear to store the previous
    # questions *and* the previous answers in this list - confirm against the
    # dataset README before relying on the description below.
    context: List[str]
    """The list of queries asked by the user"""

    answer: str
    """The answer"""

    answer_url: str
    """The URL containing the answer"""


class QReCCDataset(ConversationDataset, File):
    """QReCC conversations read from the single JSON file at ``self.path``.

    The file is parsed as a JSON array of objects, one object per
    conversation turn. Iterating over the dataset groups consecutive turns
    sharing the same ``conversation_no`` into one conversation tree.
    """

    def entries(self) -> Iterator[QReCCDatasetEntry]:
        """Iterates over re-written query with their context

        The whole file is parsed and every entry is built before the iterator
        is returned, so malformed records fail on the call itself rather than
        midway through the iteration.

        :return: an iterator over one `QReCCDatasetEntry` per JSON object, in
            file order
        """
        # JSON is exchanged as UTF-8; do not depend on the platform's default
        # text encoding when decoding the file.
        with self.path.open("rt", encoding="utf-8") as fp:
            raw_entries = json.load(fp)

        # JSON keys are lower-cased to match the attribute names of
        # QReCCDatasetEntry.
        parsed = [
            QReCCDatasetEntry(**{key.lower(): value for key, value in raw.items()})
            for raw in raw_entries
        ]
        return iter(parsed)

    def __iter__(self) -> Iterator[ConversationTree]:
        """Yields one `SingleConversationTree` per conversation.

        Consecutive entries with the same ``conversation_no`` are assumed to
        belong to the same conversation. Each entry contributes two records
        (the user query, then the system answer); the accumulated list is
        reversed before being handed to the tree, so the last appended record
        comes first.

        Nothing is yielded when the file contains no entries.
        """
        history: List[Record] = []
        # `conversation_no` is declared as an int on QReCCDatasetEntry
        current_id: Optional[int] = None

        for entry in self.entries():
            # Creates a new conversation if needed
            if entry.conversation_no != current_id:
                if current_id is not None:
                    history.reverse()
                    yield SingleConversationTree(current_id, history)

                current_id = entry.conversation_no
                history = []

            # Add the user query (with its manual rewrite and answer URL) ...
            history.append(
                Record(
                    IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
                    SimpleTextItem(entry.question),
                    AnswerDocumentURL(entry.answer_url),
                    SimpleDecontextualizedItem(entry.rewrite),
                    EntryType.USER_QUERY,
                )
            )

            # ... followed by the system answer to that query
            history.append(
                Record(
                    AnswerEntry(entry.answer),
                    EntryType.SYSTEM_ANSWER,
                )
            )

        # Yields the last conversation; skipped when no entry was read so that
        # an empty file does not produce a tree with a None identifier.
        if current_id is not None:
            history.reverse()
            yield SingleConversationTree(current_id, history)

0 comments on commit ed2f0ef

Please sign in to comment.