Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor QuALITY data loading to use LoadCSV and update preprocessing…
Browse files Browse the repository at this point in the history
… steps

Signed-off-by: elronbandel <[email protected]>
elronbandel committed Jan 29, 2025
1 parent 3c0c987 commit 4049401
Showing 2 changed files with 112 additions and 67 deletions.
88 changes: 21 additions & 67 deletions prepare/cards/quality.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,39 @@
import json

import requests

import unitxt
from unitxt.card import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromDictionary
from unitxt.collections_operators import Explode
from unitxt.loaders import LoadCSV
from unitxt.operators import (
Copy,
MapInstanceValues,
Set,
)
from unitxt.splitters import SplitRandomMix
from unitxt.test_utils.card import test_card

file_path = "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped."

def load_quality_split(split: str,
                       base_url: str = "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1") -> list:
    """Download and flatten one split of the QuALITY dataset.

    Fetches the JSON-Lines file for *split* and flattens each article's
    questions into one record per question, duplicating the article text
    as the ``context`` of every record.

    Args:
        split: File suffix of the split to download (e.g. ``"train"``, ``"dev"``).
        base_url: Root URL of the raw data files.  NOTE(review): this default
            tracks the mutable ``main`` branch while the module-level
            ``file_path`` pins a specific commit — consider pinning here too.

    Returns:
        A list of dicts with keys ``context``, ``question``, ``choices``,
        ``answer`` and ``topic``.

    Raises:
        requests.HTTPError: If the download fails (via ``raise_for_status``).
        KeyError: If a required field is missing from the source data.
    """
    url = f"{base_url}/QuALITY.v1.0.1.htmlstripped.{split}"
    response = requests.get(url)
    response.raise_for_status()

    processed_data = []
    # splitlines() handles \r\n correctly; split('\n') would leave a
    # trailing '\r' on every line of CRLF-encoded input and break json.loads.
    for line in response.text.splitlines():
        if not line.strip():
            continue
        article_data = json.loads(line)

        # Hoist per-article fields once instead of per question.
        article = article_data['article']
        topic = article_data.get('topic', '')

        for question in article_data.get('questions', []):
            processed_data.append({
                'context': article,
                'question': question['question'],
                'choices': question['options'],
                # 'gold_label' may be absent (e.g. withheld test labels).
                'answer': question.get('gold_label', None),
                'topic': topic,
            })

    return processed_data


def load_quality_data():
    """Fetch every available QuALITY split, skipping any that fail to load.

    Returns:
        Dict mapping split name ('train', 'validation') to its list of
        processed question records; splits that could not be downloaded
        are omitted after printing a warning (best-effort loading).
    """
    split_files = {'train': 'train', 'validation': 'dev'}
    data = {}
    for split_name, file_split in split_files.items():
        try:
            data[split_name] = load_quality_split(file_split)
        except Exception as e:
            print(f"Warning: Could not load {file_split} split: {e}")
    return data


# Create the card using LoadFromDictionary
with unitxt.settings.context(allow_unverified_code=True):
card = TaskCard(
loader=LoadFromDictionary(
data=load_quality_data(),
data_classification_policy=["public"]
loader=LoadCSV(
files={"train": file_path + "train", "validation": file_path + "dev"},
file_type="json",
lines=True,
data_classification_policy=["public"],
),
preprocess_steps=[
SplitRandomMix(
{"train": "train[80%]", "validation": "validation[100%]", "test": "train[20%]"}
{
"train": "train[80%]",
"validation": "train[20%]",
"test": "validation",
}
),
Copy(field="article", to_field="context"),
Explode(field="questions", to_field="data"),
Copy(field="data/question", to_field="question"),
Copy(field="data/options", to_field="choices"),
Copy(field="data/gold_label", to_field="answer"),
MapInstanceValues(
mappers={
"answer": {
@@ -103,13 +60,10 @@ def load_quality_data():
"task_categories": [
"question-answering",
"multiple-choice",
"reading-comprehension"
"reading-comprehension",
],
"multilinguality": "monolingual",
"task_ids": [
"extractive-qa",
"reading-comprehension"
],
"task_ids": ["extractive-qa", "reading-comprehension"],
},
)

91 changes: 91 additions & 0 deletions src/unitxt/catalog/cards/quality.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"__type__": "task_card",
"loader": {
"__type__": "load_csv",
"files": {
"train": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.train",
"validation": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.dev"
},
"file_type": "json",
"lines": true,
"data_classification_policy": [
"public"
]
},
"preprocess_steps": [
{
"__type__": "split_random_mix",
"mix": {
"train": "train[80%]",
"validation": "train[20%]",
"test": "validation"
}
},
{
"__type__": "copy",
"field": "article",
"to_field": "context"
},
{
"__type__": "explode",
"field": "questions",
"to_field": "data"
},
{
"__type__": "copy",
"field": "data/question",
"to_field": "question"
},
{
"__type__": "copy",
"field": "data/options",
"to_field": "choices"
},
{
"__type__": "copy",
"field": "data/gold_label",
"to_field": "answer"
},
{
"__type__": "map_instance_values",
"mappers": {
"answer": {
"1": 0,
"2": 1,
"3": 2,
"4": 3,
"5": 4
}
}
},
{
"__type__": "set",
"fields": {
"context_type": "document"
}
}
],
"task": "tasks.qa.multiple_choice.with_context",
"templates": "templates.qa.multiple_choice.with_context.all",
"__description__": "QuALITY (Question Answering with Long Input Texts, Yes!) is a multiple-choice reading comprehension dataset with long documents. The dataset comprises documents from Project Gutenberg and questions written by human annotators. Each question has 4-5 answer choices, and requires understanding of the entire document to answer correctly. Questions are designed to test comprehensive understanding of the entire document, with various difficulty levels.",
"__tags__": {
"annotations_creators": "expert-generated",
"language": [
"en"
],
"license": "cc-by-4.0",
"size_categories": [
"10K<n<100K"
],
"task_categories": [
"question-answering",
"multiple-choice",
"reading-comprehension"
],
"multilinguality": "monolingual",
"task_ids": [
"extractive-qa",
"reading-comprehension"
]
}
}

0 comments on commit 4049401

Please sign in to comment.