Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor QuALITY data loading to use LoadCSV and update preprocessing…
Browse files Browse the repository at this point in the history
… steps

Signed-off-by: elronbandel <[email protected]>
elronbandel committed Jan 29, 2025
1 parent 3c0c987 commit 4049401
Showing 2 changed files with 112 additions and 67 deletions.
88 changes: 21 additions & 67 deletions prepare/cards/quality.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,39 @@
import json

import requests

import unitxt
from unitxt.card import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromDictionary
from unitxt.collections_operators import Explode
from unitxt.loaders import LoadCSV
from unitxt.operators import (
Copy,
MapInstanceValues,
Set,
)
from unitxt.splitters import SplitRandomMix
from unitxt.test_utils.card import test_card

file_path = "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped."

def load_quality_split(split: str,
                       base_url: str = "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1") -> list:
    """Download and flatten one split of the QuALITY dataset.

    Fetches the JSON-Lines file for *split* and flattens each article's
    questions into one record per question, duplicating the article text
    as the ``context`` of every record.

    Args:
        split: File suffix of the split to download (e.g. ``"train"``, ``"dev"``).
        base_url: Root URL of the raw data files.  NOTE(review): this default
            tracks the mutable ``main`` branch while the module-level
            ``file_path`` pins a specific commit — consider pinning here too.

    Returns:
        A list of dicts with keys ``context``, ``question``, ``choices``,
        ``answer`` and ``topic``.

    Raises:
        requests.HTTPError: If the download fails (via ``raise_for_status``).
        KeyError: If a required field is missing from the source data.
    """
    url = f"{base_url}/QuALITY.v1.0.1.htmlstripped.{split}"
    response = requests.get(url)
    response.raise_for_status()

    processed_data = []
    # splitlines() handles \r\n correctly; split('\n') would leave a
    # trailing '\r' on every line of CRLF-encoded input and break json.loads.
    for line in response.text.splitlines():
        if not line.strip():
            continue
        article_data = json.loads(line)

        # Hoist per-article fields once instead of per question.
        article = article_data['article']
        topic = article_data.get('topic', '')

        for question in article_data.get('questions', []):
            processed_data.append({
                'context': article,
                'question': question['question'],
                'choices': question['options'],
                # 'gold_label' may be absent (e.g. withheld test labels).
                'answer': question.get('gold_label', None),
                'topic': topic,
            })

    return processed_data


def load_quality_data():
    """Fetch every available QuALITY split, skipping any that fail to load.

    Returns:
        Dict mapping split name ('train', 'validation') to its list of
        processed question records; splits that could not be downloaded
        are omitted after printing a warning (best-effort loading).
    """
    split_files = {'train': 'train', 'validation': 'dev'}
    data = {}
    for split_name, file_split in split_files.items():
        try:
            data[split_name] = load_quality_split(file_split)
        except Exception as e:
            print(f"Warning: Could not load {file_split} split: {e}")
    return data


# Create the card using LoadFromDictionary
with unitxt.settings.context(allow_unverified_code=True):
card = TaskCard(
loader=LoadFromDictionary(
data=load_quality_data(),
data_classification_policy=["public"]
loader=LoadCSV(
files={"train": file_path + "train", "validation": file_path + "dev"},
file_type="json",
lines=True,
data_classification_policy=["public"],
),
preprocess_steps=[
SplitRandomMix(
{"train": "train[80%]", "validation": "validation[100%]", "test": "train[20%]"}
{
"train": "train[80%]",
"validation": "train[20%]",
"test": "validation",
}
),
Copy(field="article", to_field="context"),
Explode(field="questions", to_field="data"),
Copy(field="data/question", to_field="question"),
Copy(field="data/options", to_field="choices"),
Copy(field="data/gold_label", to_field="answer"),
MapInstanceValues(
mappers={
"answer": {
@@ -103,13 +60,10 @@ def load_quality_data():
"task_categories": [
"question-answering",
"multiple-choice",
"reading-comprehension"
"reading-comprehension",
],
"multilinguality": "monolingual",
"task_ids": [
"extractive-qa",
"reading-comprehension"
],
"task_ids": ["extractive-qa", "reading-comprehension"],
},
)

91 changes: 91 additions & 0 deletions src/unitxt/catalog/cards/quality.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"__type__": "task_card",
"loader": {
"__type__": "load_csv",
"files": {
"train": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.train",
"validation": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.dev"
},
"file_type": "json",
"lines": true,
"data_classification_policy": [
"public"
]
},
"preprocess_steps": [
{
"__type__": "split_random_mix",
"mix": {
"train": "train[80%]",
"validation": "train[20%]",
"test": "validation"
}
},
{
"__type__": "copy",
"field": "article",
"to_field": "context"
},
{
"__type__": "explode",
"field": "questions",
"to_field": "data"
},
{
"__type__": "copy",
"field": "data/question",
"to_field": "question"
},
{
"__type__": "copy",
"field": "data/options",
"to_field": "choices"
},
{
"__type__": "copy",
"field": "data/gold_label",
"to_field": "answer"
},
{
"__type__": "map_instance_values",
"mappers": {
"answer": {
"1": 0,
"2": 1,
"3": 2,
"4": 3,
"5": 4
}
}
},
{
"__type__": "set",
"fields": {
"context_type": "document"
}
}
],
"task": "tasks.qa.multiple_choice.with_context",
"templates": "templates.qa.multiple_choice.with_context.all",
"__description__": "QuALITY (Question Answering with Long Input Texts, Yes!) is a multiple-choice reading comprehension dataset with long documents. The dataset comprises documents from Project Gutenberg and questions written by human annotators. Each question has 4-5 answer choices, and requires understanding of the entire document to answer correctly. Questions are designed to test comprehensive understanding of the entire document, with various difficulty levels.",
"__tags__": {
"annotations_creators": "expert-generated",
"language": [
"en"
],
"license": "cc-by-4.0",
"size_categories": [
"10K<n<100K"
],
"task_categories": [
"question-answering",
"multiple-choice",
"reading-comprehension"
],
"multilinguality": "monolingual",
"task_ids": [
"extractive-qa",
"reading-comprehension"
]
}
}

0 comments on commit 4049401

Please sign in to comment.