Add classification cfg checks for labels

h2oai · Nov 14, 2023 · 92b402f · 92b402f
1 parent 75acecf
commit 92b402f
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 1 deletion.
diff --git a/llm_studio/python_configs/text_causal_classification_modeling_config.py b/llm_studio/python_configs/text_causal_classification_modeling_config.py
@@ -1,6 +1,6 @@
 import os
 from dataclasses import dataclass, field
-from typing import Any, Tuple
+from typing import Any, Dict, List, Tuple
 
 import llm_studio.src.datasets.text_causal_classification_ds
 import llm_studio.src.plots.text_causal_classification_modeling_plots
@@ -188,3 +188,28 @@ def __post_init__(self):
             ),
             allow_custom=True,
         )
+
+    def check(self) -> Dict[str, List]:
+        errors: Dict[str, List] = {"title": [], "message": []}
+
+        if self.training.loss_function == "CrossEntropyLoss":
+            if self.dataset.num_classes == 1:
+                errors["title"] += ["CrossEntropyLoss requires num_classes > 1"]
+                errors["message"] += [
+                    "CrossEntropyLoss requires num_classes > 1, "
+                    "but num_classes is set to 1."
+                ]
+        elif self.training.loss_function == "BinaryCrossEntropyLoss":
+            if self.dataset.num_classes != 1:
+                errors["title"] += ["BinaryCrossEntropyLoss requires num_classes == 1"]
+                errors["message"] += [
+                    "BinaryCrossEntropyLoss requires num_classes == 1, "
+                    "but num_classes is set to {}.".format(self.dataset.num_classes)
+                ]
+        if self.dataset.parent_id_column not in ["None", None]:
+            errors["title"] += ["Parent ID column is not supported for classification"]
+            errors["message"] += [
+                "Parent ID column is not supported for classification datasets."
+            ]
+
+        return errors
diff --git a/llm_studio/src/datasets/text_causal_classification_ds.py b/llm_studio/src/datasets/text_causal_classification_ds.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict
 
 import numpy as np
@@ -9,12 +10,15 @@
 )
 from llm_studio.src.utils.exceptions import LLMDataException
 
+logger = logging.getLogger(__name__)
+
 
 class CustomDataset(TextCausalLanguageModelingCustomDataset):
     def __init__(self, df: pd.DataFrame, cfg: Any, mode: str = "train"):
         super().__init__(df=df, cfg=cfg, mode=mode)
         check_for_non_int_answers(cfg, df)
         self.answers_int = df[cfg.dataset.answer_column].astype(int).values.tolist()
+
         if 1 < cfg.dataset.num_classes <= max(self.answers_int):
             raise LLMDataException(
                 "Number of classes is smaller than max label "
@@ -25,6 +29,20 @@ def __init__(self, df: pd.DataFrame, cfg: Any, mode: str = "train"):
                 "For binary classification, max label should be 1 but is "
                 f"{max(self.answers_int)}."
             )
+        if min(self.answers_int) < 0:
+            raise LLMDataException(
+                "Labels should be non-negative but min label is "
+                f"{min(self.answers_int)}."
+            )
+        if (
+            min(self.answers_int) != 0
+            or max(self.answers_int) != len(set(self.answers_int)) - 1
+        ):
+            logger.warning(
+                "Labels should start at 0 and be continuous but are "
+                f"{sorted(set(self.answers_int))}."
+            )
+
         if cfg.dataset.parent_id_column != "None":
             raise LLMDataException(
                 "Parent ID column is not supported for classification datasets."