Fix/nested sequence structures in model features (#47)

* Fixed casting of nested sequence features and predefined-split handling for HF datasets
* README updates for Pfly and GitHub Actions YAML fix
wilhelm-lab · Nov 22, 2024 · afae26b · afae26b
1 parent 5fa3b4a
commit afae26b
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 16 deletions.
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
@@ -4,18 +4,15 @@ on:
   workflow_dispatch:
   workflow_run:
     workflows: ["Build"]
-    types:
-      - completed
-    branches:
-      - main
+    types: [completed]
+    branches: [main]
 
 jobs:
   release:
     runs-on: ubuntu-latest
     if: |
       github.event.workflow_run.conclusion == 'success' &&
-      github.event.workflow_run.head_branch == 'main' &&
-      github.ref == 'refs/heads/main'
+      github.event.workflow_run.head_branch == 'main'
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python

diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ $ pip install dlomix[wandb]
 - Fragment Ion Intensity Prediction:
     - a multi-output regression problem where the intensity values for fragment ions are predicted given a peptide sequence along with some additional features.
 
-- Peptide Detectability:
+- Peptide Detectability (Pfly) [4]:
     - a multi-class classification problem where the detectability of a peptide is predicted given the peptide sequence.
 
 
@@ -97,3 +97,7 @@ Robbin Bouwmeester, Ralf Gabriels, Niels Hulstaert, Lennart Martens, Sven Degroe
 bioRxiv 2020.03.28.013003; doi: 10.1101/2020.03.28.013003
 
 [3] Bouwmeester, R., Gabriels, R., Hulstaert, N. et al. DeepLC can predict retention times for peptides that carry as-yet unseen modifications. Nat Methods 18, 1363–1369 (2021). https://doi.org/10.1038/s41592-021-01301-5
+
+[**Detectability - Pfly**]
+
+[4] Abdul-Khalek, N., Picciani, M., Wimmer, R., Overgaard, M. T., Wilhelm, M., & Gregersen Echers, S. (2024). To fly, or not to fly, that is the question: A deep learning model for peptide detectability prediction in mass spectrometry. bioRxiv, 2024-10.
diff --git a/src/dlomix/data/dataset.py b/src/dlomix/data/dataset.py
@@ -259,6 +259,7 @@ def _load_dataset(self):
     def _load_from_hub(self):
         self.hf_dataset = load_dataset(self.data_source)
         self._empty_dataset_mode = False
+        self._is_predefined_split = True
         warnings.warn(
             'The provided data is assumed to be hosted on the Hugging Face Hub since data_format is set to "hub". Validation and test data sources will be ignored.'
         )
@@ -282,6 +283,7 @@ def _load_from_hub(self):
 
     def _load_from_inmemory_hf_dataset(self):
         self._empty_dataset_mode = False
+        self._is_predefined_split = True
         warnings.warn(
             f'The provided data is assumed to be an in-memory Hugging Face Dataset or DatasetDict object since data_format is set to "hf". Validation and test data sources will be ignored and the split names of the DatasetDict has to follow the default namings {PeptideDataset.DEFAULT_SPLIT_NAMES}.'
         )
@@ -308,7 +310,7 @@ def _load_from_inmemory_hf_dataset(self):
     def _decide_on_splitting(self):
         count_loaded_data_sources = len(self._data_files_available_splits)
 
-        # one non-train data source provided -> if test, then test only, if val, then do not split
+        # one data source provided -> if test, then test only, if val, then do not split
         if count_loaded_data_sources == 1:
             if (
                 self.test_data_source is not None
@@ -551,19 +553,23 @@ def _apply_processor_to_split(
         )
 
     def _cast_model_feature_types_to_float(self):
+        def cast_to_float(feature):
+            """Recursively casts Sequence and Value features to float32."""
+            if isinstance(feature, Sequence):
+                # Recursively apply the transformation to the nested feature
+                return Sequence(cast_to_float(feature.feature))
+            if isinstance(feature, Value):
+                return Value("float32")
+            return feature  # Return as is for unsupported feature types
+
         for split in self.hf_dataset.keys():
             new_features = self.hf_dataset[split].features.copy()
 
             for feature_name, feature_type in self.hf_dataset[split].features.items():
-                # ensure model features are casted to float for concatenation later
+                # Ensure model features are cast to float for concatenation later
                 if feature_name not in self.model_features:
                     continue
-                if feature_type.dtype.startswith("float"):
-                    continue
-                if isinstance(feature_type, Sequence):
-                    new_features[feature_name] = Sequence(Value("float32"))
-                if isinstance(feature_type, Value):
-                    new_features[feature_name] = Value("float32")
+                new_features[feature_name] = cast_to_float(feature_type)
 
             self.hf_dataset[split] = self.hf_dataset[split].cast(
                 new_features,

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -5,7 +5,7 @@
 from os.path import exists, join
 
 import pytest
-from datasets import load_dataset
+from datasets import Dataset, DatasetDict, load_dataset
 
 from dlomix.data import FragmentIonIntensityDataset, RetentionTimeDataset
 
@@ -17,6 +17,12 @@
 INTENSITY_CSV_EXAMPLE_URL = "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/intensity/intensity_data.csv"
 RT_HUB_DATASET_NAME = "Wilhelmlab/prospect-ptms-irt"
 
+RAW_GENERIC_NESTED_DATA = {
+    "seq": ["[UNIMOD:737]-DASAQTTSHELTIPN-[]", "[UNIMOD:737]-DLHTGRLC[UNIMOD:4]-[]"],
+    "nested_feature": [[[30, 64]], [[25, 35]]],
+    "label": [0.1, 0.2],
+}
+
 TEST_ASSETS_TO_DOWNLOAD = [
     RT_PARQUET_EXAMPLE_URL,
     RT_CSV_EXAMPLE_URL,
@@ -218,6 +224,51 @@ def test_csv_intensitydataset():
         > 0
     )
 
+
+def test_nested_model_features():
+    hfdata = Dataset.from_dict(RAW_GENERIC_NESTED_DATA)
+
+    intensity_dataset = FragmentIonIntensityDataset(
+        data_format="hf",
+        data_source=hfdata,
+        sequence_column="seq",
+        label_column="label",
+        model_features=["nested_feature"],
+    )
+
+    assert intensity_dataset.hf_dataset is not None
+    assert intensity_dataset._empty_dataset_mode is False
+
+    example = iter(intensity_dataset.tensor_train_data).next()
+    assert example[0]["nested_feature"].shape == [2, 1, 2]
+
+
+def test_no_split_datasetDict_hf_inmemory():
+    hfdata = Dataset.from_dict(RAW_GENERIC_NESTED_DATA)
+    hf_dataset = DatasetDict({"train": hfdata})
+
+    intensity_dataset = FragmentIonIntensityDataset(
+        data_format="hf",
+        data_source=hf_dataset,
+        sequence_column="seq",
+        label_column="label",
+    )
+
+    assert intensity_dataset.hf_dataset is not None
+    assert intensity_dataset._empty_dataset_mode is False
+    assert FragmentIonIntensityDataset.DEFAULT_SPLIT_NAMES[0] in list(
+        intensity_dataset.hf_dataset.keys()
+    )
+
+    assert (
+        len(
+            intensity_dataset.hf_dataset[
+                FragmentIonIntensityDataset.DEFAULT_SPLIT_NAMES[0]
+            ]
+        )
+        == 2
+    )
+
     # test saving and loading datasets with config
 
     # test learning alphabet for train/val and then using it for test with fallback