Skip to content

Commit

Permalink
Fix/nested sequence structures in model features (#47)
Browse files Browse the repository at this point in the history
* fixed nested casting + predefined split for hf dataset

* readme updates pfly and action yaml fix
  • Loading branch information
omsh committed Nov 22, 2024
1 parent 5fa3b4a commit afae26b
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 16 deletions.
9 changes: 3 additions & 6 deletions .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@ on:
workflow_dispatch:
workflow_run:
workflows: ["Build"]
types:
- completed
branches:
- main
types: [completed]
branches: [main]

jobs:
release:
runs-on: ubuntu-latest
if: |
github.event.workflow_run.conclusion == 'success' &&
github.event.workflow_run.head_branch == 'main' &&
github.ref == 'refs/heads/main'
github.event.workflow_run.head_branch == 'main'
steps:
- uses: actions/checkout@v4
- name: Set up Python
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ $ pip install dlomix[wandb]
- Fragment Ion Intensity Prediction:
- a multi-output regression problem where the intensity values for fragment ions are predicted given a peptide sequence along with some additional features.

- Peptide Detectability:
- Peptide Detectability (Pfly) [4]:
- a multi-class classification problem where the detectability of a peptide is predicted given the peptide sequence.


Expand Down Expand Up @@ -97,3 +97,7 @@ Robbin Bouwmeester, Ralf Gabriels, Niels Hulstaert, Lennart Martens, Sven Degroe
bioRxiv 2020.03.28.013003; doi: 10.1101/2020.03.28.013003

[3] Bouwmeester, R., Gabriels, R., Hulstaert, N. et al. DeepLC can predict retention times for peptides that carry as-yet unseen modifications. Nat Methods 18, 1363–1369 (2021). https://doi.org/10.1038/s41592-021-01301-5

[**Detectability - Pfly**]

[4] Abdul-Khalek, N., Picciani, M., Wimmer, R., Overgaard, M. T., Wilhelm, M., & Gregersen Echers, S. (2024). To fly, or not to fly, that is the question: A deep learning model for peptide detectability prediction in mass spectrometry. bioRxiv, 2024-10.
22 changes: 14 additions & 8 deletions src/dlomix/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def _load_dataset(self):
def _load_from_hub(self):
self.hf_dataset = load_dataset(self.data_source)
self._empty_dataset_mode = False
self._is_predefined_split = True
warnings.warn(
'The provided data is assumed to be hosted on the Hugging Face Hub since data_format is set to "hub". Validation and test data sources will be ignored.'
)
Expand All @@ -282,6 +283,7 @@ def _load_from_hub(self):

def _load_from_inmemory_hf_dataset(self):
self._empty_dataset_mode = False
self._is_predefined_split = True
warnings.warn(
f'The provided data is assumed to be an in-memory Hugging Face Dataset or DatasetDict object since data_format is set to "hf". Validation and test data sources will be ignored and the split names of the DatasetDict has to follow the default namings {PeptideDataset.DEFAULT_SPLIT_NAMES}.'
)
Expand All @@ -308,7 +310,7 @@ def _load_from_inmemory_hf_dataset(self):
def _decide_on_splitting(self):
count_loaded_data_sources = len(self._data_files_available_splits)

# one non-train data source provided -> if test, then test only, if val, then do not split
# one data source provided -> if test, then test only, if val, then do not split
if count_loaded_data_sources == 1:
if (
self.test_data_source is not None
Expand Down Expand Up @@ -551,19 +553,23 @@ def _apply_processor_to_split(
)

def _cast_model_feature_types_to_float(self):
def cast_to_float(feature):
    """Recursively rebuild a feature type so that its leaf values are float32.

    Args:
        feature: a feature type taken from a split's ``features`` mapping.

    Returns:
        A ``Sequence`` wrapping the converted inner feature when ``feature``
        is a ``Sequence`` (nesting depth is preserved), ``Value("float32")``
        when it is a ``Value``, and ``feature`` itself for any other type.
    """
    if isinstance(feature, Sequence):
        # Recurse into the element type. Carry ``length`` over so that a
        # fixed-length Sequence is not silently widened to variable length.
        return Sequence(cast_to_float(feature.feature), length=feature.length)
    if isinstance(feature, Value):
        return Value("float32")
    # Unsupported feature types are returned untouched.
    return feature

for split in self.hf_dataset.keys():
new_features = self.hf_dataset[split].features.copy()

for feature_name, feature_type in self.hf_dataset[split].features.items():
# ensure model features are casted to float for concatenation later
# Ensure model features are cast to float for concatenation later
if feature_name not in self.model_features:
continue
if feature_type.dtype.startswith("float"):
continue
if isinstance(feature_type, Sequence):
new_features[feature_name] = Sequence(Value("float32"))
if isinstance(feature_type, Value):
new_features[feature_name] = Value("float32")
new_features[feature_name] = cast_to_float(feature_type)

self.hf_dataset[split] = self.hf_dataset[split].cast(
new_features,
Expand Down
53 changes: 52 additions & 1 deletion tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from os.path import exists, join

import pytest
from datasets import load_dataset
from datasets import Dataset, DatasetDict, load_dataset

from dlomix.data import FragmentIonIntensityDataset, RetentionTimeDataset

Expand All @@ -17,6 +17,12 @@
INTENSITY_CSV_EXAMPLE_URL = "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/intensity/intensity_data.csv"
RT_HUB_DATASET_NAME = "Wilhelmlab/prospect-ptms-irt"

# Minimal two-row fixture: a peptide sequence column ("seq"), a per-row
# list-of-lists integer column ("nested_feature") and a float "label" column.
# The tests in this file turn it into a dataset with Dataset.from_dict.
RAW_GENERIC_NESTED_DATA = {
"seq": ["[UNIMOD:737]-DASAQTTSHELTIPN-[]", "[UNIMOD:737]-DLHTGRLC[UNIMOD:4]-[]"],
"nested_feature": [[[30, 64]], [[25, 35]]],
"label": [0.1, 0.2],
}

TEST_ASSETS_TO_DOWNLOAD = [
RT_PARQUET_EXAMPLE_URL,
RT_CSV_EXAMPLE_URL,
Expand Down Expand Up @@ -218,6 +224,51 @@ def test_csv_intensitydataset():
> 0
)


def test_nested_model_features():
    """Check that a nested (list-of-lists) model feature reaches the tensor data.

    Builds a FragmentIonIntensityDataset from an in-memory Hugging Face
    Dataset whose "nested_feature" column is nested, then inspects the first
    element produced by ``tensor_train_data``.
    """
    hfdata = Dataset.from_dict(RAW_GENERIC_NESTED_DATA)

    intensity_dataset = FragmentIonIntensityDataset(
        data_format="hf",
        data_source=hfdata,
        sequence_column="seq",
        label_column="label",
        model_features=["nested_feature"],
    )

    assert intensity_dataset.hf_dataset is not None
    assert intensity_dataset._empty_dataset_mode is False

    # Use the built-in next(): a ``.next()`` method is not part of the
    # Python 3 iterator protocol and only exists on some iterator classes.
    example = next(iter(intensity_dataset.tensor_train_data))

    # Normalise the shape to a plain list so the comparison does not depend
    # on how the tensor library's shape type implements equality.
    assert list(example[0]["nested_feature"].shape) == [2, 1, 2]


def test_no_split_datasetDict_hf_inmemory():
    """Check loading an in-memory DatasetDict that only has a "train" split.

    The first default split name must be present in the resulting dataset
    and that split must hold both fixture rows.
    """
    train_only = DatasetDict({"train": Dataset.from_dict(RAW_GENERIC_NESTED_DATA)})

    dataset = FragmentIonIntensityDataset(
        data_format="hf",
        data_source=train_only,
        sequence_column="seq",
        label_column="label",
    )

    assert dataset.hf_dataset is not None
    assert dataset._empty_dataset_mode is False

    # The first entry of DEFAULT_SPLIT_NAMES is the split checked here.
    first_split = FragmentIonIntensityDataset.DEFAULT_SPLIT_NAMES[0]
    available_splits = list(dataset.hf_dataset.keys())
    assert first_split in available_splits

    row_count = len(dataset.hf_dataset[first_split])
    assert row_count == 2

# test saving and loading datasets with config

# test learning alphabet for train/val and then using it for test with fallback

0 comments on commit afae26b

Please sign in to comment.