Merge branch 'refs/heads/develop' into feature/embedding-dimension-reduction

# Conflicts: # pyproject.toml
sacdallago · Dec 9, 2024 · 3aa9815 · 3aa9815
2 parents 66871b2 + c3da0c4
commit 3aa9815
Show file tree

Hide file tree

Showing 11 changed files with 1,137 additions and 69 deletions.
diff --git a/biotrainer/config/configurator.py b/biotrainer/config/configurator.py
diff --git a/biotrainer/config/hf_dataset_options.py b/biotrainer/config/hf_dataset_options.py
@@ -0,0 +1,142 @@
+from typing import List, Any, Union, Type
+from abc import ABC
+
+from .config_option import ConfigOption, ConfigurationException, classproperty
+from ..protocols import Protocol
+
+
+class HFDatasetOption(ConfigOption, ABC):
+    """
+    Abstract base class for HuggingFace dataset configuration options.
+
+    Extends `ConfigOption` and sets the shared `category` to "hf_dataset";
+    the subclasses in this module define the individual dataset options.
+    """
+
+    @classproperty
+    def category(self) -> str:
+        return "hf_dataset"
+
+
+class HFPath(HFDatasetOption):
+    """
+    Required, single-valued HuggingFace dataset path; validation only checks for a string containing "/".
+    """
+
+    @classproperty
+    def name(self) -> str:
+        return "path"
+
+    @classproperty
+    def allow_multiple_values(self) -> bool:
+        return False
+
+    @classproperty
+    def required(self) -> bool:
+        return True
+
+    def check_value(self) -> bool:
+        if not isinstance(self.value, str) or "/" not in self.value:
+            raise ConfigurationException(
+                f"Invalid HuggingFace dataset path: {self.value}. It should be in the format 'username/dataset_name'."
+            )
+        return True
+
+
+class HFSubsetName(HFDatasetOption):
+    """
+    Optional, single-valued option naming the dataset subset; no custom `check_value` is defined here.
+    """
+
+    @classproperty
+    def name(self) -> str:
+        return "subset"
+
+    @classproperty
+    def allow_multiple_values(self) -> bool:
+        return False
+
+    @classproperty
+    def required(self) -> bool:
+        return False
+
+
+class HFSequenceColumn(HFDatasetOption):
+    """
+    Required, single-valued option naming the dataset's sequence column; must be a non-blank string.
+    """
+
+    @classproperty
+    def name(self) -> str:
+        return "sequence_column"
+
+    @classproperty
+    def allow_multiple_values(self) -> bool:
+        return False
+
+    @classproperty
+    def required(self) -> bool:
+        return True
+
+    def check_value(self) -> bool:
+        if not isinstance(self.value, str) or not self.value.strip():
+            raise ConfigurationException("sequence_column must be a non-empty string.")
+        return True
+
+
+class HFTargetColumn(HFDatasetOption):
+    """
+    Required, single-valued option naming the dataset's target column; must be a non-blank string.
+    """
+
+    @classproperty
+    def name(self) -> str:
+        return "target_column"
+
+    @classproperty
+    def allow_multiple_values(self) -> bool:
+        return False
+
+    @classproperty
+    def required(self) -> bool:
+        return True
+
+    def check_value(self) -> bool:
+        if not isinstance(self.value, str) or not self.value.strip():
+            raise ConfigurationException("target_column must be a non-empty string.")
+        return True
+
+class HFMaskColumn(HFDatasetOption):
+    """
+    Optional, single-valued option naming the dataset's mask column; `check_value` requires a non-blank string.
+    """
+
+    @classproperty
+    def name(self) -> str:
+        return "mask_column"
+
+    @classproperty
+    def allow_multiple_values(self) -> bool:
+        return False
+
+    @classproperty
+    def required(self) -> bool:
+        return False
+
+    def check_value(self) -> bool:  # NOTE(review): raises for None/blank although optional; confirm it is skipped when unset
+        if not isinstance(self.value, str) or not self.value.strip():
+            raise ConfigurationException("mask_column must be a non-empty string.")
+        return True
+
+# Constant key for hf_dataset configuration
+HF_DATASET_CONFIG_KEY: str = "hf_dataset"
+
+# All hf_dataset option classes, collected in a list (the HFDatasetOption base class is included)
+hf_dataset_options: List[Type[HFDatasetOption]] = [
+    HFDatasetOption,
+    HFPath,
+    HFSubsetName,
+    HFSequenceColumn,
+    HFTargetColumn,
+    HFMaskColumn
+]
diff --git a/biotrainer/utilities/__init__.py b/biotrainer/utilities/__init__.py
@@ -2,16 +2,27 @@
 from .version import __version__
 from .cuda_device import get_device, is_device_cpu
 from .data_classes import Split, SplitResult, DatasetSample
-from .constants import SEQUENCE_PAD_VALUE, MASK_AND_LABELS_PAD_VALUE, INTERACTION_INDICATOR, \
+from .constants import (
+    SEQUENCE_PAD_VALUE,
+    MASK_AND_LABELS_PAD_VALUE,
+    INTERACTION_INDICATOR,
     METRICS_WITHOUT_REVERSED_SORTING
-from .fasta import read_FASTA, get_attributes_from_seqrecords, \
-    get_attributes_from_seqrecords_for_protein_interactions, get_split_lists
+)
+from .fasta import (
+    read_FASTA,
+    get_attributes_from_seqrecords,
+    get_attributes_from_seqrecords_for_protein_interactions,
+    get_split_lists
+)
+
+from .hf_dataset_to_fasta import process_hf_dataset_to_fasta
 
 __all__ = [
     'seed_all',
     'get_device',
     'is_device_cpu',
     'read_FASTA',
+    'process_hf_dataset_to_fasta',
     'get_attributes_from_seqrecords',
     'get_attributes_from_seqrecords_for_protein_interactions',
     'get_split_lists',

diff --git a/biotrainer/utilities/fasta.py b/biotrainer/utilities/fasta.py
@@ -2,7 +2,7 @@
 import logging
 
 from Bio import SeqIO
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union, Any, Optional
 from Bio.SeqRecord import SeqRecord
 
 from ..utilities import INTERACTION_INDICATOR