Skip to content

Commit

Permalink
Merge branch 'refs/heads/develop' into feature/embedding-dimension-re…
Browse files Browse the repository at this point in the history
…duction

# Conflicts:
#	pyproject.toml
  • Loading branch information
SebieF committed Dec 9, 2024
2 parents 66871b2 + c3da0c4 commit 3aa9815
Show file tree
Hide file tree
Showing 11 changed files with 1,137 additions and 69 deletions.
381 changes: 318 additions & 63 deletions biotrainer/config/configurator.py

Large diffs are not rendered by default.

142 changes: 142 additions & 0 deletions biotrainer/config/hf_dataset_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from typing import List, Any, Union, Type
from abc import ABC

from .config_option import ConfigOption, ConfigurationException, classproperty
from ..protocols import Protocol


class HFDatasetOption(ConfigOption, ABC):
    """
    Shared parent of all options that live in the HuggingFace dataset
    section of the configuration.

    It builds on `ConfigOption` and only fixes the category; the concrete
    options derived from it supply their own name and validation.
    """

    @classproperty
    def category(self) -> str:
        """Category string common to every HuggingFace dataset option."""
        return "hf_dataset"


class HFPath(HFDatasetOption):
    """
    Required, single-valued option holding the HuggingFace dataset path.
    """

    @classproperty
    def name(self) -> str:
        """Key under which this option is configured."""
        return "path"

    @classproperty
    def allow_multiple_values(self) -> bool:
        """Only one dataset path may be given."""
        return False

    @classproperty
    def required(self) -> bool:
        """The dataset path is mandatory."""
        return True

    def check_value(self) -> bool:
        """
        Validate the configured path.

        :return: True if the value is a string that contains a "/".
        :raises ConfigurationException: If the value is not a string or
            contains no "/".
        """
        candidate = self.value
        # Accept early; everything that falls through is reported as invalid.
        if isinstance(candidate, str) and "/" in candidate:
            return True

        raise ConfigurationException(
            f"Invalid HuggingFace dataset path: {self.value}. It should be in the format 'username/dataset_name'."
        )


class HFSubsetName(HFDatasetOption):
    """
    Optional, single-valued option naming the dataset subset to use.

    No `check_value` override is defined here, so validation is whatever
    the parent classes provide.
    """

    @classproperty
    def name(self) -> str:
        """Key under which this option is configured."""
        return "subset"

    @classproperty
    def allow_multiple_values(self) -> bool:
        """Only one subset name may be given."""
        return False

    @classproperty
    def required(self) -> bool:
        """The subset name may be omitted."""
        return False


class HFSequenceColumn(HFDatasetOption):
    """
    Required, single-valued option naming the dataset column that holds
    the sequences.
    """

    @classproperty
    def name(self) -> str:
        """Key under which this option is configured."""
        return "sequence_column"

    @classproperty
    def allow_multiple_values(self) -> bool:
        """Only one sequence column may be given."""
        return False

    @classproperty
    def required(self) -> bool:
        """The sequence column is mandatory."""
        return True

    def check_value(self) -> bool:
        """
        Validate the configured column name.

        :return: True if the value is a string with non-whitespace content.
        :raises ConfigurationException: If the value is not a string, or is
            empty / whitespace only.
        """
        column = self.value
        if isinstance(column, str) and column.strip():
            return True

        raise ConfigurationException("sequence_column must be a non-empty string.")


class HFTargetColumn(HFDatasetOption):
    """
    Required, single-valued option naming the dataset column that holds
    the targets.
    """

    @classproperty
    def name(self) -> str:
        """Key under which this option is configured."""
        return "target_column"

    @classproperty
    def allow_multiple_values(self) -> bool:
        """Only one target column may be given."""
        return False

    @classproperty
    def required(self) -> bool:
        """The target column is mandatory."""
        return True

    def check_value(self) -> bool:
        """
        Validate the configured column name.

        :return: True if the value is a string with non-whitespace content.
        :raises ConfigurationException: If the value is not a string, or is
            empty / whitespace only.
        """
        column = self.value
        if isinstance(column, str) and column.strip():
            return True

        raise ConfigurationException("target_column must be a non-empty string.")

class HFMaskColumn(HFDatasetOption):
    """
    Optional, single-valued option naming the dataset column that holds
    the masks.
    """

    @classproperty
    def name(self) -> str:
        """Key under which this option is configured."""
        return "mask_column"

    @classproperty
    def allow_multiple_values(self) -> bool:
        """Only one mask column may be given."""
        return False

    @classproperty
    def required(self) -> bool:
        """The mask column may be omitted."""
        return False

    def check_value(self) -> bool:
        """
        Validate the configured column name.

        :return: True if the value is a string with non-whitespace content.
        :raises ConfigurationException: If the value is not a string, or is
            empty / whitespace only.
        """
        # NOTE(review): the option is optional, yet any non-string value is
        # rejected here — presumably this is only invoked when a value was
        # actually configured; confirm against the caller.
        column = self.value
        if isinstance(column, str) and column.strip():
            return True

        raise ConfigurationException("mask_column must be a non-empty string.")

# Name of the configuration section that holds the HuggingFace dataset options
HF_DATASET_CONFIG_KEY: str = "hf_dataset"

# All HuggingFace dataset option classes, collected in one list.
# NOTE(review): the abstract base HFDatasetOption is listed next to the
# concrete options — confirm that consumers of this list expect it.
hf_dataset_options: List[Type[HFDatasetOption]] = [
    HFDatasetOption,
    HFPath,
    HFSubsetName,
    HFSequenceColumn,
    HFTargetColumn,
    HFMaskColumn,
]
17 changes: 14 additions & 3 deletions biotrainer/utilities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,27 @@
from .version import __version__
from .cuda_device import get_device, is_device_cpu
from .data_classes import Split, SplitResult, DatasetSample
from .constants import SEQUENCE_PAD_VALUE, MASK_AND_LABELS_PAD_VALUE, INTERACTION_INDICATOR, \
from .constants import (
SEQUENCE_PAD_VALUE,
MASK_AND_LABELS_PAD_VALUE,
INTERACTION_INDICATOR,
METRICS_WITHOUT_REVERSED_SORTING
from .fasta import read_FASTA, get_attributes_from_seqrecords, \
get_attributes_from_seqrecords_for_protein_interactions, get_split_lists
)
from .fasta import (
read_FASTA,
get_attributes_from_seqrecords,
get_attributes_from_seqrecords_for_protein_interactions,
get_split_lists
)

from .hf_dataset_to_fasta import process_hf_dataset_to_fasta

__all__ = [
'seed_all',
'get_device',
'is_device_cpu',
'read_FASTA',
'process_hf_dataset_to_fasta',
'get_attributes_from_seqrecords',
'get_attributes_from_seqrecords_for_protein_interactions',
'get_split_lists',
Expand Down
2 changes: 1 addition & 1 deletion biotrainer/utilities/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging

from Bio import SeqIO
from typing import Dict, List, Tuple
from typing import Dict, List, Tuple, Union, Any, Optional
from Bio.SeqRecord import SeqRecord

from ..utilities import INTERACTION_INDICATOR
Expand Down
Loading

0 comments on commit 3aa9815

Please sign in to comment.