Skip to content

Commit

Permalink
Speeding up one hot encoding embedder
Browse files Browse the repository at this point in the history
Using numpy functions to improve performance up to 5x
  • Loading branch information
SebieF committed Nov 18, 2024
1 parent 98ee386 commit b02872b
Showing 1 changed file with 15 additions and 14 deletions.
29 changes: 15 additions & 14 deletions biotrainer/embedders/one_hot_encoding_embedder.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
# Modified version of the original by Konstantin Schütze (konstin, https://github.com/konstin) from the
# bio_embeddings repository (https://github.com/sacdallago/bio_embeddings)
# Original file: https://github.com/sacdallago/bio_embeddings/blob/efb9801f0de9b9d51d19b741088763a7d2d0c3a2/bio_embeddings/embed/one_hot_encoding_embedder.py
import numpy as np

import numpy
from numpy import ndarray

from .embedder_interfaces import EmbedderInterface

# Residue alphabet as a NumPy array of single characters
AMINO_ACIDS = numpy.array(list("ACDEFGHIKLMNPQRSTVWXY"))
# Lookup table from each amino acid letter to its position in the alphabet
AA_TO_INDEX = {residue: column for column, residue in enumerate("ACDEFGHIKLMNPQRSTVWXY")}


class OneHotEncodingEmbedder(EmbedderInterface):
"""Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
This embedder is meant to be used as naive baseline for comparing different types of inputs or training method.
"""
Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
While options such as device aren't used, you may still pass them for consistency.
This embedder is meant to be used as naive baseline for comparing different types of inputs or training methods.
"""

number_of_layers = 1
embedding_dimension = len(AMINO_ACIDS)
embedding_dimension = len(AA_TO_INDEX.keys())
name = "one_hot_encoding"

def __init__(self):
    """Pre-compute the float32 identity matrix whose rows serve as one-hot vectors."""
    size = self.embedding_dimension
    self.eye_matrix = np.identity(size, dtype=np.float32)

def _embed_single(self, sequence: str) -> ndarray:
    """Return the one-hot encoding of a single sequence.

    Each character of ``sequence`` is looked up in ``AA_TO_INDEX`` and the
    matching row of ``self.eye_matrix`` is selected. Characters that are not
    in ``AA_TO_INDEX`` produce an all-zero row.

    :param sequence: Amino acid sequence, one letter per residue.
    :return: Array with one row per residue, taken from ``self.eye_matrix``.
    """
    # Convert sequence to indices; -1 marks characters missing from the mapping
    indices = np.fromiter((AA_TO_INDEX.get(aa, -1) for aa in sequence), dtype=np.int8)

    # Integer-array indexing returns a copy, so the cached identity matrix is
    # never modified by the masking step below
    one_hot = self.eye_matrix[indices]

    # An index of -1 wraps around to the last row of the identity matrix, which
    # would silently encode every unknown character as the last amino acid.
    # Blank those rows instead.
    one_hot[indices < 0] = 0.0
    return one_hot

@staticmethod
def reduce_per_protein(embedding: ndarray) -> ndarray:
Expand Down

0 comments on commit b02872b

Please sign in to comment.