Speeding up one hot encoding embedder

Using numpy functions to improve performance up to 5x
sacdallago · Nov 18, 2024 · b02872b · b02872b
1 parent 98ee386
commit b02872b
Showing 1 changed file with 15 additions and 14 deletions.
diff --git a/biotrainer/embedders/one_hot_encoding_embedder.py b/biotrainer/embedders/one_hot_encoding_embedder.py
@@ -1,30 +1,31 @@
-# Changed version from the original one from Konstantin Schütze (konstin, https://github.com/konstin) from
-# bio_embeddings repository (https://github.com/sacdallago/bio_embeddings)
-# Original file: https://github.com/sacdallago/bio_embeddings/blob/efb9801f0de9b9d51d19b741088763a7d2d0c3a2/bio_embeddings/embed/one_hot_encoding_embedder.py
+import numpy as np
 
-import numpy
 from numpy import ndarray
-
 from .embedder_interfaces import EmbedderInterface
 
-AMINO_ACIDS = numpy.asarray(list("ACDEFGHIKLMNPQRSTVWXY"))
+# Create a mapping of amino acids to their index
+AA_TO_INDEX = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWXY")}
 
 
 class OneHotEncodingEmbedder(EmbedderInterface):
-    """Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
-
-    This embedder is meant to be used as naive baseline for comparing different types of inputs or training method.
+    """
+    Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
 
-    While option such as device aren't used, you may still pass them for consistency.
+    This embedder is meant to be used as naive baseline for comparing different types of inputs or training methods.
     """
 
-    number_of_layers = 1
-    embedding_dimension = len(AMINO_ACIDS)
+    embedding_dimension = len(AA_TO_INDEX.keys())
     name = "one_hot_encoding"
 
+    def __init__(self):
+        self.eye_matrix = np.eye(self.embedding_dimension, dtype=np.float32)
+
     def _embed_single(self, sequence: str) -> ndarray:
-        one_hot = [AMINO_ACIDS == i for i in sequence]
-        return numpy.stack(one_hot).astype(numpy.float32)
+        # Convert sequence to indices
+        indices = np.fromiter((AA_TO_INDEX.get(aa, -1) for aa in sequence), dtype=np.int8)
+
+        # Use advanced indexing of identity matrix to create one-hot encoding
+        return self.eye_matrix[indices]
 
     @staticmethod
     def reduce_per_protein(embedding: ndarray) -> ndarray: