sacdallago · SebieF · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/biotrainer/embedders/one_hot_encoding_embedder.py b/biotrainer/embedders/one_hot_encoding_embedder.py
@@ -1,30 +1,31 @@
-# Changed version from the original one from Konstantin Schütze (konstin, https://github.com/konstin) from
-# bio_embeddings repository (https://github.com/sacdallago/bio_embeddings)
-# Original file: https://github.com/sacdallago/bio_embeddings/blob/efb9801f0de9b9d51d19b741088763a7d2d0c3a2/bio_embeddings/embed/one_hot_encoding_embedder.py
+import numpy as np
 
-import numpy
 from numpy import ndarray
-
 from .embedder_interfaces import EmbedderInterface
 
-AMINO_ACIDS = numpy.asarray(list("ACDEFGHIKLMNPQRSTVWXY"))
+# Map each amino-acid letter (the 20 standard residues plus X) to its column index in the one-hot encoding
+AA_TO_INDEX = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWXY")}
 
 
 class OneHotEncodingEmbedder(EmbedderInterface):
-    """Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
-
-    This embedder is meant to be used as naive baseline for comparing different types of inputs or training method.
+    """
+    Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
 
-    While option such as device aren't used, you may still pass them for consistency.
+    This embedder is meant to be used as naive baseline for comparing different types of inputs or training methods.
     """
 
-    number_of_layers = 1
-    embedding_dimension = len(AMINO_ACIDS)
+    embedding_dimension = len(AA_TO_INDEX)  # one column per letter of the alphabet
     name = "one_hot_encoding"
 
+    def __init__(self):
+        self.eye_matrix = np.eye(self.embedding_dimension, dtype=np.float32)  # row i is the one-hot vector for index i
+
     def _embed_single(self, sequence: str) -> ndarray:
-        one_hot = [AMINO_ACIDS == i for i in sequence]
-        return numpy.stack(one_hot).astype(numpy.float32)
+        # Unknown residues get -1, which would wrap around to the last row ('Y'); zero those rows instead
+        indices = np.fromiter((AA_TO_INDEX.get(aa, -1) for aa in sequence), dtype=np.int8)
+        one_hot = self.eye_matrix[indices]  # advanced indexing returns a fresh copy, safe to modify
+        one_hot[indices < 0] = 0.0
+        return one_hot
 
     @staticmethod
     def reduce_per_protein(embedding: ndarray) -> ndarray:

diff --git a/biotrainer/solvers/solver.py b/biotrainer/solvers/solver.py
@@ -331,7 +331,7 @@ def save_as_onnx(self, embedding_dimension: int, output_dir: Optional[str] = Non
         self.network.eval()
 
         # Export
-        export_options = torch.onnx.ExportOptions(dynamic_shapes=True)
+        export_options = torch.onnx.ExportOptions(dynamic_shapes=True, op_level_debug=False)
         onnx_program = torch.onnx.dynamo_export(self.network, dummy_input,
                                                 export_options=export_options)
         onnx_file_name = f"{output_dir}/{self.checkpoint_name.split('.')[0]}.onnx"

diff --git a/biotrainer/utilities/executer.py b/biotrainer/utilities/executer.py
@@ -1,3 +1,4 @@
+import torch
 import logging
 
 from ruamel import yaml
@@ -29,6 +30,10 @@ def _setup_logging(output_dir: str):
                             logging.StreamHandler()]
                         )
     logging.captureWarnings(True)
+    # Only log errors for onnx and dynamo
+    torch._logging.set_logs(dynamo=logging.ERROR, onnx=logging.ERROR, onnx_diagnostics=False)
+    for logger_name in ["torch.onnx", "torch._dynamo", "onnxscript"]:
+        logging.getLogger(logger_name).setLevel(logging.ERROR)
 
 
 def _write_output_file(out_filename: str, config: dict) -> None: