From 3586e423522990e6213162372bb6bc4271c7470c Mon Sep 17 00:00:00 2001 From: Sebastian Date: Mon, 18 Nov 2024 16:52:17 +0100 Subject: [PATCH 1/2] Setting log level of onnx and dynamo to ERROR Reducing amount of logs that is not relevant to the user --- biotrainer/solvers/solver.py | 2 +- biotrainer/utilities/executer.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/biotrainer/solvers/solver.py b/biotrainer/solvers/solver.py index 1bb30173..24e674fd 100644 --- a/biotrainer/solvers/solver.py +++ b/biotrainer/solvers/solver.py @@ -331,7 +331,7 @@ def save_as_onnx(self, embedding_dimension: int, output_dir: Optional[str] = Non self.network.eval() # Export - export_options = torch.onnx.ExportOptions(dynamic_shapes=True) + export_options = torch.onnx.ExportOptions(dynamic_shapes=True, op_level_debug=False) onnx_program = torch.onnx.dynamo_export(self.network, dummy_input, export_options=export_options) onnx_file_name = f"{output_dir}/{self.checkpoint_name.split('.')[0]}.onnx" diff --git a/biotrainer/utilities/executer.py b/biotrainer/utilities/executer.py index bbabb8a4..ac221ba9 100644 --- a/biotrainer/utilities/executer.py +++ b/biotrainer/utilities/executer.py @@ -1,3 +1,4 @@ +import torch import logging from ruamel import yaml @@ -29,6 +30,10 @@ def _setup_logging(output_dir: str): logging.StreamHandler()] ) logging.captureWarnings(True) + # Only log errors for onnx and dynamo + torch._logging.set_logs(dynamo=logging.ERROR, onnx=logging.ERROR, onnx_diagnostics=False) + for logger_name in ["torch.onnx", "torch._dynamo", "onnxscript"]: + logging.getLogger(logger_name).setLevel(logging.ERROR) def _write_output_file(out_filename: str, config: dict) -> None: From efc18f7fd8dae85de8906dfcfaf27fccaaaf70f2 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Mon, 18 Nov 2024 17:09:39 +0100 Subject: [PATCH 2/2] Speeding up one hot encoding embedder Using numpy functions to improve performance up to 5x --- .../embedders/one_hot_encoding_embedder.py | 29 
++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/biotrainer/embedders/one_hot_encoding_embedder.py b/biotrainer/embedders/one_hot_encoding_embedder.py
index 473a9366..4a248dff 100644
--- a/biotrainer/embedders/one_hot_encoding_embedder.py
+++ b/biotrainer/embedders/one_hot_encoding_embedder.py
@@ -1,30 +1,31 @@
-# Changed version from the original one from Konstantin Schütze (konstin, https://github.com/konstin) from
-# bio_embeddings repository (https://github.com/sacdallago/bio_embeddings)
-# Original file: https://github.com/sacdallago/bio_embeddings/blob/efb9801f0de9b9d51d19b741088763a7d2d0c3a2/bio_embeddings/embed/one_hot_encoding_embedder.py
+import numpy as np
 
-import numpy
 from numpy import ndarray
-
 from .embedder_interfaces import EmbedderInterface
 
-AMINO_ACIDS = numpy.asarray(list("ACDEFGHIKLMNPQRSTVWXY"))
+# Create a mapping of amino acids to their index
+AA_TO_INDEX = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWXY")}
 
 
 class OneHotEncodingEmbedder(EmbedderInterface):
-    """Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
-
-    This embedder is meant to be used as naive baseline for comparing different types of inputs or training method.
+    """
+    Baseline embedder: One hot encoding as per-residue embedding, amino acid composition for per-protein
 
-    While option such as device aren't used, you may still pass them for consistency.
+    This embedder is meant to be used as naive baseline for comparing different types of inputs or training methods.
     """
 
-    number_of_layers = 1
-    embedding_dimension = len(AMINO_ACIDS)
+    embedding_dimension = len(AA_TO_INDEX)
     name = "one_hot_encoding"
 
+    def __init__(self):
+        self.eye_matrix = np.eye(self.embedding_dimension + 1, self.embedding_dimension, dtype=np.float32)
+
     def _embed_single(self, sequence: str) -> ndarray:
-        one_hot = [AMINO_ACIDS == i for i in sequence]
-        return numpy.stack(one_hot).astype(numpy.float32)
+        # Map residues to row indices; unknown residues get -1, i.e. the extra all-zero last row of eye_matrix
+        indices = np.fromiter((AA_TO_INDEX.get(aa, -1) for aa in sequence), dtype=np.int8)
+
+        # Use advanced indexing of the zero-padded identity matrix to create the one-hot encoding
+        return self.eye_matrix[indices]
 
     @staticmethod
     def reduce_per_protein(embedding: ndarray) -> ndarray: