From 8b70e69d824e408ecd3f1d8b47a1d98fdc6a3ec2 Mon Sep 17 00:00:00 2001
From: Omar Shouman <o.shouman@tum.de>
Date: Mon, 20 Jan 2025 20:18:38 +0200
Subject: [PATCH 1/2] Feature/update reports (#51)

* Intensity report updates

* example intensity notebook
---
 ...ple_IntensityModel_Walkthrough_colab.ipynb | 444 ++++++++++++++++++
 src/dlomix/reports/IntensityReport.py         |  58 ++-
 src/dlomix/reports/postprocessing.py          |  27 +-
 3 files changed, 503 insertions(+), 26 deletions(-)
 create mode 100644 notebooks/Example_IntensityModel_Walkthrough_colab.ipynb

diff --git a/notebooks/Example_IntensityModel_Walkthrough_colab.ipynb b/notebooks/Example_IntensityModel_Walkthrough_colab.ipynb
new file mode 100644
index 00000000..c07fed61
--- /dev/null
+++ b/notebooks/Example_IntensityModel_Walkthrough_colab.ipynb
@@ -0,0 +1,444 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "7YWkUVjVr7qJ"
+   },
+   "source": [
+    "# Fragment Ion Intensity Prediction \n",
+    "\n",
+    "This notebook is prepared to be run in Google [Colaboratory](https://colab.research.google.com/). In order to train the model faster, please change the runtime of Colab to use Hardware Accelerator, either GPU or TPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "S3DlTOq3r7qM"
+   },
+   "source": [
+    "This notebook presents a short walkthrough of the process of reading a dataset and training a model for intensity prediction. The dataset is an example dataset extracted from a ProteomeTools dataset generated in the **Chair of Bioanalytics** at the **Technical University of Munich**.\n",
+    "\n",
+    "DLOmix is the framework being used and is a custom wrapper on top of Keras/TensorFlow."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aO-69zbKsGey",
+    "outputId": "2d7f5f84-aa06-4ec7-9a2b-c1f546031baf"
+   },
+   "outputs": [],
+   "source": [
+    "# install the DLOmix package in the current environment using pip\n",
+    "\n",
+    "!python -m pip install -q dlomix"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Mo7H9qzWr7qN"
+   },
+   "source": [
+    "The available modules in the framework are as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "l0CS0tFur7qN",
+    "outputId": "16c0f94a-97ab-4445-ef6b-4ac28029e64d",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import dlomix\n",
+    "from dlomix import constants, data, eval, layers, models, pipelines, reports\n",
+    "print([x for x in dir(dlomix) if not x.startswith(\"_\")])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JsgPZb_Mr7qP"
+   },
+   "source": [
+    "\n",
+    "\n",
+    "**Note**: reports and pipelines are work-in-progress, some functionalities are not complete.\n",
+    "\n",
+    "- `constants`: constants to be used in the framework (e.g. Aminoacid alphabet mapping)\n",
+    "- `data`:  classes for representing dataset, wrappers around HuggingFace datasets to process input data and generate tensor datasets\n",
+    "- `eval`: custom evaluation metrics implemented in Keras/TF to work as `metrics` for model training\n",
+    "- `layers`: custom layer implementation required for the different models\n",
+    "- `models`: different model implementations for Retention Time and Fragment Ion Intensity Prediction\n",
+    "- `pipelines`: complete pipelines to run a task (e.g. Retention Time prediction)\n",
+    "\n",
+    "**Note**: reports and pipelines are work-in-progress, some functionalities are not complete."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "41qXroyKr7qP"
+   },
+   "source": [
+    "## 1. Load Data\n",
+    "\n",
+    "We can import the dataset class and create an object of type `FragmentIonIntensityDataset`. This object wraps around a Hugging Face dataset that can generate TensorFlow Dataset objects or Torch Dataset for training, validation, or testing. This can be controlled by the arguments `val_ratio`, `val_data_source`, and `test_data_source`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "RiXz_epEr7qQ",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from dlomix.data import FragmentIonIntensityDataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lzNXJ-s6r7qQ",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "TRAIN_DATAPATH = \"https://github.com/wilhelm-lab/dlomix/raw/refs/heads/main/example_dataset/intensity/intensity_data.parquet\"\n",
+    "\n",
+    "int_data = FragmentIonIntensityDataset(\n",
+    "    data_format=\"parquet\",\n",
+    "    data_source=TRAIN_DATAPATH,\n",
+    "    sequence_column=\"sequence\",\n",
+    "    label_column=\"intensities\",\n",
+    "    model_features=[\"precursor_charge_onehot\", \"collision_energy_aligned_normed\"],\n",
+    "    max_seq_len=30,\n",
+    "    batch_size=128,\n",
+    "    val_ratio=0.2,\n",
+    "    with_termini=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_UUzvcHGr7qR"
+   },
+   "source": [
+    "Now we have an Intensity dataset that can be used directly with standard or custom `Keras` models. This wrapper contains the splits we chose when creating it. In our case, they are training and validation splits. To get the TF Dataset, we call the attributes `.tensor_train_data` and `.tensor_val_data`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"Hugging Face Dataset\", int_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "y1l6YedCr7qS",
+    "outputId": "16a1d57b-7720-4cb1-cdea-a32d9d2c3804",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    " \"Training examples\", len(int_data[\"train\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "uEnVWjtRr7qT",
+    "outputId": "95550088-76c5-4f1b-8181-dddb942d94b0",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    " \"Validation examples\", len(int_data[\"val\"])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oWeVi0iar7qT"
+   },
+   "source": [
+    "## 2. Model\n",
+    "\n",
+    "We can now create the model. We will use a Prosit model [1]. It has the default working arguments, but most of the parameters can be customized.\n",
+    "\n",
+    "**Note**: It is important to ensure that the padding length used for the dataset object is equal to the sequence length passed to the model.\n",
+    "\n",
+    "*[1] Gessulat, S., Schmidt, T., Zolg, D. P., Samaras, P., Schnatbaum, K., Zerweck, J., ... & Wilhelm, M. (2019). Prosit: proteome-wide prediction of peptide tandem mass spectra by deep learning. Nature methods, 16(6), 509-518.*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Q8SGTvfRr7qT",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from dlomix.models import PrositIntensityPredictor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZqrsF6APr7qU",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "model = PrositIntensityPredictor(\n",
+    "    seq_length=30,\n",
+    "    input_keys={\n",
+    "        \"SEQUENCE_KEY\": \"sequence\",\n",
+    "    },\n",
+    "    meta_data_keys={\n",
+    "        \"COLLISION_ENERGY_KEY\": \"collision_energy_aligned_normed\",\n",
+    "        \"PRECURSOR_CHARGE_KEY\": \"precursor_charge_onehot\",\n",
+    "    },\n",
+    "    with_termini=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "adD60VwQr7qU"
+   },
+   "source": [
+    "## 3. Training\n",
+    "\n",
+    "We can then train the model like a standard Keras model. The training parameters here are from Prosit, but other optimizer parameters can be used.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IkPIHuWEr7qU",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "#imports\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xLy32wk7r7qU",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# create the optimizer object\n",
+    "optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)\n",
+    "\n",
+    "# compile the model with the optimizer and the metrics we want to use; here we add the custom masked Pearson correlation distance metric\n",
+    "model.compile(optimizer=optimizer,\n",
+    "              loss=masked_spectral_distance,\n",
+    "              metrics=['mse', masked_pearson_correlation_distance])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history = model.fit(\n",
+    "    int_data.tensor_train_data,\n",
+    "    validation_data=int_data.tensor_val_data,\n",
+    "    epochs=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wtEUn_vdr7qV"
+   },
+   "source": [
+    "We store the result of training so that we can explore the metrics and the losses later. We specify the number of epochs for training and pass the training and validation data as previously described."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oukZ4AyMr7qV"
+   },
+   "source": [
+    "## 4. Testing and Reporting\n",
+    "\n",
+    "We can create a test dataset to test our model. Additionally, we can use the reporting module to produce plots and evaluate the model.\n",
+    "\n",
+    "Note: the reporting module is still in progress and some functionalities might easily break."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ngz4zlnwr7qV",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# create the dataset object for test data\n",
+    "# Note: this is just using the same data above for demonstration purposes. In practice, you should use a separate test dataset\n",
+    "\n",
+    "TEST_DATAPATH = \"https://github.com/wilhelm-lab/dlomix/raw/refs/heads/main/example_dataset/intensity/intensity_data.parquet\"\n",
+    "\n",
+    "test_int_data = FragmentIonIntensityDataset(\n",
+    "    data_format=\"parquet\",\n",
+    "    test_data_source=TEST_DATAPATH,\n",
+    "    sequence_column=\"sequence\",\n",
+    "    label_column=\"intensities\",\n",
+    "    model_features=[\"precursor_charge_onehot\", \"collision_energy_aligned_normed\"],\n",
+    "    max_seq_len=30,\n",
+    "    batch_size=128,\n",
+    "    with_termini=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_int_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure columns in the test data are the same as in training data\n",
+    "int_data.column_names, test_int_data.column_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "RrvR8Cl3r7qV",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# use model.predict from keras directly on the testdata\n",
+    "\n",
+    "predictions = model.predict(test_int_data.tensor_test_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dlomix.reports import IntensityReport\n",
+    "\n",
+    "# create a report object by passing the history object and plot different metrics\n",
+    "report = IntensityReport(output_path=\"./output\", history=history)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# you can generate a complete report for intensity by calling generate_report\n",
+    "# the function takes the test dataset object and the predictions as arguments\n",
+    "\n",
+    "report.generate_report(test_int_data, predictions, split=\"test\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the results used for plotting are available under the attribute prediction_results\n",
+    "report.prediction_results"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "name": "Copy of Example_RTModel_Walkthrough.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/dlomix/reports/IntensityReport.py b/src/dlomix/reports/IntensityReport.py
index c4d5609d..2e5ca69a 100644
--- a/src/dlomix/reports/IntensityReport.py
+++ b/src/dlomix/reports/IntensityReport.py
@@ -7,6 +7,7 @@
 from matplotlib.colors import LogNorm
 from matplotlib.ticker import LogLocator
 
+from ..data.processing.processors import SequenceParsingProcessor
 from .postprocessing import normalize_intensity_predictions
 from .Report import PDFFile, Report
 
@@ -17,6 +18,7 @@ class IntensityReport(Report):
     TARGETS_LABEL = "x"
     PREDICTIONS_LABEL = "y"
     DEFAULT_BATCH_SIZE = 600
+    PREDICTIONS_COL_NAME = "intensities_pred"
 
     def __init__(self, output_path, history, figures_ext="png", batch_size=0):
         super(IntensityReport, self).__init__(output_path, history, figures_ext)
@@ -28,39 +30,63 @@ def __init__(self, output_path, history, figures_ext="png", batch_size=0):
         else:
             self.batch_size = IntensityReport.DEFAULT_BATCH_SIZE
 
-    def generate_report(self, dataset, predictions):
+    def generate_report(
+        self,
+        dataset,
+        predictions,
+        split="test",
+        precursor_charge_column_name="precursor_charge_onehot",
+    ):
         self._init_report_resources()
 
-        predictions_df = self.generate_intensity_results_df(dataset, predictions)
         self.plot_all_metrics()
 
         # make custom plots
-        self.plot_spectral_angle(predictions_df)
+        self.plot_spectral_angle(
+            dataset, predictions, split, precursor_charge_column_name
+        )
 
         self._compile_report_resources_add_pdf_pages()
         self.pdf_file.output(join(self._output_path, "intensity_Report.pdf"), "F")
 
-    def generate_intensity_results_df(self, dataset, predictions):
-        predictions_df = pd.DataFrame()
-
-        predictions_df["sequences"] = dataset.sequences
-        predictions_df["intensities_pred"] = predictions.tolist()
-        predictions_df["precursor_charge_onehot"] = dataset.precursor_charge.tolist()
-        predictions_df["intensities_raw"] = dataset.intensities.tolist()
-
-        return predictions_df
-
-    def plot_spectral_angle(self, predictions_df):
+    def plot_spectral_angle(
+        self, dataset, predictions, split, precursor_charge_column_name
+    ):
         """Create spectral  plot
 
         Arguments
         ---------
-            predictions_df:  dataframe with raw intensities, predictions, sequences, precursor_charges
+        dataset:  FragmentIonIntensityDataset
+        predictions:  array of predictions
+        split:  str for the split name in the FragmentIonIntensityDataset
+        precursor_charge_column_name:  str
         """
 
+        predictions_df = (
+            dataset[split]
+            .select_columns(
+                [
+                    SequenceParsingProcessor.PARSED_COL_NAMES["seq"],
+                    dataset.label_column,
+                    *dataset.model_features,
+                ]
+            )
+            .to_pandas()
+        )
+
+        predictions_df[IntensityReport.PREDICTIONS_COL_NAME] = predictions.tolist()
+
         predictions_acc = normalize_intensity_predictions(
-            predictions_df, self.batch_size
+            predictions_df,
+            sequence_column_name=SequenceParsingProcessor.PARSED_COL_NAMES["seq"],
+            labels_column_name=dataset.label_column,
+            predictions_column_name=IntensityReport.PREDICTIONS_COL_NAME,
+            precursor_charge_column_name=precursor_charge_column_name,
+            batch_size=self.batch_size,
         )
+
+        self.prediction_results = predictions_acc
+
         violin_plot = sns.violinplot(predictions_acc["spectral_angle"])
 
         save_path = join(
diff --git a/src/dlomix/reports/postprocessing.py b/src/dlomix/reports/postprocessing.py
index ef223b69..b2b094d4 100644
--- a/src/dlomix/reports/postprocessing.py
+++ b/src/dlomix/reports/postprocessing.py
@@ -66,20 +66,27 @@ def iterate():
     return sa
 
 
-def normalize_intensity_predictions(data, batch_size=600):
+def normalize_intensity_predictions(
+    data,
+    sequence_column_name="sequences",
+    labels_column_name="intensities_raw",
+    predictions_column_name="intensities_pred",
+    precursor_charge_column_name="precursor_charge_onehot",
+    batch_size=600,
+):
     assert (
-        "sequences" in data
+        sequence_column_name in data
     ), "Key sequences is missing in the data provided for post-processing"
     assert (
-        "intensities_pred" in data
+        predictions_column_name in data
     ), "Key intensities_pred is missing in the data provided for post-processing"
     assert (
-        "precursor_charge_onehot" in data
+        precursor_charge_column_name in data
     ), "Key precursor_charge_onehot is missing in the data provided for post-processing"
 
-    sequence_lengths = data["sequences"].apply(lambda x: len(x))
-    intensities = np.stack(data["intensities_pred"].to_numpy()).astype(np.float32)
-    precursor_charge_onehot = np.stack(data["precursor_charge_onehot"].to_numpy())
+    sequence_lengths = data[sequence_column_name].apply(lambda x: len(x))
+    intensities = np.stack(data[predictions_column_name].to_numpy()).astype(np.float32)
+    precursor_charge_onehot = np.stack(data[precursor_charge_column_name].to_numpy())
     charges = list(precursor_charge_onehot.argmax(axis=1) + 1)
 
     intensities[intensities < 0] = 0
@@ -90,11 +97,11 @@ def normalize_intensity_predictions(data, batch_size=600):
     m_idx = intensities == -1
     intensities = normalize_base_peak(intensities)
     intensities[m_idx] = -1
-    data["intensities_pred"] = intensities.tolist()
+    data[predictions_column_name] = intensities.tolist()
 
-    if "intensities_raw" in data:
+    if labels_column_name in data:
         data["spectral_angle"] = get_spectral_angle(
-            np.stack(data["intensities_raw"].to_numpy()).astype(np.float32),
+            np.stack(data[labels_column_name].to_numpy()).astype(np.float32),
             intensities,
             batch_size=batch_size,
         )

From 1b907d3b266ec4742c2409e16ec82afb048b1dcb Mon Sep 17 00:00:00 2001
From: omsh <omar.shouman@gmail.com>
Date: Mon, 20 Jan 2025 20:25:59 +0200
Subject: [PATCH 2/2] version 0.1.7

---
 src/dlomix/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dlomix/__init__.py b/src/dlomix/__init__.py
index 3a3cdd2c..9689224c 100644
--- a/src/dlomix/__init__.py
+++ b/src/dlomix/__init__.py
@@ -1,10 +1,10 @@
-__version__ = "0.1.6"
+__version__ = "0.1.7"
 
 META_DATA = {
-    "author": "Omar Shouman",
+    "author": "Wilhelm Lab",
     "author_email": "o.shouman@tum.de",
     "description": "Deep Learning for Proteomics",
     "package_name": "DLOmix",
-    "copyright_text": "2024, Wilhelm Lab, TU Munich.",
+    "copyright_text": "2025, Wilhelm Lab, TU Munich, School of Life Sciences",
     "github_url": "https://github.com/wilhelm-lab/dlomix",
 }