FIX transfer learning bug #218

Merged · 8 commits · Jun 13, 2024
Changes from 5 commits
9 changes: 8 additions & 1 deletion alphadia/outputaccumulator.py
@@ -100,6 +100,7 @@ def parse_output_folder(
"mods",
"mod_sites",
"proba",
"decoy",
],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
@@ -132,10 +133,16 @@ def parse_output_folder(
psm_df = psm_df[selected_precursor_columns]
# validate.precursors_flat_from_output(psm_df)

# remove decoy precursors
psm_df = psm_df[psm_df["decoy"] == 0]

self._precursor_df = pd.DataFrame()
for col in psm_df.columns:
self._precursor_df[col] = psm_df[col]

self._precursor_df["decoy"] = self._precursor_df["decoy"].astype(int)
self._precursor_df = psm_df[psm_df["decoy"] == 0].reset_index(drop=True)

# self._precursor_df.set_index('precursor_idx', inplace=True)
# Change the data type of the mods column to string
self._precursor_df["mods"] = self._precursor_df["mods"].astype(str)
@@ -232,7 +239,7 @@ def process_folder(folder):


def error_callback(e):
logger.error(e)
logger.error(e, exc_info=True)


class AccumulationBroadcaster:
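In this file the PR adds `decoy` to the selected precursor columns, drops decoy precursors while parsing the per-run output folders, and lets the multiprocessing error callback log exception details. A minimal sketch of both patterns; `keep_targets` is a hypothetical helper name, not part of alphaDIA:

```python
import logging

import pandas as pd

logger = logging.getLogger(__name__)


def keep_targets(psm_df: pd.DataFrame) -> pd.DataFrame:
    # hypothetical helper mirroring the filter above:
    # keep only target PSMs (decoy == 0) and reindex the result
    psm_df = psm_df.copy()
    psm_df["decoy"] = psm_df["decoy"].astype(int)
    return psm_df[psm_df["decoy"] == 0].reset_index(drop=True)


def error_callback(e):
    # exc_info=True attaches exception information to the log record,
    # so a worker failure is not reduced to a one-line message
    logger.error(e, exc_info=True)
```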
12 changes: 8 additions & 4 deletions alphadia/workflow/peptidecentric.py
@@ -1117,6 +1117,9 @@ def _build_candidate_speclib_flat(
"mod_sites",
"sequence",
"charge",
"rt_observed",
"mobility_observed",
"mz_observed",
],
) -> typing.Tuple[SpecLibFlat, pd.DataFrame]:
"""Build a candidate spectral library for transfer learning.
@@ -1149,6 +1152,7 @@
"mod_sites",
"sequence",
"charge",
"rt_observed", "mobility_observed", "mz_observed"
]

Returns
@@ -1160,13 +1164,13 @@
Dataframe with scored candidates
"""
# remove decoys
psm_df = psm_df[psm_df["decoy"] == 0]
# psm_df = psm_df[psm_df["decoy"] == 0]

for col in ["rt_observed", "mobility_observed", "mz_observed"]:
optional_columns += [col] if col in psm_df.columns else []
# make copy to avoid modifying the original dataframe
_optional_columns = [col for col in optional_columns if col in psm_df.columns]

scored_candidates = plexscoring.candidate_features_to_candidates(
psm_df, optional_columns=optional_columns
psm_df, optional_columns=_optional_columns
)

# create speclib with fragment_types of interest
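Here the decoy filter is commented out (decoys are now removed when the per-run outputs are parsed), and the `+=` loop over `optional_columns` is replaced by a list comprehension that keeps only the columns actually present in `psm_df` without touching the mutable default argument. A short sketch of that selection pattern; `select_present_columns` and the toy dataframe are illustrative only:

```python
import pandas as pd


def select_present_columns(psm_df: pd.DataFrame, optional_columns: list) -> list:
    # keep only the optional columns that exist in psm_df,
    # leaving the caller's (possibly shared default) list untouched
    return [col for col in optional_columns if col in psm_df.columns]


df = pd.DataFrame({"rt_observed": [12.3], "mz_observed": [502.7]})
print(select_present_columns(df, ["rt_observed", "mobility_observed", "mz_observed"]))
# -> ['rt_observed', 'mz_observed']
```

The old `optional_columns += [...]` form modifies the list in place, so when the default argument is used the default list itself grows across calls; the comprehension builds a fresh list instead.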
7 changes: 5 additions & 2 deletions tests/unit_tests/conftest.py
@@ -13,6 +13,7 @@

def mock_precursor_df(
n_precursor: int = 100,
with_decoy=True,
) -> pd.DataFrame:
Create a mock precursor dataframe as it's found in the individual search outputs

@@ -30,7 +31,6 @@ def mock_precursor_df(
"""

precursor_idx = np.arange(n_precursor)
decoy = np.zeros(n_precursor)
precursor_mz = np.random.rand(n_precursor) * 2000 + 500
precursor_charge = np.random.choice([2, 3], size=n_precursor)

@@ -40,7 +40,10 @@
proteins = np.random.choice(protein_names, size=n_precursor)
genes = proteins

decoy = np.concatenate([np.zeros(n_precursor // 2), np.ones(n_precursor // 2)])
if with_decoy:
decoy = np.concatenate([np.zeros(n_precursor // 2), np.ones(n_precursor // 2)])
else:
decoy = np.zeros(n_precursor)
proba = np.zeros(n_precursor) + decoy * np.random.rand(n_precursor)
qval = np.random.rand(n_precursor) * 10e-3

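`mock_precursor_df` now exposes a `with_decoy` flag and, when it is set, marks the second half of the mock precursors as decoys instead of generating targets only. A usage sketch, assuming the returned dataframe carries the `decoy` column built above:

```python
from conftest import mock_precursor_df

# half targets, half decoys
df = mock_precursor_df(n_precursor=100, with_decoy=True)
assert (df["decoy"] == 0).sum() == 50
assert (df["decoy"] == 1).sum() == 50

# targets only
df_targets = mock_precursor_df(n_precursor=100, with_decoy=False)
assert (df_targets["decoy"] == 0).all()
```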
14 changes: 7 additions & 7 deletions tests/unit_tests/test_outputaccumulator.py
@@ -1,6 +1,7 @@
import os
import tempfile
import numpy as np
import pandas as pd
from conftest import mock_precursor_df, mock_fragment_df
from alphadia import outputtransform
from alphabase.spectral_library.base import SpecLibBase
@@ -67,7 +68,7 @@ def prepare_input_data():
# setup raw folders
raw_folders = [os.path.join(progress_folder, run) for run in run_columns]

psm_base_df = mock_precursor_df(n_precursor=100)
psm_base_df = mock_precursor_df(n_precursor=100, with_decoy=True)
fragment_base_df = mock_fragment_df(n_precursor=200, n_fragments=10)

psm_dfs = []
@@ -122,12 +123,11 @@ def test_complete_output_accumulation():
os.path.join(temp_folder, f"{output.TRANSFER_OUTPUT}.hdf"), load_mod_seq=True
)

# Then: all unique precursors should be in the built library
number_of_unique_precursors = len(
np.unique(
np.concatenate([psm_df["precursor_idx"].values for psm_df in psm_dfs])
)
)
# Then: all unique non-decoy precursors should be in the built library
union_psm_df = pd.concat(psm_dfs)
union_psm_df = union_psm_df[union_psm_df["decoy"] == 0]
number_of_unique_precursors = len(np.unique(union_psm_df["precursor_idx"]))

assert (
len(np.unique(built_lib.precursor_df["precursor_idx"]))
== number_of_unique_precursors
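The assertion now counts only non-decoy precursors: the per-run PSM tables are concatenated, decoys are filtered out, and the number of unique `precursor_idx` values is compared with the built library. A self-contained sketch of that counting pattern on toy data:

```python
import numpy as np
import pandas as pd

# toy stand-ins for the per-run PSM dataframes prepared in the test
psm_dfs = [
    pd.DataFrame({"precursor_idx": [0, 1, 2], "decoy": [0, 0, 1]}),
    pd.DataFrame({"precursor_idx": [1, 2, 3], "decoy": [0, 1, 0]}),
]

union_psm_df = pd.concat(psm_dfs)
union_psm_df = union_psm_df[union_psm_df["decoy"] == 0]
number_of_unique_precursors = len(np.unique(union_psm_df["precursor_idx"]))
print(number_of_unique_precursors)  # -> 3 (precursors 0, 1 and 3 are targets)
```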