Add functionality to save mokapot models #200

Closed · wants to merge 4 commits
2 changes: 2 additions & 0 deletions ms2rescore/feature_generators/__init__.py
@@ -8,6 +8,7 @@
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
from peak_pack.feature_generators import PeakFeatureGenerator

FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
@@ -16,4 +17,5 @@
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
"im2deep": IM2DeepFeatureGenerator,
"peak_fgen": PeakFeatureGenerator
}
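
For context, the new "peak_fgen" entry only takes effect where this registry is consumed. A minimal sketch of that lookup, assuming the config keys mirror the registry keys and that each generator accepts its config section as keyword arguments (not the exact ms2rescore code):

```python
# Sketch: build feature generators from a config mapping keyed like FEATURE_GENERATORS.
# The config structure and kwargs-passing are assumptions for illustration.
from ms2rescore.feature_generators import FEATURE_GENERATORS


def build_feature_generators(config: dict) -> list:
    generators = []
    for name, settings in config.get("feature_generators", {}).items():
        try:
            generator_class = FEATURE_GENERATORS[name]  # e.g. "peak_fgen" -> PeakFeatureGenerator
        except KeyError:
            raise ValueError(f"Unknown feature generator: {name}")
        generators.append(generator_class(**settings))
    return generators
```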
51 changes: 39 additions & 12 deletions ms2rescore/feature_generators/deeplc.py
@@ -109,30 +109,24 @@ def feature_names(self) -> List[str]:
"rt_diff_best",
]

def add_features(self, psm_list: PSMList) -> None:
"""Add DeepLC-derived features to PSMs."""

logger.info("Adding DeepLC-derived features to PSMs.")
def retrain_deeplc(self, psm_list: PSMList) -> None:
logger.info("Transfer learning of DeepLC model")

# Get easy-access nested version of PSMList
psm_dict = psm_list.get_psm_dict()

# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())
assert total_runs == 1

# Only one iteration
for runs in psm_dict.values():
# Reset DeepLC predictor for each collection of runs
self.deeplc_predictor = None
self.selected_model = None
for run, psms in runs.items():
peptide_rt_diff_dict = defaultdict(
lambda: {
"observed_retention_time_best": np.Inf,
"predicted_retention_time_best": np.Inf,
"rt_diff_best": np.Inf,
}
)

logger.info(
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
@@ -163,8 +157,41 @@ def add_features(self, psm_list: PSMList) -> None:
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)

def add_features(self, psm_list: PSMList) -> None:
"""Add DeepLC-derived features to PSMs."""

logger.info("Adding DeepLC-derived features to PSMs.")

# Get easy-access nested version of PSMList
psm_dict = psm_list.get_psm_dict()

# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())
assert total_runs == 1
assert self.deeplc_predictor is not None

for runs in psm_dict.values():
for run, psms in runs.items():
peptide_rt_diff_dict = defaultdict(
lambda: {
"observed_retention_time_best": np.Inf,
"predicted_retention_time_best": np.Inf,
"rt_diff_best": np.Inf,
}
)
logger.info(
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)

# Disable wild logging to stdout by Tensorflow, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w", encoding="utf-8")
) if not self._verbose else contextlib.nullcontext():
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

logger.debug("Predicting retention times...")
predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run))
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)
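
With the transfer-learning step split out of `add_features`, the predictor must be fitted before features are added (the new assertion enforces this). A rough usage sketch; the class name, constructor call, and placeholder PSM lists are assumptions based on the surrounding generators:

```python
# Hypothetical call order for the split DeepLC API.
from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator

generator = DeepLCFeatureGenerator(**config["deeplc"])  # "deeplc" config section, placeholder
generator.retrain_deeplc(calibration_psms)  # single-run PSMList; fits self.deeplc_predictor
generator.add_features(psm_list)            # now asserts the predictor exists, then predicts RTs
```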
42 changes: 38 additions & 4 deletions ms2rescore/package_data/config_default.json
@@ -8,15 +8,44 @@
"ms2_tolerance": 0.02
},
"deeplc": {
"deeplc_retrain": false
"deeplc_retrain": true
},
"maxquant": {}
"maxquant": {},
"peak_fgen": {
"ion_types": [
"a1",
"a2",
"b1",
"b2",
"c1",
"c2",
"y1",
"y2",
"z1",
"z2",
"x1",
"x2",
"p1",
"p2",
"p3"
],
"neutral_losses": [
"",
"-H2O1"
],
"max_workers": -1,
"ion_types_evidence": ["b1", "y1", "b2", "y2"],
"neutral_losses_evidence": ["", "-H2O1"],
"fragment_tol_mass": 20,
"fragment_tol_mode": "ppm"
}
},
"rescoring_engine": {
"mokapot": {
"train_fdr": 0.01,
"write_weights": true,
"write_txt": true
"write_txt": true,
"save_models": true
}
},
"config_file": null,
@@ -40,6 +69,11 @@
"rename_to_usi": false,
"fasta_file": null,
"write_flashlfq": false,
"write_report": false
"write_report": false,
"rescoring_features": [
"hyperscore",
"",
""
]
}
}
3 changes: 2 additions & 1 deletion ms2rescore/package_data/config_default_tims.json
@@ -16,7 +16,8 @@
"rescoring_engine": {
"mokapot": {
"write_weights": true,
"write_txt": true
"write_txt": true,
"save_models": true
}
},
"psm_file": null
72 changes: 72 additions & 0 deletions ms2rescore/package_data/config_schema.json
@@ -183,6 +183,11 @@
"description": "Write a txt report using cProfile for profiling",
"type": "boolean",
"default": false
},
"rescoring_features": {
"description": "Features to include for rescoring",
"type": "array",
"default": ["all"]
}
}
}
@@ -265,6 +270,68 @@
}
}
},
"peak_fgen": {
"$ref": "#/definitions/feature_generator",
"description": "Peak feature generator",
"type": "object",
"additionalProperties": true,
"properties": {
"ion_types": {
"description": "Type of ions to try and annotate",
"type": "array",
"default": [
"a1",
"a2",
"b1",
"b2",
"c1",
"c2",
"y1",
"y2",
"z1",
"z2",
"x1",
"x2",
"p1",
"p2",
"p3"
]
},
"neutral_losses": {
"description": "Neutral losses to include for annotation",
"type": "array",
"default": [
"",
"-H2O1"
]
},
"max_workers": {
"description": "Parallel processes to use for annotation",
"type": "integer",
"default":-1
},
"ion_types_evidence": {
"description": "Ion types to consider for fragmentation site evidence",
"type": "array",
"default": ["b1", "y1", "b2", "y2"]
},
"neutral_losses_evidence": {
"description": "Neutral losses to consider for fragmentation site evidence",
"type": "array",
"default": ["", "-H2O1"]
},
"fragment_tol_mass": {
"description": "Fragmentation tolerance mass for hyperscore calculation",
"type": "integer",
"default": 20
},
"fragment_tol_mode": {
"description": "Fragmentation tolerance mode",
"type": "string",
"default": "ppm"
}
}
},
"im2deep": {
"$ref": "#/definitions/feature_generator",
"description": "Ion mobility feature generator configuration using IM2Deep",
@@ -300,6 +367,11 @@
"description": "Write Mokapot results to a text file",
"type": "boolean",
"default": false
},
"save_models": {
"description": "Save Mokapot models to a pickle file",
"type": "boolean",
"default": false
}
}
},
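
Since both the defaults and the schema gain new keys in this PR, a quick consistency check is to validate one against the other. A sketch using the jsonschema package; reading the files through importlib.resources is an assumption about the package layout:

```python
# Sketch: validate the bundled default config against the updated schema.
import json
from importlib.resources import files

import jsonschema

package_data = files("ms2rescore") / "package_data"
schema = json.loads((package_data / "config_schema.json").read_text())
config = json.loads((package_data / "config_default.json").read_text())

jsonschema.validate(instance=config, schema=schema)  # raises ValidationError on mismatch
print("config_default.json conforms to config_schema.json")
```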
31 changes: 20 additions & 11 deletions ms2rescore/parse_psms.py
@@ -11,7 +11,7 @@
logger = logging.getLogger(__name__)


def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
def parse_psms(config: Dict, psm_list: Union[PSMList, None], recalculate_qvalues=True) -> PSMList:
"""
Parse PSMs and prepare for rescoring.

@@ -43,8 +43,10 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:

# Remove invalid AAs, find decoys, calculate q-values
psm_list = _remove_invalid_aa(psm_list)
_find_decoys(psm_list, config["id_decoy_pattern"])
_calculate_qvalues(psm_list, config["lower_score_is_better"])

if recalculate_qvalues:
_find_decoys(psm_list, config["id_decoy_pattern"])
_calculate_qvalues(psm_list, config["lower_score_is_better"])
if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
_parse_values_from_spectrum_id(
@@ -53,14 +55,21 @@

# Store scoring values for comparison later
for psm in psm_list:
psm.provenance_data.update(
{
"before_rescoring_score": psm.score,
"before_rescoring_qvalue": psm.qvalue,
"before_rescoring_pep": psm.pep,
"before_rescoring_rank": psm.rank,
}
)
if recalculate_qvalues:
psm.provenance_data.update(
{
"before_rescoring_score": psm.score,
"before_rescoring_qvalue": psm.qvalue,
"before_rescoring_pep": psm.pep,
"before_rescoring_rank": psm.rank,
}
)
else:
psm.provenance_data.update(
{
"before_rescoring_score": psm.score,
}
)

logger.debug("Parsing modifications...")
modifications_found = set(
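
For illustration, the new flag would be used when the input PSMs already carry q-values that should be preserved; only the original score is then stored in provenance_data. A minimal call sketch with a placeholder config dictionary:

```python
# Hypothetical call: keep existing decoy labels and q-values instead of recomputing them.
from ms2rescore.parse_psms import parse_psms

psm_list = parse_psms(config, psm_list=None, recalculate_qvalues=False)
```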
5 changes: 5 additions & 0 deletions ms2rescore/rescoring_engines/mokapot.py
@@ -45,6 +45,7 @@ def rescore(
train_fdr: float = 0.01,
write_weights: bool = False,
write_txt: bool = False,
save_models: bool = False,
protein_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> None:
@@ -123,6 +124,10 @@
if write_txt:
confidence_results.to_txt(file_root=output_file_root, decoys=True)

if save_models:
for i, model in enumerate(models):
model.save(output_file_root + f".mokapot.model_{i}.pkl")


def convert_psm_list(
psm_list: psm_utils.PSMList,
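
The saved pickles are only useful if they can be loaded back. A sketch of doing so, assuming mokapot exposes a load_model helper mirroring Model.save, and using the file-name pattern written above with a placeholder file root:

```python
# Sketch: reload models written by `save_models=True` (one pickle per cross-validation fold).
from glob import glob

import mokapot

model_files = sorted(glob("rescored.mokapot.model_*.pkl"))  # placeholder output_file_root
models = [mokapot.load_model(path) for path in model_files]  # each is a trained mokapot.Model
```

These could then be passed back to mokapot for rescoring new PSMs without retraining, though that workflow is outside this diff.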
2 changes: 1 addition & 1 deletion ms2rescore/utils.py
@@ -66,7 +66,7 @@ def infer_spectrum_path(
)

# Match with file extension if not in resolved_path yet
if not _is_minitdf(resolved_path) and not re.match(
if not _is_minitdf(resolved_path) and not re.search(
r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE
):
for filename in glob(resolved_path + "*"):
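
The one-character change in utils.py fixes a real bug: re.match only anchors at the start of the string, so a pattern targeting the file extension at the end never matched and the extension-inference branch always ran. re.search scans the whole string. A minimal illustration:

```python
import re

path = "data/run01.mzML"
pattern = r"\.mgf$|\.mzml$|\.d$"

print(re.match(pattern, path, flags=re.IGNORECASE))   # None: match() anchors at position 0
print(re.search(pattern, path, flags=re.IGNORECASE))  # <re.Match ...>: finds ".mzML" at the end
```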