From 3a3e5ea4ac08f86f5deae89dbea4a7412192eae8 Mon Sep 17 00:00:00 2001 From: Siegfried Gessulat Date: Wed, 28 Aug 2024 16:53:32 +0200 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20draft=20of=20pin=20to=20tsv=20c?= =?UTF-8?q?onverter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mokapot/writers/pin_to_tsv.py | 197 ++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 mokapot/writers/pin_to_tsv.py diff --git a/mokapot/writers/pin_to_tsv.py b/mokapot/writers/pin_to_tsv.py new file mode 100644 index 00000000..50257db1 --- /dev/null +++ b/mokapot/writers/pin_to_tsv.py @@ -0,0 +1,197 @@ +from pathlib import Path +from io import StringIO +from typing import TextIO +from unittest.mock import Mock + +import argparse + +# PIN file specification from +# https://github.com/percolator/percolator/wiki/Interface#tab-delimited-file-format +""" +PSMId Label ScanNr feature1name ... featureNname Peptide Proteins +DefaultDirection - - feature1weight ... featureNweight [optional] +""" + +EXAMPLE_PIN = """SpecId\tLabel\tScanNr\tExpMass\tPeptide\tProteins +target_0_16619_2_-1\t1\t16619\t750.4149\tK.SEFLVR.E\tsp|Q96QR8|PURB_HUMAN\tsp|Q00577|PURA_HUMAN +target_0_2025_2_-1\t1\t2025\t751.4212\tR.HTALGPR.S\tsp|Q9Y4H4|GPSM3_HUMAN""" +EXAMPLE_HEADER, EXAMPLE_LINE_1, EXAMPLE_LINE_2 = EXAMPLE_PIN.split('\n') + +PIN_SEP = '\t' + + +def parse_pin_header_columns( + header: str, + sep_column: str = PIN_SEP, + +) -> (int, int): + """ + Parse the header of a PIN file to get the number of columns and the index of the + Proteins column. + + Parameters + ---------- + header : str + The header line from the PIN file. + sep_column : str, optional + Column separator (default is PIN_SEP). + + Returns + ------- + n_col : int + The total number of columns in the PIN file. + idx_protein_col : int + The index of the 'Proteins' column. 
+ + Examples + -------- + >>> n_col, idx_protein_col = parse_pin_header_columns(EXAMPLE_HEADER) + >>> n_col, idx_protein_col + (6, 5) + """ + columns = header.split(sep_column) + assert "Proteins" in columns + n_col = len(columns) + idx_protein_col = columns.index("Proteins") + return n_col, idx_protein_col + + +def convert_line_pin_to_tsv( + line: str, + idx_protein_col: int, + n_col: int, + sep_column: str = "\t", + sep_protein: str = ":" +): + """ + Convert a single line from a PIN file format to a TSV format. + + Parameters + ---------- + line : str + A single line from the PIN file. + idx_protein_col : int + The index of the first protein column. + n_col : int + The total number of columns in the PIN file (excluding additional protein columns). + sep_column : str, optional + The separator used between columns (default is "\t"). + sep_protein : str, optional + The separator to use between multiple proteins (default is ":"). + + Returns + ------- + str + The converted line in TSV format. 
+ + Examples + -------- + >>> header = EXAMPLE_HEADER + >>> n_col, idx_protein_col = parse_pin_header_columns(header) + >>> tsv_line = convert_line_pin_to_tsv(EXAMPLE_LINE_1, n_col=n_col, idx_protein_col=idx_protein_col) + >>> tsv_line.expandtabs(4) # needed for docstring to work + 'target_0_16619_2_-1 1 16619 750.4149 K.SEFLVR.E sp|Q96QR8|PURB_HUMAN:sp|Q00577|PURA_HUMAN' + >>> tsv_line = convert_line_pin_to_tsv(EXAMPLE_LINE_2, n_col=n_col, idx_protein_col=idx_protein_col) + >>> tsv_line.expandtabs(4) # needed for docstring to work + 'target_0_2025_2_-1 1 2025 751.4212 R.HTALGPR.S sp|Q9Y4H4|GPSM3_HUMAN' + """ + elements = line.split(sep=sep_column) # this contains columns and proteins + n_proteins = len(elements) - n_col + idx_prot_start = idx_protein_col + idx_prot_end = idx_protein_col + n_proteins + 1 + proteins: str = sep_protein.join(elements[idx_prot_start:idx_prot_end]) + columns: list = elements[:idx_prot_start] + [proteins] + elements[idx_prot_end:] + tsv_line: str = sep_column.join(columns) + return tsv_line + + +def pin_to_valid_tsv( + f_in: TextIO, + f_out: TextIO, + sep_column: str = PIN_SEP, + sep_protein: str = ":" +) -> None: + """ + Convert a PIN file to a valid TSV file. + + This assumes that the input file is in PIN format and that the first line + is a header. It preserves the header in the output file and ignores the second line + if it starts with "DefaultDirection". + + Parameters + ---------- + f_in : TextIO + Input file object to read from. + f_out : TextIO + Output file object to write to. + sep_column : str, optional + Column separator (default is PIN_SEP). + sep_protein : str, optional + Protein separator (default is ":"). 
+ + Returns + ------- + None + + Examples + -------- + >>> mock_input = StringIO(EXAMPLE_PIN) + >>> mock_output = Mock() + >>> mock_output.write = Mock() + >>> pin_to_valid_tsv(mock_input, mock_output) + >>> mock_output.write.call_count + 3 + >>> mock_output.write.assert_any_call(EXAMPLE_HEADER + "\\n") + """ + header: str = next(f_in).strip() + f_out.write(header + "\n") + n_col, idx_protein_col = parse_pin_header_columns(header, sep_column) + + # Optionally, the second line of a PIN file might declare DefaultDirection + # This is ignored with this conversion + # https://github.com/percolator/percolator/wiki/Interface#pintsv-tab-delimited-file-format + second_line = next(f_in).strip() + + if not second_line.startswith("DefaultDirection"): + tsv_line: str = convert_line_pin_to_tsv( + second_line, + n_col=n_col, + idx_protein_col=idx_protein_col, + sep_column=sep_column, + sep_protein=sep_protein + ) + f_out.write(tsv_line + "\n") + + for line in f_in: + line = line.strip() + tsv_line: str = convert_line_pin_to_tsv( + line, + n_col=n_col, + idx_protein_col=idx_protein_col, + sep_column=sep_column, + sep_protein=sep_protein + ) + f_out.write(tsv_line + "\n") + + +def main(): + parser = argparse.ArgumentParser(description="Convert PIN file to valid TSV") + parser.add_argument("path_in", type=Path, help="Input PIN file path") + parser.add_argument("path_out", type=Path, help="Output TSV file path") + parser.add_argument("--sep_column", type=str, default="\t", + help="Column separator (default: '\\t')") + parser.add_argument("--sep_protein", type=str, default=":", + help="Protein separator (default: ':')") + args = parser.parse_args() + with open(args.path_in, 'r') as f_in: + with open(args.path_out, 'a') as f_out: + pin_to_valid_tsv( + f_in=f_in, + f_out=f_out, + sep_column=args.sep_column, + sep_protein=args.sep_protein + ) + + +if __name__ == "__main__": + main() From 00a3c649a67fef1f4e5c8518c69d10845ae00389 Mon Sep 17 00:00:00 2001 From: Siegfried Gessulat Date: 
Wed, 28 Aug 2024 17:07:40 +0200 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=A8=20adds=20is=5Fvalid=5Ftsv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mokapot/writers/pin_to_tsv.py | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/mokapot/writers/pin_to_tsv.py b/mokapot/writers/pin_to_tsv.py index 50257db1..cb8b826c 100644 --- a/mokapot/writers/pin_to_tsv.py +++ b/mokapot/writers/pin_to_tsv.py @@ -105,6 +105,53 @@ def convert_line_pin_to_tsv( return tsv_line +def is_valid_tsv( + f_in: TextIO, + sep_column: str = PIN_SEP +) -> bool: + """ + This function verifies that: + 1. All rows have the same number of columns as the header row. + 2. The file does not contain a "DefaultDirection" line as the second line. + + Parameters + ---------- + f_in : TextIO + Input file object to read from. This should be an opened file or file-like + object that supports iteration. + sep_column : str, optional + Column separator (default is PIN_SEP, which is assumed to be a tab character). + + Returns + ------- + bool + True if the file is a valid TSV according to the specified criteria, + False otherwise. 
+ + Examples + -------- + >>> input = StringIO(EXAMPLE_PIN) + >>> is_valid_tsv(input) + False + """ + n_col_header = len(next(f_in).split(sep_column)) + line_2 = next(f_in) + + # check for optional DefaultDirection line + if line_2.startswith("DefaultDirection"): + return False + n_col = len(line_2.split(sep_column)) + if n_col != n_col_header: + return False + + # check if sep_column is really only used for columns + for line in f_in: + n_col = len(line.split(sep_column)) + if n_col != n_col_header: + return False + return True + + def pin_to_valid_tsv( f_in: TextIO, f_out: TextIO, From 3eb62a3414bddb3316e8be08b1e67372ef0c78b7 Mon Sep 17 00:00:00 2001 From: Siegfried Gessulat Date: Wed, 28 Aug 2024 17:48:30 +0200 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=A8=20adds=20tsv=20verification=20for?= =?UTF-8?q?=20pin=20files=20and=20conversion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mokapot/config.py | 7 +++++++ mokapot/mokapot.py | 17 +++++++++++++++++ mokapot/{writers => parsers}/pin_to_tsv.py | 4 ++-- 3 files changed, 26 insertions(+), 2 deletions(-) rename mokapot/{writers => parsers}/pin_to_tsv.py (99%) diff --git a/mokapot/config.py b/mokapot/config.py index ca44052a..fa01545f 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -72,6 +72,13 @@ def _parser(): ), ) + parser.add_argument( + "--verify_pin", + type=bool, + default=True, + help="Verify that PIN input files are valid TSVs. If not convert them.", + ) + parser.add_argument( "-d", "--dest_dir", diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py index abe7e999..c7647a64 100644 --- a/mokapot/mokapot.py +++ b/mokapot/mokapot.py @@ -7,6 +7,7 @@ import sys import time import warnings +import shutil from pathlib import Path import numpy as np @@ -14,6 +15,7 @@ from . 
import __version__ from .config import Config from .parsers.pin import read_pin +from .parsers.pin_to_tsv import is_valid_tsv, pin_to_valid_tsv from .parsers.fasta import read_fasta from .brew import brew from .model import PercolatorModel, load_model @@ -55,6 +57,21 @@ def main(main_args=None): logging.info("Command issued:") logging.info("%s", " ".join(sys.argv)) logging.info("") + + logging.info("Verify PIN format") + logging.info("=================") + if config.verify_pin: + for path_pin in config.psm_files: + with open(path_pin, 'r') as f_pin: + valid_tsv = is_valid_tsv(f_pin) + if not valid_tsv: + logging.info(f"{path_pin} invalid tsv, converting") + path_tsv = f"{path_pin}.tsv" + with open(path_pin, 'r') as f_pin: + with open(path_tsv, 'a') as f_tsv: + pin_to_valid_tsv(f_in=f_pin, f_out=f_tsv) + shutil.move(path_tsv, path_pin) + logging.info("Starting Analysis") logging.info("=================") diff --git a/mokapot/writers/pin_to_tsv.py b/mokapot/parsers/pin_to_tsv.py similarity index 99% rename from mokapot/writers/pin_to_tsv.py rename to mokapot/parsers/pin_to_tsv.py index cb8b826c..c8c7db8f 100644 --- a/mokapot/writers/pin_to_tsv.py +++ b/mokapot/parsers/pin_to_tsv.py @@ -2,7 +2,6 @@ from io import StringIO from typing import TextIO from unittest.mock import Mock - import argparse # PIN file specification from @@ -49,7 +48,8 @@ def parse_pin_header_columns( >>> n_col, idx_protein_col (6, 5) """ - columns = header.split(sep_column) + columns = header.strip().split(sep_column) + print(columns) assert "Proteins" in columns n_col = len(columns) idx_protein_col = columns.index("Proteins") From 47c6c0bbfd88f5ebd4d1e993b24b82cbbaf89c6b Mon Sep 17 00:00:00 2001 From: Siegfried Gessulat Date: Wed, 28 Aug 2024 18:05:13 +0200 Subject: [PATCH 4/5] :pencil: remove print --- mokapot/parsers/pin_to_tsv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mokapot/parsers/pin_to_tsv.py b/mokapot/parsers/pin_to_tsv.py index c8c7db8f..cdf14a29 100644 --- 
a/mokapot/parsers/pin_to_tsv.py +++ b/mokapot/parsers/pin_to_tsv.py @@ -49,7 +49,6 @@ def parse_pin_header_columns( (6, 5) """ columns = header.strip().split(sep_column) - print(columns) assert "Proteins" in columns n_col = len(columns) idx_protein_col = columns.index("Proteins") From 9197db96ef6c9ea4b2f0e59bcce615aa977862ec Mon Sep 17 00:00:00 2001 From: Siegfried Gessulat Date: Wed, 28 Aug 2024 18:16:23 +0200 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=94=A5=20add=20required=20default=20f?= =?UTF-8?q?or=20--dest=5Fdir?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - using Mokapot without --dest_dir was broken before this default was added --- mokapot/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mokapot/config.py b/mokapot/config.py index fa01545f..cecfe5b2 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -83,6 +83,7 @@ def _parser(): "-d", "--dest_dir", type=Path, + default=Path("."), help=( "The directory in which to write the result files. Defaults to " "the current working directory"