diff --git a/pints/calling_engine.py b/pints/calling_engine.py index f3a8343..c309761 100644 --- a/pints/calling_engine.py +++ b/pints/calling_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding=utf-8 -# PINTS: Peak Identifier for Nascent Transcripts Sequencing +# PINTS: Peak Identifier for Nascent Transcripts Starts # Copyright (c) 2019-2022. Li Yao at the Yu Lab. # # This program is free software: you can redistribute it and/or modify @@ -322,6 +322,7 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_ or window_value == 0: return 1., window_value, 0, 0, (0, 0, 0) flanking = np.asarray(flanking, dtype=int) // 2 + chr_len = len(chromosome_coverage) mus = [] variances = [] pis = [] @@ -333,9 +334,10 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_ qsr = coord_end + 1 qer = coord_end + f bg, x = iqr_obj.remove_peaks_in_local_env(stat_tester=stat_tester, bed_handler=sp_bed_handler, - chromosome=chromosome_name, query_start_left=qsl, - query_end_left=qel, query_start_right=qsr, - query_end_right=qer, small_window_threshold=small_window_threshold, + chromosome=chromosome_name, query_start_left=max(qsl, 0), + query_end_left=max(qel, 0), query_start_right=min(qsr, chr_len), + query_end_right=min(qer, chr_len), + small_window_threshold=small_window_threshold, peak_in_bg_threshold=peak_in_bg_threshold, coverage_info=chromosome_coverage, fdr_target=fdr_target, cache=cache, diff --git a/pints/extension_engine.py b/pints/extension_engine.py index a51f5d6..8cd9e62 100644 --- a/pints/extension_engine.py +++ b/pints/extension_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding=utf-8 -# PINTS: Peak Identifier for Nascent Transcripts Sequencing +# PINTS: Peak Identifier for Nascent Transcripts Starts # Copyright (c) 2019-2022. Li Yao at the Yu Lab. # # This program is free software: you can redistribute it and/or modify diff --git a/pints/qc_engine.py b/pints/qc_engine.py new file mode 100644 index 0000000..8010f03 --- /dev/null +++ b/pints/qc_engine.py @@ -0,0 +1,101 @@ +# coding=utf-8 +import os.path + +# PINTS: Peak Identifier for Nascent Transcripts Starts +# Copyright (c) 2019-2022. Li Yao at the Yu Lab. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +import pyBigWig +from pints.io_engine import parse_gtf + + +def _row_atom(X, pl_handler, mn_handler): + """ + + Parameters + ---------- + X + pl_handler + mn_handler + + Returns + ------- + + """ + if X.strand == "+": + tss_start = X.start - 500 + tss_end = X.start + 500 + gbody_start = tss_end + 1 + gbody_end = X.end - 500 + handler = pl_handler + else: + tss_start = X.end - 500 + tss_end = X.end + 500 + gbody_start = X.start + 500 + gbody_end = tss_start - 1 + handler = mn_handler + + try: + tss_counts = handler.stats(X.seqname, tss_start, tss_end, "sum", exact=True)[0] + except: + tss_counts = 0 + tss_counts = tss_counts if tss_counts is not None else 0 + tss_counts = tss_counts if tss_counts >= 0 else -1*tss_counts + try: + gbody_counts = handler.stats(X.seqname, gbody_start, gbody_end, "sum", exact=True)[0] + except: + gbody_counts = 0 + gbody_counts = gbody_counts if gbody_counts is not None else 0 + gbody_counts = gbody_counts if gbody_counts >= 0 else -1*gbody_counts + return X.gene_name, X.transcript_id, tss_counts, gbody_counts + + +def calculate_gbody_tss_ratio(pl_bw_file, mn_bw_file, reference_gtf): + """ + Calculate read count ratio of gene body to tss regions + + Parameters + ---------- + pl_bw_file : str + Path to the pl bw file + mn_bw_file : str + Path to the mn bw file + reference_gtf : str + Path to the gene annotation gtf file + + Returns + ------- + gb_tss_ratio : float + + """ + if not all([os.path.exists(x) for x in (pl_bw_file, mn_bw_file, reference_gtf)]): + raise IOError("Please make sure pl_bw_file, mn_bw_file and reference_gtf are accessible!") + + ref = parse_gtf(reference_gtf) + expected_cols = {"feature", "transcript_type", "start", "end", "seqname"} + if not all([x in ref.columns for x in expected_cols]): + raise ValueError("The gtf file doesn't contain all required columns.") + ref = ref.loc[(ref.feature == "transcript") & (ref.transcript_type == "protein_coding"), :] + ref = ref.loc[ref.end-ref.start > 2000, :] + + with pyBigWig.open(pl_bw_file) as pl_bw, pyBigWig.open(mn_bw_file) as mn_bw: + results = ref.apply(_row_atom, axis=1, args=(pl_bw, mn_bw), result_type="expand") + results = results.sort_values(by=[0, 2], ascending=False).drop_duplicates(subset=0, keep="first") + + total_counts = (results[2]+results[3]) + gb_tss_ratio = (results[3][ + results[2] > results[2].quantile(0.9) + ]).sum()/total_counts[ + results[2] > results[2].quantile(0.9) + ].sum() + + return gb_tss_ratio + diff --git a/scripts/pints_boundary_extender b/scripts/pints_boundary_extender index 9e24bdf..5f0ef33 100644 --- a/scripts/pints_boundary_extender +++ b/scripts/pints_boundary_extender @@ -42,7 +42,8 @@ logger = logging.getLogger("PINTS - BoundaryExtender") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Element boundary refinement") + parser = argparse.ArgumentParser(description="Element boundary refinement", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--bam-files", action="store", dest="bam_files", nargs="*", type=str, required=False, diff --git a/scripts/pints_caller b/scripts/pints_caller index 24474fc..42721d2 100644 --- a/scripts/pints_caller +++ b/scripts/pints_caller @@ -33,7 +33,8 @@ logger = logging.getLogger("PINTS - Caller") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts") + parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) group = parser.add_argument_group("Input/Output") group.add_argument("--bam-file", action="store", dest="bam_file", nargs="*", type=str, required=False, @@ -125,7 +126,7 @@ if __name__ == "__main__": help="If --annotation-gtf is specified, you use this parameter to change which chromosome the " "tool should learn the values from.") group.add_argument("--alpha", "--donor-tolerance", action="store", dest="donor_tolerance", - type=float, required=False, default=0.3, + type=float, required=False, default=0.3, help="The stringency for PINTS to cluster nearby TSSs into a peak. 0 is the least stringent; " "1 is the most stringent.") group.add_argument("--ce-trigger", action="store", dest="ce_trigger", @@ -175,7 +176,7 @@ if __name__ == "__main__": help="Refine peak calls with compiled epigenomic annotation from the PINTS web server." " Values should be the name of the biosample, for example, K562.") group.add_argument("--relaxed-fdr-target", action="store", dest="relaxed_fdr_target", default=0.2, - type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with ") + type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with epigenomic annotations") group.add_argument("--chromosome-start-with", action="store", dest="chromosome_startswith", type=str, required=False, default="chr", help="Only keep reads mapped to chromosomes with this prefix") diff --git a/scripts/pints_normalizer b/scripts/pints_normalizer index 0bb6b23..7e7736b 100644 --- a/scripts/pints_normalizer +++ b/scripts/pints_normalizer @@ -148,7 +148,7 @@ def RPM_normalization(bam_files, exp_type, output_dir, rc=False, **kwargs): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-m", "--method", help="Normalization method, acceptable values: RPM or Spikein") parser.add_argument("-b", "--bam", nargs="+", required=True) parser.add_argument("-f", "--fasta-spike-in") diff --git a/scripts/pints_sample_qc b/scripts/pints_sample_qc new file mode 100644 index 0000000..91f8362 --- /dev/null +++ b/scripts/pints_sample_qc @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# coding=utf-8 + +# PINTS: Peak Identifier for Nascent Transcripts Starts +# Copyright (c) 2019-2022. Li Yao at the Yu Lab. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +import argparse +import logging +from pints.qc_engine import calculate_gbody_tss_ratio + +logging.basicConfig(format="%(name)s - %(asctime)s - %(levelname)s: %(message)s", + datefmt="%d-%b-%y %H:%M:%S", + level=logging.INFO, + handlers=[ + logging.StreamHandler() + ]) +logger = logging.getLogger("PINTS - Sample QC") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--bw-pl", action="store", dest="bw_pl", + type=str, required=True, + help="Bigwig for the plus strand.") + parser.add_argument( + "--bw-mn", action="store", dest="bw_mn", + type=str, required=True, + help="Bigwig for the minus strand.") + parser.add_argument( + "--annotation-gtf", action="store", dest="annotation_gtf", type=str, required=True, + help="Gene annotation file (gtf) format for evaluating TSS enrichment.") + + args = parser.parse_args() + + logger.info("Evaluating the effect of cap selection/TSS enrichment...") + ratio = calculate_gbody_tss_ratio(args.bw_pl, args.bw_mn, args.annotation_gtf) + if ratio > 0.15: + logger.critical(f"- PINTS detected high proportion of reads from gene body regions. ({ratio:.2%})") + logger.critical("- This usually indicates the cap selection is not working as expected.") + elif ratio > 0.1: + logger.warning(f"- PINTS observed higher than expected proportion of reads in gene body regions. ({ratio:.2%})") + logger.warning("- Please proceed with caution.") + else: + logger.info("- PINTS doesn't find any significant deviation...") diff --git a/scripts/pints_visualizer b/scripts/pints_visualizer index 8c150c4..e3984b9 100644 --- a/scripts/pints_visualizer +++ b/scripts/pints_visualizer @@ -147,11 +147,11 @@ def coverage_dict_to_bigwig(pl_dict, mn_dict, output_pref, normalization_factor= if rpm_normalization: logger.info("%d reads passed filter" % (pl_sum + mn_sum)) rpm_scale = 1000 * 1000 / (pl_sum + mn_sum) - logger.info("RPM normalizing (forward strand)") + logger.info("RPM normalizing (forward strand) with scale factor {:.3f}".format(rpm_scale)) pl_ranges[3] = (np.asarray(pl_ranges[3]) * rpm_scale).tolist() logger.info("Done: RPM normalizing (forward strand)") - logger.info("RPM normalizing (reverse strand)") + logger.info("RPM normalizing (reverse strand) with scale factor {:.3f}".format(rpm_scale)) # mn_ranges.Score *= rpm_scale mn_ranges[3] = (np.asarray(mn_ranges[3]) * rpm_scale).tolist() logger.info("Done: RPM normalizing (reverse strand)") @@ -241,7 +241,7 @@ def bam_to_bigwig(bam_file, exp_type, output_pref, normalization_factor, rpm=Fal if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-b", "--bam", action="store", required=True) parser.add_argument("-e", "--exp-type", action="store", default=None, dest="bam_parser", help="Type of experiment, acceptable values are: CoPRO/GROcap/GROseq/PROcap/PROseq, or if you "