[Added]: QC script for evaluating TSS vs gene body enrichment

[Changed]: Report scale factors when using pints_visualizer [Changed]: Switch to `argparse.ArgumentDefaultsHelpFormatter` so that default values can be easily seen [Fixed]: Issue hyulab#6
liyao001 · May 11, 2023 · 41281dd · 41281dd
1 parent a714662
commit 41281dd
Show file tree

Hide file tree

Showing 8 changed files with 172 additions and 13 deletions.
diff --git a/pints/calling_engine.py b/pints/calling_engine.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # coding=utf-8
 
-#  PINTS: Peak Identifier for Nascent Transcripts Sequencing
+#  PINTS: Peak Identifier for Nascent Transcripts Starts
 #  Copyright (c) 2019-2022. Li Yao at the Yu Lab.
 #
 #  This program is free software: you can redistribute it and/or modify
@@ -322,6 +322,7 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_
             or window_value == 0:
         return 1., window_value, 0, 0, (0, 0, 0)
     flanking = np.asarray(flanking, dtype=int) // 2
+    chr_len = len(chromosome_coverage)
     mus = []
     variances = []
     pis = []
@@ -333,9 +334,10 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_
         qsr = coord_end + 1
         qer = coord_end + f
         bg, x = iqr_obj.remove_peaks_in_local_env(stat_tester=stat_tester, bed_handler=sp_bed_handler,
-                                                  chromosome=chromosome_name, query_start_left=qsl,
-                                                  query_end_left=qel, query_start_right=qsr,
-                                                  query_end_right=qer, small_window_threshold=small_window_threshold,
+                                                  chromosome=chromosome_name, query_start_left=max(qsl, 0),
+                                                  query_end_left=max(qel, 0), query_start_right=min(qsr, chr_len),
+                                                  query_end_right=min(qer, chr_len),
+                                                  small_window_threshold=small_window_threshold,
                                                   peak_in_bg_threshold=peak_in_bg_threshold,
                                                   coverage_info=chromosome_coverage,
                                                   fdr_target=fdr_target, cache=cache,

diff --git a/pints/extension_engine.py b/pints/extension_engine.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # coding=utf-8
 
-#  PINTS: Peak Identifier for Nascent Transcripts Sequencing
+#  PINTS: Peak Identifier for Nascent Transcripts Starts
 #  Copyright (c) 2019-2022. Li Yao at the Yu Lab.
 #
 #  This program is free software: you can redistribute it and/or modify

diff --git a/pints/qc_engine.py b/pints/qc_engine.py
@@ -0,0 +1,101 @@
+# coding=utf-8
+import os.path
+
+#  PINTS: Peak Identifier for Nascent Transcripts Starts
+#  Copyright (c) 2019-2022. Li Yao at the Yu Lab.
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+import pyBigWig
+from pints.io_engine import parse_gtf
+
+
+def _row_atom(X, pl_handler, mn_handler):
+    """
+
+    Parameters
+    ----------
+    X
+    pl_handler
+    mn_handler
+
+    Returns
+    -------
+
+    """
+    if X.strand == "+":
+        tss_start = X.start - 500
+        tss_end = X.start + 500
+        gbody_start = tss_end + 1
+        gbody_end = X.end - 500
+        handler = pl_handler
+    else:
+        tss_start = X.end - 500
+        tss_end = X.end + 500
+        gbody_start = X.start + 500
+        gbody_end = tss_start - 1
+        handler = mn_handler
+
+    try:
+        tss_counts = handler.stats(X.seqname, tss_start, tss_end, "sum", exact=True)[0]
+    except:
+        tss_counts = 0
+    tss_counts = tss_counts if tss_counts is not None else 0
+    tss_counts = tss_counts if tss_counts >= 0 else -1*tss_counts
+    try:
+        gbody_counts = handler.stats(X.seqname, gbody_start, gbody_end, "sum", exact=True)[0]
+    except:
+        gbody_counts = 0
+    gbody_counts = gbody_counts if gbody_counts is not None else 0
+    gbody_counts = gbody_counts if gbody_counts >= 0 else -1*gbody_counts
+    return X.gene_name, X.transcript_id, tss_counts, gbody_counts
+
+
+def calculate_gbody_tss_ratio(pl_bw_file, mn_bw_file, reference_gtf):
+    """
+    Calculate read count ratio of gene body to tss regions
+
+    Parameters
+    ----------
+    pl_bw_file : str
+        Path to the pl bw file
+    mn_bw_file : str
+        Path to the mn bw file
+    reference_gtf : str
+        Path to the gene annotation gtf file
+
+    Returns
+    -------
+    gb_tss_ratio : float
+
+    """
+    if not all([os.path.exists(x) for x in (pl_bw_file, mn_bw_file, reference_gtf)]):
+        raise IOError("Please make sure pl_bw_file, mn_bw_file and reference_gtf are accessible!")
+
+    ref = parse_gtf(reference_gtf)
+    expected_cols = {"feature", "transcript_type", "start", "end", "seqname"}
+    if not all([x in ref.columns for x in expected_cols]):
+        raise ValueError("The gtf file doesn't contain all required columns.")
+    ref = ref.loc[(ref.feature == "transcript") & (ref.transcript_type == "protein_coding"), :]
+    ref = ref.loc[ref.end-ref.start > 2000, :]
+
+    with pyBigWig.open(pl_bw_file) as pl_bw, pyBigWig.open(mn_bw_file) as mn_bw:
+        results = ref.apply(_row_atom, axis=1, args=(pl_bw, mn_bw), result_type="expand")
+    results = results.sort_values(by=[0, 2], ascending=False).drop_duplicates(subset=0, keep="first")
+
+    total_counts = (results[2]+results[3])
+    gb_tss_ratio = (results[3][
+        results[2] > results[2].quantile(0.9)
+        ]).sum()/total_counts[
+            results[2] > results[2].quantile(0.9)
+            ].sum()
+
+    return gb_tss_ratio
+
diff --git a/scripts/pints_boundary_extender b/scripts/pints_boundary_extender
@@ -42,7 +42,8 @@ logger = logging.getLogger("PINTS - BoundaryExtender")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Element boundary refinement")
+    parser = argparse.ArgumentParser(description="Element boundary refinement",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument("--bam-files", action="store", dest="bam_files", nargs="*",
                        type=str, required=False,

diff --git a/scripts/pints_caller b/scripts/pints_caller
@@ -33,7 +33,8 @@ logger = logging.getLogger("PINTS - Caller")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts")
+    parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     group = parser.add_argument_group("Input/Output")
     group.add_argument("--bam-file", action="store", dest="bam_file", nargs="*",
                        type=str, required=False,
@@ -125,7 +126,7 @@ if __name__ == "__main__":
                        help="If --annotation-gtf is specified, you use this parameter to change which chromosome the "
                             "tool should learn the values from.")
     group.add_argument("--alpha", "--donor-tolerance", action="store", dest="donor_tolerance",
-                       type=float, required=False, default=0.3, 
+                       type=float, required=False, default=0.3,
                        help="The stringency for PINTS to cluster nearby TSSs into a peak. 0 is the least stringent; "
                             "1 is the most stringent.")
     group.add_argument("--ce-trigger", action="store", dest="ce_trigger",
@@ -175,7 +176,7 @@ if __name__ == "__main__":
                        help="Refine peak calls with compiled epigenomic annotation from the PINTS web server."
                             " Values should be the name of the biosample, for example, K562.")
     group.add_argument("--relaxed-fdr-target", action="store", dest="relaxed_fdr_target", default=0.2,
-                       type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with ")
+                       type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with epigenomic annotations")
     group.add_argument("--chromosome-start-with", action="store", dest="chromosome_startswith",
                        type=str, required=False, default="chr",
                        help="Only keep reads mapped to chromosomes with this prefix")

diff --git a/scripts/pints_normalizer b/scripts/pints_normalizer
@@ -148,7 +148,7 @@ def RPM_normalization(bam_files, exp_type, output_dir, rc=False, **kwargs):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("-m", "--method", help="Normalization method, acceptable values: RPM or Spikein")
     parser.add_argument("-b", "--bam", nargs="+", required=True)
     parser.add_argument("-f", "--fasta-spike-in")

diff --git a/scripts/pints_sample_qc b/scripts/pints_sample_qc
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+#  PINTS: Peak Identifier for Nascent Transcripts Starts
+#  Copyright (c) 2019-2022. Li Yao at the Yu Lab.
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+import argparse
+import logging
+from pints.qc_engine import calculate_gbody_tss_ratio
+
+logging.basicConfig(format="%(name)s - %(asctime)s - %(levelname)s: %(message)s",
+                    datefmt="%d-%b-%y %H:%M:%S",
+                    level=logging.INFO,
+                    handlers=[
+                        logging.StreamHandler()
+                    ])
+logger = logging.getLogger("PINTS - Sample QC")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--bw-pl", action="store", dest="bw_pl",
+        type=str, required=True,
+        help="Bigwig for the plus strand.")
+    parser.add_argument(
+        "--bw-mn", action="store", dest="bw_mn",
+        type=str, required=True,
+        help="Bigwig for the minus strand.")
+    parser.add_argument(
+        "--annotation-gtf", action="store", dest="annotation_gtf", type=str, required=True,
+        help="Gene annotation file (gtf) format for evaluating TSS enrichment.")
+
+    args = parser.parse_args()
+
+    logger.info("Evaluating the effect of cap selection/TSS enrichment...")
+    ratio = calculate_gbody_tss_ratio(args.bw_pl, args.bw_mn, args.annotation_gtf)
+    if ratio > 0.15:
+        logger.critical(f"- PINTS detected high proportion of reads from gene body regions. ({ratio:.2%})")
+        logger.critical("- This usually indicates the cap selection is not working as expected.")
+    elif ratio > 0.1:
+        logger.warning(f"- PINTS observed higher than expected proportion of reads in gene body regions. ({ratio:.2%})")
+        logger.warning("- Please proceed with caution.")
+    else:
+        logger.info("- PINTS doesn't find any significant deviation...")
diff --git a/scripts/pints_visualizer b/scripts/pints_visualizer
@@ -147,11 +147,11 @@ def coverage_dict_to_bigwig(pl_dict, mn_dict, output_pref, normalization_factor=
     if rpm_normalization:
         logger.info("%d reads passed filter" % (pl_sum + mn_sum))
         rpm_scale = 1000 * 1000 / (pl_sum + mn_sum)
-        logger.info("RPM normalizing (forward strand)")
+        logger.info("RPM normalizing (forward strand) with scale factor {:.3f}".format(rpm_scale))
         pl_ranges[3] = (np.asarray(pl_ranges[3]) * rpm_scale).tolist()
         logger.info("Done: RPM normalizing (forward strand)")
 
-        logger.info("RPM normalizing (reverse strand)")
+        logger.info("RPM normalizing (reverse strand) with scale factor {:.3f}".format(rpm_scale))
         # mn_ranges.Score *= rpm_scale
         mn_ranges[3] = (np.asarray(mn_ranges[3]) * rpm_scale).tolist()
         logger.info("Done: RPM normalizing (reverse strand)")
@@ -241,7 +241,7 @@ def bam_to_bigwig(bam_file, exp_type, output_pref, normalization_factor, rpm=Fal
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("-b", "--bam", action="store", required=True)
     parser.add_argument("-e", "--exp-type", action="store", default=None, dest="bam_parser",
                         help="Type of experiment, acceptable values are: CoPRO/GROcap/GROseq/PROcap/PROseq, or if you "