Skip to content

Commit

Permalink
[Added]: QC script for evaluating TSS vs gene body enrichment
Browse files Browse the repository at this point in the history
[Changed]: Report scale factors when using pints_visualizer
[Changed]: Switch to `argparse.ArgumentDefaultsHelpFormatter` so that default values can be easily seen
[Fixed]: Issue hyulab#6
  • Loading branch information
liyao001 committed May 11, 2023
1 parent a714662 commit 41281dd
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 13 deletions.
10 changes: 6 additions & 4 deletions pints/calling_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# coding=utf-8

# PINTS: Peak Identifier for Nascent Transcripts Sequencing
# PINTS: Peak Identifier for Nascent Transcripts Starts
# Copyright (c) 2019-2022. Li Yao at the Yu Lab.
#
# This program is free software: you can redistribute it and/or modify
Expand Down Expand Up @@ -322,6 +322,7 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_
or window_value == 0:
return 1., window_value, 0, 0, (0, 0, 0)
flanking = np.asarray(flanking, dtype=int) // 2
chr_len = len(chromosome_coverage)
mus = []
variances = []
pis = []
Expand All @@ -333,9 +334,10 @@ def check_window(coord_start, coord_end, mu_peak, var_peak, pi_peak, chromosome_
qsr = coord_end + 1
qer = coord_end + f
bg, x = iqr_obj.remove_peaks_in_local_env(stat_tester=stat_tester, bed_handler=sp_bed_handler,
chromosome=chromosome_name, query_start_left=qsl,
query_end_left=qel, query_start_right=qsr,
query_end_right=qer, small_window_threshold=small_window_threshold,
chromosome=chromosome_name, query_start_left=max(qsl, 0),
query_end_left=max(qel, 0), query_start_right=min(qsr, chr_len),
query_end_right=min(qer, chr_len),
small_window_threshold=small_window_threshold,
peak_in_bg_threshold=peak_in_bg_threshold,
coverage_info=chromosome_coverage,
fdr_target=fdr_target, cache=cache,
Expand Down
2 changes: 1 addition & 1 deletion pints/extension_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# coding=utf-8

# PINTS: Peak Identifier for Nascent Transcripts Sequencing
# PINTS: Peak Identifier for Nascent Transcripts Starts
# Copyright (c) 2019-2022. Li Yao at the Yu Lab.
#
# This program is free software: you can redistribute it and/or modify
Expand Down
101 changes: 101 additions & 0 deletions pints/qc_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# coding=utf-8
import os.path

# PINTS: Peak Identifier for Nascent Transcripts Starts
# Copyright (c) 2019-2022. Li Yao at the Yu Lab.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import pyBigWig
from pints.io_engine import parse_gtf


def _row_atom(X, pl_handler, mn_handler):
"""
Parameters
----------
X
pl_handler
mn_handler
Returns
-------
"""
if X.strand == "+":
tss_start = X.start - 500
tss_end = X.start + 500
gbody_start = tss_end + 1
gbody_end = X.end - 500
handler = pl_handler
else:
tss_start = X.end - 500
tss_end = X.end + 500
gbody_start = X.start + 500
gbody_end = tss_start - 1
handler = mn_handler

try:
tss_counts = handler.stats(X.seqname, tss_start, tss_end, "sum", exact=True)[0]
except:
tss_counts = 0
tss_counts = tss_counts if tss_counts is not None else 0
tss_counts = tss_counts if tss_counts >= 0 else -1*tss_counts
try:
gbody_counts = handler.stats(X.seqname, gbody_start, gbody_end, "sum", exact=True)[0]
except:
gbody_counts = 0
gbody_counts = gbody_counts if gbody_counts is not None else 0
gbody_counts = gbody_counts if gbody_counts >= 0 else -1*gbody_counts
return X.gene_name, X.transcript_id, tss_counts, gbody_counts


def calculate_gbody_tss_ratio(pl_bw_file, mn_bw_file, reference_gtf):
"""
Calculate read count ratio of gene body to tss regions
Parameters
----------
pl_bw_file : str
Path to the pl bw file
mn_bw_file : str
Path to the mn bw file
reference_gtf : str
Path to the gene annotation gtf file
Returns
-------
gb_tss_ratio : float
"""
if not all([os.path.exists(x) for x in (pl_bw_file, mn_bw_file, reference_gtf)]):
raise IOError("Please make sure pl_bw_file, mn_bw_file and reference_gtf are accessible!")

ref = parse_gtf(reference_gtf)
expected_cols = {"feature", "transcript_type", "start", "end", "seqname"}
if not all([x in ref.columns for x in expected_cols]):
raise ValueError("The gtf file doesn't contain all required columns.")
ref = ref.loc[(ref.feature == "transcript") & (ref.transcript_type == "protein_coding"), :]
ref = ref.loc[ref.end-ref.start > 2000, :]

with pyBigWig.open(pl_bw_file) as pl_bw, pyBigWig.open(mn_bw_file) as mn_bw:
results = ref.apply(_row_atom, axis=1, args=(pl_bw, mn_bw), result_type="expand")
results = results.sort_values(by=[0, 2], ascending=False).drop_duplicates(subset=0, keep="first")

total_counts = (results[2]+results[3])
gb_tss_ratio = (results[3][
results[2] > results[2].quantile(0.9)
]).sum()/total_counts[
results[2] > results[2].quantile(0.9)
].sum()

return gb_tss_ratio

3 changes: 2 additions & 1 deletion scripts/pints_boundary_extender
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ logger = logging.getLogger("PINTS - BoundaryExtender")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Element boundary refinement")
parser = argparse.ArgumentParser(description="Element boundary refinement",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--bam-files", action="store", dest="bam_files", nargs="*",
type=str, required=False,
Expand Down
7 changes: 4 additions & 3 deletions scripts/pints_caller
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ logger = logging.getLogger("PINTS - Caller")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts")
parser = argparse.ArgumentParser(description="Peak Identifier for Nascent Transcripts Starts",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
group = parser.add_argument_group("Input/Output")
group.add_argument("--bam-file", action="store", dest="bam_file", nargs="*",
type=str, required=False,
Expand Down Expand Up @@ -125,7 +126,7 @@ if __name__ == "__main__":
help="If --annotation-gtf is specified, you use this parameter to change which chromosome the "
"tool should learn the values from.")
group.add_argument("--alpha", "--donor-tolerance", action="store", dest="donor_tolerance",
type=float, required=False, default=0.3,
type=float, required=False, default=0.3,
help="The stringency for PINTS to cluster nearby TSSs into a peak. 0 is the least stringent; "
"1 is the most stringent.")
group.add_argument("--ce-trigger", action="store", dest="ce_trigger",
Expand Down Expand Up @@ -175,7 +176,7 @@ if __name__ == "__main__":
help="Refine peak calls with compiled epigenomic annotation from the PINTS web server."
" Values should be the name of the biosample, for example, K562.")
group.add_argument("--relaxed-fdr-target", action="store", dest="relaxed_fdr_target", default=0.2,
type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with ")
type=float, required=False, help="Relaxed FDR cutoff for TREs overlap with epigenomic annotations")
group.add_argument("--chromosome-start-with", action="store", dest="chromosome_startswith",
type=str, required=False, default="chr",
help="Only keep reads mapped to chromosomes with this prefix")
Expand Down
2 changes: 1 addition & 1 deletion scripts/pints_normalizer
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def RPM_normalization(bam_files, exp_type, output_dir, rc=False, **kwargs):


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-m", "--method", help="Normalization method, acceptable values: RPM or Spikein")
parser.add_argument("-b", "--bam", nargs="+", required=True)
parser.add_argument("-f", "--fasta-spike-in")
Expand Down
54 changes: 54 additions & 0 deletions scripts/pints_sample_qc
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python
# coding=utf-8

# PINTS: Peak Identifier for Nascent Transcripts Starts
# Copyright (c) 2019-2022. Li Yao at the Yu Lab.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import argparse
import logging
from pints.qc_engine import calculate_gbody_tss_ratio

logging.basicConfig(format="%(name)s - %(asctime)s - %(levelname)s: %(message)s",
datefmt="%d-%b-%y %H:%M:%S",
level=logging.INFO,
handlers=[
logging.StreamHandler()
])
logger = logging.getLogger("PINTS - Sample QC")


if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--bw-pl", action="store", dest="bw_pl",
type=str, required=True,
help="Bigwig for the plus strand.")
parser.add_argument(
"--bw-mn", action="store", dest="bw_mn",
type=str, required=True,
help="Bigwig for the minus strand.")
parser.add_argument(
"--annotation-gtf", action="store", dest="annotation_gtf", type=str, required=True,
help="Gene annotation file (gtf) format for evaluating TSS enrichment.")

args = parser.parse_args()

logger.info("Evaluating the effect of cap selection/TSS enrichment...")
ratio = calculate_gbody_tss_ratio(args.bw_pl, args.bw_mn, args.annotation_gtf)
if ratio > 0.15:
logger.critical(f"- PINTS detected high proportion of reads from gene body regions. ({ratio:.2%})")
logger.critical("- This usually indicates the cap selection is not working as expected.")
elif ratio > 0.1:
logger.warning(f"- PINTS observed higher than expected proportion of reads in gene body regions. ({ratio:.2%})")
logger.warning("- Please proceed with caution.")
else:
logger.info("- PINTS doesn't find any significant deviation...")
6 changes: 3 additions & 3 deletions scripts/pints_visualizer
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,11 @@ def coverage_dict_to_bigwig(pl_dict, mn_dict, output_pref, normalization_factor=
if rpm_normalization:
logger.info("%d reads passed filter" % (pl_sum + mn_sum))
rpm_scale = 1000 * 1000 / (pl_sum + mn_sum)
logger.info("RPM normalizing (forward strand)")
logger.info("RPM normalizing (forward strand) with scale factor {:.3f}".format(rpm_scale))
pl_ranges[3] = (np.asarray(pl_ranges[3]) * rpm_scale).tolist()
logger.info("Done: RPM normalizing (forward strand)")

logger.info("RPM normalizing (reverse strand)")
logger.info("RPM normalizing (reverse strand) with scale factor {:.3f}".format(rpm_scale))
# mn_ranges.Score *= rpm_scale
mn_ranges[3] = (np.asarray(mn_ranges[3]) * rpm_scale).tolist()
logger.info("Done: RPM normalizing (reverse strand)")
Expand Down Expand Up @@ -241,7 +241,7 @@ def bam_to_bigwig(bam_file, exp_type, output_pref, normalization_factor, rpm=Fal


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-b", "--bam", action="store", required=True)
parser.add_argument("-e", "--exp-type", action="store", default=None, dest="bam_parser",
help="Type of experiment, acceptable values are: CoPRO/GROcap/GROseq/PROcap/PROseq, or if you "
Expand Down

0 comments on commit 41281dd

Please sign in to comment.