forked from hyulab/PINTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Added]: QC script for evaluating TSS vs gene body enrichment
[Changed]: Report scale factors when using pints_visualizer [Changed]: Switch to `argparse.ArgumentDefaultsHelpFormatter` so that default values can be easily seen [Fixed]: Issue hyulab#6
- Loading branch information
Showing
8 changed files
with
172 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
# coding=utf-8 | ||
import os.path | ||
|
||
# PINTS: Peak Identifier for Nascent Transcripts Starts | ||
# Copyright (c) 2019-2022. Li Yao at the Yu Lab. | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
import pyBigWig | ||
from pints.io_engine import parse_gtf | ||
|
||
|
||
def _abs_signal_sum(handler, seqname, start, end):
    """
    Return the absolute summed signal of ``handler`` over one interval.

    Parameters
    ----------
    handler
        Object providing ``stats(chrom, start, end, "sum", exact=True)``
        that returns a sequence whose first item is the sum (or None).
    seqname
        Chromosome / contig name forwarded to ``handler.stats``.
    start : int
        Interval start; negative values are clamped to 0.
    end : int
        Interval end.

    Returns
    -------
    Absolute value of the summed signal, or 0 when the query fails or
    reports no data (None).
    """
    try:
        # Clamp so that a window reaching past the chromosome start is still
        # counted instead of failing as an invalid interval.
        total = handler.stats(seqname, max(start, 0), end, "sum", exact=True)[0]
    except Exception:
        # Best effort: a failed query counts as "no signal". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    if total is None:
        return 0
    # The sign is dropped so that negative-valued tracks count like
    # positive ones.
    return abs(total)


def _row_atom(X, pl_handler, mn_handler):
    """
    Count signal around the TSS and in the gene body of one transcript.

    The TSS window spans 500 bp on either side of the transcript's 5' end
    (``X.start`` when ``X.strand`` is "+", ``X.end`` otherwise). The gene
    body window covers the rest of the transcript, leaving out the last
    500 bp at the 3' end.

    Parameters
    ----------
    X
        Record exposing ``strand``, ``start``, ``end``, ``seqname``,
        ``gene_name`` and ``transcript_id`` as attributes.
    pl_handler
        Signal handler queried when ``X.strand`` is "+".
    mn_handler
        Signal handler queried for any other strand value.

    Returns
    -------
    tuple
        ``(gene_name, transcript_id, tss_counts, gbody_counts)``
    """
    flank = 500
    if X.strand == "+":
        tss_start = X.start - flank
        tss_end = X.start + flank
        gbody_start = tss_end + 1
        gbody_end = X.end - flank
        handler = pl_handler
    else:
        tss_start = X.end - flank
        tss_end = X.end + flank
        gbody_start = X.start + flank
        gbody_end = tss_start - 1
        handler = mn_handler

    tss_counts = _abs_signal_sum(handler, X.seqname, tss_start, tss_end)
    gbody_counts = _abs_signal_sum(handler, X.seqname, gbody_start, gbody_end)
    return X.gene_name, X.transcript_id, tss_counts, gbody_counts
|
||
|
||
def calculate_gbody_tss_ratio(pl_bw_file, mn_bw_file, reference_gtf):
    """
    Calculate read count ratio of gene body to tss regions

    Only records with ``feature == "transcript"`` and
    ``transcript_type == "protein_coding"`` whose ``end - start`` exceeds
    2000 are evaluated. For each gene name, the transcript with the highest
    TSS count is kept. The ratio is then computed over the transcripts whose
    TSS count is above the 90th percentile, as
    ``gene body counts / (TSS counts + gene body counts)``.

    Parameters
    ----------
    pl_bw_file : str
        Path to the pl bw file
    mn_bw_file : str
        Path to the mn bw file
    reference_gtf : str
        Path to the gene annotation gtf file

    Returns
    -------
    gb_tss_ratio : float
        Fraction of counts falling in gene bodies. NaN when the selected
        transcripts carry no signal at all.

    Raises
    ------
    IOError
        If any of the three input paths does not exist.
    ValueError
        If the parsed gtf lacks a required column, or if no transcript
        passes the filters.
    """
    if not all(os.path.exists(x) for x in (pl_bw_file, mn_bw_file, reference_gtf)):
        raise IOError("Please make sure pl_bw_file, mn_bw_file and reference_gtf are accessible!")

    ref = parse_gtf(reference_gtf)
    # strand, gene_name and transcript_id are read per row by _row_atom, so
    # check them up front instead of failing mid-way with an AttributeError.
    expected_cols = {"feature", "transcript_type", "start", "end", "seqname",
                     "strand", "gene_name", "transcript_id"}
    if not expected_cols.issubset(ref.columns):
        raise ValueError("The gtf file doesn't contain all required columns.")
    ref = ref.loc[(ref.feature == "transcript") & (ref.transcript_type == "protein_coding"), :]
    # Require transcripts long enough that the TSS window and the gene body
    # window cannot overlap.
    ref = ref.loc[ref.end - ref.start > 2000, :]
    if ref.empty:
        raise ValueError("The gtf file doesn't contain any protein-coding transcript longer than 2000 bp.")

    with pyBigWig.open(pl_bw_file) as pl_bw, pyBigWig.open(mn_bw_file) as mn_bw:
        # Result columns: 0 = gene name, 1 = transcript id,
        # 2 = TSS counts, 3 = gene body counts.
        results = ref.apply(_row_atom, axis=1, args=(pl_bw, mn_bw), result_type="expand")
    # Sorting in descending order and keeping the first row per gene name
    # retains the transcript with the highest TSS count for each gene.
    results = results.sort_values(by=[0, 2], ascending=False).drop_duplicates(subset=0, keep="first")

    tss_counts = results[2]
    gbody_counts = results[3]
    total_counts = tss_counts + gbody_counts
    # Restrict to the top decile by TSS counts; compute the mask only once.
    top_decile = tss_counts > tss_counts.quantile(0.9)
    gb_tss_ratio = gbody_counts[top_decile].sum() / total_counts[top_decile].sum()

    return gb_tss_ratio
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python | ||
# coding=utf-8 | ||
|
||
# PINTS: Peak Identifier for Nascent Transcripts Starts | ||
# Copyright (c) 2019-2022. Li Yao at the Yu Lab. | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
import argparse | ||
import logging | ||
from pints.qc_engine import calculate_gbody_tss_ratio | ||
|
||
# Send INFO-and-above records to a stream handler, prefixed with the logger
# name, a day-month-year timestamp and the severity level.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
    datefmt="%d-%b-%y %H:%M:%S",
    format="%(name)s - %(asctime)s - %(levelname)s: %(message)s",
)
# Logger used by this script for all of its messages.
logger = logging.getLogger("PINTS - Sample QC")
|
||
|
||
if __name__ == "__main__":
    # Imported here because it is only needed when running as a script.
    import math

    # Command-line entry point: take the strand-specific bigwig files and a
    # gene annotation, then report how much signal falls in gene bodies.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--bw-pl", action="store", dest="bw_pl",
        type=str, required=True,
        help="Bigwig for the plus strand.")
    parser.add_argument(
        "--bw-mn", action="store", dest="bw_mn",
        type=str, required=True,
        help="Bigwig for the minus strand.")
    parser.add_argument(
        "--annotation-gtf", action="store", dest="annotation_gtf", type=str, required=True,
        help="Gene annotation file (gtf) format for evaluating TSS enrichment.")

    args = parser.parse_args()

    logger.info("Evaluating the effect of cap selection/TSS enrichment...")
    ratio = calculate_gbody_tss_ratio(args.bw_pl, args.bw_mn, args.annotation_gtf)
    if math.isnan(ratio):
        # A NaN ratio means no signal was found in any evaluated region.
        # Every numeric comparison below would be False, so without this
        # branch the sample would be reported as passing.
        logger.critical("- PINTS couldn't evaluate TSS enrichment: no signal was found in the evaluated regions.")
        logger.critical("- Please make sure the bigwig files and the annotation use the same chromosome names.")
    elif ratio > 0.15:
        logger.critical(f"- PINTS detected high proportion of reads from gene body regions. ({ratio:.2%})")
        logger.critical("- This usually indicates the cap selection is not working as expected.")
    elif ratio > 0.1:
        logger.warning(f"- PINTS observed higher than expected proportion of reads in gene body regions. ({ratio:.2%})")
        logger.warning("- Please proceed with caution.")
    else:
        logger.info("- PINTS doesn't find any significant deviation...")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters