Skip to content

Commit

Permalink
Merge pull request #303 from ConesaLab/genic_nic
Browse files Browse the repository at this point in the history
Correction of the classification of mono-exons
  • Loading branch information
alexpan00 authored May 30, 2024
2 parents 706a208 + d45de19 commit 8fbc951
Showing 1 changed file with 15 additions and 16 deletions.
31 changes: 15 additions & 16 deletions sqanti3_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ def categorize_incomplete_matches(trec, ref):
if isoform_hit.str_class == "" and trec.chrom in refs_exons_by_chr:
# no hits to single exon genes, let's see if it hits multi-exon genes
# (1) if it overlaps with a ref exon and is contained in an exon, we call it ISM
# (2) else, if it is completely within a ref gene start-end region, we call it NIC by intron retention
# (2) else, if it spans one or more introns, we call it NIC by intron retention
for ref in refs_exons_by_chr[trec.chrom].find(trec.txStart, trec.txEnd):
if calc_exon_overlap(trec.exons, ref.exons) == 0: # no exonic overlap, skip!
continue
Expand All @@ -1238,22 +1238,21 @@ def categorize_incomplete_matches(trec, ref):
# if we haven't exited here, then ISM hit is not found yet
# instead check if it's NIC by intron retention
# but we don't exit here since the next gene could be a ISM hit
if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd:
isoform_hit.str_class = "novel_in_catalog"
isoform_hit.subtype = "mono-exon"
# check for intron retention
if len(ref.junctions) > 0:
for (d,a) in ref.junctions:
if trec.txStart < d < a < trec.txEnd:
isoform_hit.subtype = "mono-exon_by_intron_retention"
break
isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)
get_gene_diff_tss_tts(isoform_hit)
return isoform_hit

# if we get to here, means neither ISM nor NIC, so just add a ref gene and categorize further later
if len(ref.junctions) > 0: # This should always be true since we are checking on multi-exon references
for (d,a) in ref.junctions:
if trec.txStart < d < a < trec.txEnd:
isoform_hit.str_class = "novel_in_catalog"
isoform_hit.subtype = "mono-exon_by_intron_retention"

isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)
get_gene_diff_tss_tts(isoform_hit)

# return isoform_hit

# if we get to here, means not ISM, so just add a ref gene and categorize further later
isoform_hit.genes.append(ref.gene)

if isoform_hit.str_class == "novel_in_catalog":
return isoform_hit
get_gene_diff_tss_tts(isoform_hit)
isoform_hit.genes.sort(key=lambda x: start_ends_by_gene[x]['begin'])
return isoform_hit
Expand Down

0 comments on commit 8fbc951

Please sign in to comment.