From 0f109d83d67c68c6b5f8accbfdc2f36eae0ef515 Mon Sep 17 00:00:00 2001 From: alexpan00 Date: Thu, 30 May 2024 11:07:12 +0200 Subject: [PATCH 1/3] Monoexons that don't overlap a entire junction will be considered genic, not NIC. Only when the monoexon spans a whole intron will be considered NIC by intron retention --- sqanti3_qc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index 593bb87..540a600 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -1239,12 +1239,13 @@ def categorize_incomplete_matches(trec, ref): # instead check if it's NIC by intron retention # but we don't exit here since the next gene could be a ISM hit if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd: - isoform_hit.str_class = "novel_in_catalog" + isoform_hit.str_class = "genic" isoform_hit.subtype = "mono-exon" # check for intron retention if len(ref.junctions) > 0: for (d,a) in ref.junctions: if trec.txStart < d < a < trec.txEnd: + isoform_hit.str_class = "novel_in_catalog" isoform_hit.subtype = "mono-exon_by_intron_retention" break isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount) From 5616b2ef5537e14ca77d3bc466f61e852f439011 Mon Sep 17 00:00:00 2001 From: alexpan00 Date: Thu, 30 May 2024 12:28:29 +0200 Subject: [PATCH 2/3] Classify as genic in the proper function --- sqanti3_qc.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index 540a600..c04b482 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -1238,19 +1238,20 @@ def categorize_incomplete_matches(trec, ref): # if we haven't exited here, then ISM hit is not found yet # instead check if it's NIC by intron retention # but we don't exit here since the next gene could be a ISM hit - if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd: - isoform_hit.str_class = "genic" - isoform_hit.subtype = "mono-exon" - # check for intron retention - if len(ref.junctions) > 0: - for (d,a) in ref.junctions: - if trec.txStart < d < a < trec.txEnd: - isoform_hit.str_class = "novel_in_catalog" - isoform_hit.subtype = "mono-exon_by_intron_retention" - break - isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount) - get_gene_diff_tss_tts(isoform_hit) - return isoform_hit + #if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd: + # isoform_hit.str_class = "genic" + # isoform_hit.subtype = "mono-exon" + # check for intron retention + if len(ref.junctions) > 0: + for (d,a) in ref.junctions: + if trec.txStart < d < a < trec.txEnd: + isoform_hit.str_class = "novel_in_catalog" + isoform_hit.subtype = "mono-exon_by_intron_retention" + + isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount) + get_gene_diff_tss_tts(isoform_hit) + + return isoform_hit # if we get to here, means neither ISM nor NIC, so just add a ref gene and categorize further later isoform_hit.genes.append(ref.gene) From d45de1943b79e3b7f54d44d7603a515dd396d746 Mon Sep 17 00:00:00 2001 From: alexpan00 Date: Thu, 30 May 2024 13:42:51 +0200 Subject: [PATCH 3/3] Fixed a bug in the classification of mono-exon ISM + update documentation --- sqanti3_qc.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index c04b482..9960645 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -1214,7 +1214,7 @@ def categorize_incomplete_matches(trec, ref): if isoform_hit.str_class == "" and trec.chrom in refs_exons_by_chr: # no hits to single exon genes, let's see if it hits multi-exon genes # (1) if it overlaps with a ref exon and is contained in an exon, we call it ISM - # (2) else, if it is completely within a ref gene start-end region, we call it NIC by intron retention + # (2) else, if it spans one or more introns, we call it NIC by intron retention for ref in refs_exons_by_chr[trec.chrom].find(trec.txStart, trec.txEnd): if calc_exon_overlap(trec.exons, ref.exons) == 0: # no exonic overlap, skip! continue @@ -1238,11 +1238,7 @@ def categorize_incomplete_matches(trec, ref): # if we haven't exited here, then ISM hit is not found yet # instead check if it's NIC by intron retention # but we don't exit here since the next gene could be a ISM hit - #if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd: - # isoform_hit.str_class = "genic" - # isoform_hit.subtype = "mono-exon" - # check for intron retention - if len(ref.junctions) > 0: + if len(ref.junctions) > 0: # This should always be true since we are checking on multi-exon references for (d,a) in ref.junctions: if trec.txStart < d < a < trec.txEnd: isoform_hit.str_class = "novel_in_catalog" @@ -1251,11 +1247,12 @@ def categorize_incomplete_matches(trec, ref): isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount) get_gene_diff_tss_tts(isoform_hit) - return isoform_hit + # return isoform_hit - # if we get to here, means neither ISM nor NIC, so just add a ref gene and categorize further later + # if we get to here, means not ISM, so just add a ref gene and categorize further later isoform_hit.genes.append(ref.gene) - + if isoform_hit.str_class == "novel_in_catalog": + return isoform_hit get_gene_diff_tss_tts(isoform_hit) isoform_hit.genes.sort(key=lambda x: start_ends_by_gene[x]['begin']) return isoform_hit