From 0f109d83d67c68c6b5f8accbfdc2f36eae0ef515 Mon Sep 17 00:00:00 2001
From: alexpan00 <alexpan0098@gmail.com>
Date: Thu, 30 May 2024 11:07:12 +0200
Subject: [PATCH 1/3] Monoexons that don't overlap a entire junction will be
 considered genic, not NIC. Only when the monoexon spans a whole intron will
 be considered NIC by intron retention

---
 sqanti3_qc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sqanti3_qc.py b/sqanti3_qc.py
index 593bb87..540a600 100755
--- a/sqanti3_qc.py
+++ b/sqanti3_qc.py
@@ -1239,12 +1239,13 @@ def categorize_incomplete_matches(trec, ref):
                 # instead check if it's NIC by intron retention
                 # but we don't exit here since the next gene could be a ISM hit
                 if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd:
-                    isoform_hit.str_class = "novel_in_catalog"
+                    isoform_hit.str_class = "genic"
                     isoform_hit.subtype = "mono-exon"
                     # check for intron retention
                     if len(ref.junctions) > 0:
                         for (d,a) in ref.junctions:
                             if trec.txStart < d < a < trec.txEnd:
+                                isoform_hit.str_class = "novel_in_catalog"
                                 isoform_hit.subtype = "mono-exon_by_intron_retention"
                                 break
                     isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)

From 5616b2ef5537e14ca77d3bc466f61e852f439011 Mon Sep 17 00:00:00 2001
From: alexpan00 <alexpan0098@gmail.com>
Date: Thu, 30 May 2024 12:28:29 +0200
Subject: [PATCH 2/3] Classify as genic in the proper function

---
 sqanti3_qc.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/sqanti3_qc.py b/sqanti3_qc.py
index 540a600..c04b482 100755
--- a/sqanti3_qc.py
+++ b/sqanti3_qc.py
@@ -1238,19 +1238,20 @@ def categorize_incomplete_matches(trec, ref):
                 # if we haven't exited here, then ISM hit is not found yet
                 # instead check if it's NIC by intron retention
                 # but we don't exit here since the next gene could be a ISM hit
-                if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd:
-                    isoform_hit.str_class = "genic"
-                    isoform_hit.subtype = "mono-exon"
-                    # check for intron retention
-                    if len(ref.junctions) > 0:
-                        for (d,a) in ref.junctions:
-                            if trec.txStart < d < a < trec.txEnd:
-                                isoform_hit.str_class = "novel_in_catalog"
-                                isoform_hit.subtype = "mono-exon_by_intron_retention"
-                                break
-                    isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)
-                    get_gene_diff_tss_tts(isoform_hit)
-                    return isoform_hit
+                #if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd:
+                # isoform_hit.str_class = "genic"
+                # isoform_hit.subtype = "mono-exon"
+                # check for intron retention
+                if len(ref.junctions) > 0:
+                    for (d,a) in ref.junctions:
+                        if trec.txStart < d < a < trec.txEnd:
+                            isoform_hit.str_class = "novel_in_catalog"
+                            isoform_hit.subtype = "mono-exon_by_intron_retention"
+                            
+                            isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)
+                            get_gene_diff_tss_tts(isoform_hit)
+
+                            return isoform_hit
 
                 # if we get to here, means neither ISM nor NIC, so just add a ref gene and categorize further later
                 isoform_hit.genes.append(ref.gene)

From d45de1943b79e3b7f54d44d7603a515dd396d746 Mon Sep 17 00:00:00 2001
From: alexpan00 <alexpan0098@gmail.com>
Date: Thu, 30 May 2024 13:42:51 +0200
Subject: [PATCH 3/3] Fixed a bug in the classification of mono-exon ISM +
 update documentation

---
 sqanti3_qc.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/sqanti3_qc.py b/sqanti3_qc.py
index c04b482..9960645 100755
--- a/sqanti3_qc.py
+++ b/sqanti3_qc.py
@@ -1214,7 +1214,7 @@ def categorize_incomplete_matches(trec, ref):
         if isoform_hit.str_class == "" and trec.chrom in refs_exons_by_chr:
             # no hits to single exon genes, let's see if it hits multi-exon genes
             # (1) if it overlaps with a ref exon and is contained in an exon, we call it ISM
-            # (2) else, if it is completely within a ref gene start-end region, we call it NIC by intron retention
+            # (2) else, if it spans one or more introns, we call it NIC by intron retention
             for ref in refs_exons_by_chr[trec.chrom].find(trec.txStart, trec.txEnd):
                 if calc_exon_overlap(trec.exons, ref.exons) == 0:   # no exonic overlap, skip!
                     continue
@@ -1238,11 +1238,7 @@ def categorize_incomplete_matches(trec, ref):
                 # if we haven't exited here, then ISM hit is not found yet
                 # instead check if it's NIC by intron retention
                 # but we don't exit here since the next gene could be a ISM hit
-                #if ref.txStart <= trec.txStart < trec.txEnd <= ref.txEnd:
-                # isoform_hit.str_class = "genic"
-                # isoform_hit.subtype = "mono-exon"
-                # check for intron retention
-                if len(ref.junctions) > 0:
+                if len(ref.junctions) > 0: # This should always be true since we are checking on multi-exon references
                     for (d,a) in ref.junctions:
                         if trec.txStart < d < a < trec.txEnd:
                             isoform_hit.str_class = "novel_in_catalog"
@@ -1251,11 +1247,12 @@ def categorize_incomplete_matches(trec, ref):
                             isoform_hit.modify("novel", ref.gene, 'NA', 'NA', ref.length, ref.exonCount)
                             get_gene_diff_tss_tts(isoform_hit)
 
-                            return isoform_hit
+                            # return isoform_hit
 
-                # if we get to here, means neither ISM nor NIC, so just add a ref gene and categorize further later
+                # if we get to here, means not ISM, so just add a ref gene and categorize further later
                 isoform_hit.genes.append(ref.gene)
-
+            if isoform_hit.str_class == "novel_in_catalog":
+                return isoform_hit
     get_gene_diff_tss_tts(isoform_hit)
     isoform_hit.genes.sort(key=lambda x: start_ends_by_gene[x]['begin'])
     return isoform_hit