Skip to content

Commit

Permalink
Merge pull request #280 from ConesaLab/min_ref_len
Browse files Browse the repository at this point in the history
Recover isoforms shorter than min_ref_len.
  • Loading branch information
lkondratova authored Mar 29, 2024
2 parents 79893c7 + 2dbc584 commit c9cb160
Showing 1 changed file with 25 additions and 3 deletions.
28 changes: 25 additions & 3 deletions sqanti3_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,6 @@ def write_collapsed_GFF_with_CDS(isoforms_info, input_gff, output_gff):
reader = collapseGFFReader(input_gff)
for r in reader:
r.geneid = isoforms_info[r.seqid].geneName() # set the gene name

s = isoforms_info[r.seqid].CDS_genomic_start # could be 'NA'
e = isoforms_info[r.seqid].CDS_genomic_end # could be 'NA'
r.cds_exons = []
Expand Down Expand Up @@ -444,6 +443,12 @@ def get_class_junc_filenames(args, dir=None):
outputJuncPath = outputPathPrefix + "_junctions.txt"
return outputClassPath, outputJuncPath

def get_omitted_name(args, dir=None):
d = dir if dir is not None else args.dir
corrPathPrefix = os.path.join(d, args.output)
omitted_name = corrPathPrefix +"_omitted_due_to_min_ref_len.txt"
return omitted_name

def correctionPlusORFpred(args, genome_dict):
"""
Use the reference genome to correct the sequences (unless a pre-corrected GTF is given)
Expand Down Expand Up @@ -655,7 +660,6 @@ def reference_parser(args, genome_chroms):
known_5_3_by_gene = defaultdict(lambda: {'begin':set(), 'end': set()})

for r in genePredReader(referenceFiles):
if r.length < args.min_ref_len and not args.is_fusion: continue # ignore miRNAs
if r.exonCount == 1:
refs_1exon_by_chr[r.chrom].insert(r.txStart, r.txEnd, r)
known_5_3_by_gene[r.gene]['begin'].add(r.txStart)
Expand Down Expand Up @@ -2045,6 +2049,25 @@ def run(args):
#### Printing output file:
print("**** Writing output files....", file=sys.stderr)

#write omitted isoforms if requested minimum reference length is more than 0
if args.min_ref_len > 0 and not args.is_fusion:
omitted_name = get_omitted_name(args)
omitted_iso = {}
for key in isoforms_info:
if not isoforms_info[key].refLen == 'NA':
if int(isoforms_info[key].refLen) <= int(args.min_ref_len):
omitted_iso[key] = isoforms_info[key]
for key in omitted_iso:
del isoforms_info[key]
omitted_keys = list(omitted_iso.keys())
print(type(omitted_keys))
omitted_keys.sort(key=lambda x: (omitted_iso[x].chrom,omitted_iso[x].id))
print('Type omitted keys ', type(omitted_keys))
with open(omitted_name, 'w') as h:
fout_omitted = DictWriter(h, fieldnames=fields_class_cur, delimiter='\t')
fout_omitted.writeheader()
for key in omitted_keys:
fout_omitted.writerow(omitted_iso[key].as_dict())
# sort isoform keys
iso_keys = list(isoforms_info.keys())
iso_keys.sort(key=lambda x: (isoforms_info[x].chrom,isoforms_info[x].id))
Expand All @@ -2065,7 +2088,6 @@ def run(args):
else:
r['RTS_junction'] = 'FALSE'
fout_junc.writerow(r)

#isoform hits to file if requested
if args.isoform_hits:
fields_hits =['Isoform', 'Isoform_length', 'Isoform_exon_number', 'Hit', 'Hit_length',
Expand Down

0 comments on commit c9cb160

Please sign in to comment.