From b2cca9041decada5f62060be44e5f0385fd26b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabi=C3=A1n=20Robledo?= Date: Thu, 29 Aug 2024 13:19:33 +0200 Subject: [PATCH 1/3] Added compatibility for .gz compressed fasta/fastq isoforms to SQANTI3_qc --- sqanti3_qc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index 9a6dac2..069c97b 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -17,6 +17,8 @@ import math import csv import numpy as np +import gzip + from statistics import mean from collections import defaultdict, Counter, namedtuple from collections.abc import Iterable @@ -517,7 +519,13 @@ def correctionPlusORFpred(args, genome_dict): print("Skipping aligning of sequences because GTF file was provided.", file=sys.stdout) ind = 0 - with open(args.isoforms, 'r') as isoforms_gtf: + # gzip.open and open have different default open modes: + # gzip.open uses "rb" (read in binary format) + # open uses "rt" (read in text format) + # This can be solved by making explicit read text mode + # as they share this parameter + open_function = gzip.open if args.isoforms.endswith('.gz') else open + with open_function(args.isoforms, mode='rt') as isoforms_gtf: for line in isoforms_gtf: if line[0] != "#" and len(line.split("\t"))!=9: sys.stderr.write("\nERROR: input isoforms file with not GTF format.\n") @@ -2149,10 +2157,16 @@ def rename_isoform_seqids(input_fasta, force_id_ignore=False): :return: output fasta with the cleaned up sequence ID, is_fusion flag """ type = 'fasta' - with open(input_fasta) as h: + # gzip.open and open have different default open modes: + # gzip.open uses "rb" (read in binary format) + # open uses "rt" (read in text format) + # This can be solved by making explicit the read text mode (which is required + # by SeqIO.parse) + open_function = gzip.open if input_fasta.endswith('.gz') else open + with open_function(input_fasta, mode="rt") as h: if h.readline().startswith('@'): type = 
'fastq' - f = open(input_fasta[:input_fasta.rfind('.')]+'.renamed.fasta', 'w') - for r in SeqIO.parse(open(input_fasta), type): + f = open_function(input_fasta[:input_fasta.rfind('.')]+'.renamed.fasta', mode='wt') + for r in SeqIO.parse(open_function(input_fasta, "rt"), type): m1 = seqid_rex1.match(r.id) m2 = seqid_rex2.match(r.id) m3 = seqid_fusion.match(r.id) From 5f67353732af44421dabe6da663a7835dbf6cbd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabi=C3=A1n=20Robledo?= Date: Wed, 11 Sep 2024 10:03:18 +0200 Subject: [PATCH 2/3] Fix: load .gz files only when the --fasta option is given --- sqanti3_qc.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index 069c97b..abfd1c4 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -519,13 +519,7 @@ def correctionPlusORFpred(args, genome_dict): print("Skipping aligning of sequences because GTF file was provided.", file=sys.stdout) ind = 0 - # gzip.open and open have different default open modes: - # gzip.open uses "rb" (read in binary format) - # open uses "rt" (read in text format) - # This can be solved by making explicit read text mode - # as they share this parameter - open_function = gzip.open if args.isoforms.endswith('.gz') else open - with open_function(args.isoforms, mode='rt') as isoforms_gtf: + with open(args.isoforms) as isoforms_gtf: for line in isoforms_gtf: if line[0] != "#" and len(line.split("\t"))!=9: sys.stderr.write("\nERROR: input isoforms file with not GTF format.\n") From be34b2c5027eb212df0459596cd1e20e148c004d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabi=C3=A1n=20Robledo=20Yag=C3=BCe?= <57529812+Fabian-RY@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:26:46 +0200 Subject: [PATCH 3/3] Fix: do not write the renamed fasta file compressed --- sqanti3_qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqanti3_qc.py b/sqanti3_qc.py index abfd1c4..3e79f11 100755 --- a/sqanti3_qc.py +++ b/sqanti3_qc.py @@ -2159,7 +2159,7 
@@ def rename_isoform_seqids(input_fasta, force_id_ignore=False): open_function = gzip.open if input_fasta.endswith('.gz') else open with open_function(input_fasta, mode="rt") as h: if h.readline().startswith('@'): type = 'fastq' - f = open_function(input_fasta[:input_fasta.rfind('.')]+'.renamed.fasta', mode='wt') + f = open(input_fasta[:input_fasta.rfind('.')]+'.renamed.fasta', mode='wt') for r in SeqIO.parse(open_function(input_fasta, "rt"), type): m1 = seqid_rex1.match(r.id) m2 = seqid_rex2.match(r.id)