".format("\t".join(self._fields))
- def __bytes__(self) -> bytes:
- return b'\t'.join(self._fields)
+ def __str__(self) -> str:
+ return "\t".join(self._fields)
def _parse_attribute(self) -> None:
- for field in self._fields[8].rstrip(b';\n').split(b';'):
+ for field in self._fields[8].rstrip(";\n").split(";"):
key, *value = field.strip().split()
- self._attribute[key] = b' '.join(value).strip(b'"')
+ self._attribute[key] = " ".join(value).strip('"')
def __hash__(self) -> int:
"""concatenate strand, start, end, and chromosome and hash the resulting bytes"""
- return hash(self._fields[6] + self._fields[3] + self._fields[4] + self._fields[0])
+ return hash(
+ self._fields[6] + self._fields[3] + self._fields[4] + self._fields[0]
+ )
@property
- def seqname(self) -> bytes:
+ def seqname(self) -> str:
return self._fields[0]
@property
- def chromosome(self) -> bytes:
+ def chromosome(self) -> str:
return self._fields[0] # synonym for seqname
@property
- def source(self) -> bytes:
+ def source(self) -> str:
return self._fields[1]
@property
- def feature(self) -> bytes:
+ def feature(self) -> str:
return self._fields[2]
@property
@@ -63,15 +65,15 @@ def end(self) -> int:
return int(self._fields[4])
@property
- def score(self) -> bytes:
+ def score(self) -> str:
return self._fields[5]
@property
- def strand(self) -> bytes:
+ def strand(self) -> str:
return self._fields[6]
@property
- def frame(self) -> bytes:
+ def frame(self) -> str:
return self._fields[7]
@property
@@ -95,36 +97,38 @@ def attribute(self, item):
self._parse_attribute()
return self._attribute[item]
else:
- raise KeyError('%s is not a stored attribute of this gtf record' %
- repr(item))
+ raise KeyError(
+ "%s is not a stored attribute of this gtf record" % repr(item)
+ )
@property
def integer_gene_id(self) -> int:
"""ENSEMBL gene id without the organism specific prefix, encoded as an integer"""
- return int(self.attribute(b'gene_id').split(b'.')[0]
- .translate(None, self._del_letters))
+ # str.translate takes a single mapping, not the bytes-style (None, deletechars);
+ # assumes self._del_letters is now a str of characters to delete
+ return int(
+ self.attribute("gene_id")
+ .split(".")[0]
+ .translate(str.maketrans("", "", self._del_letters))
+ )
@property
- def organism_prefix(self) -> bytes:
+ def organism_prefix(self) -> str:
"""Organism prefix of ENSEMBL gene id (e.g. ENSG for human, ENSMUSG)"""
- return self.attribute(b'gene_id').translate(None, self._del_non_letters)
+ # as above, build a deletion table for str.translate
+ return self.attribute("gene_id").translate(
+ str.maketrans("", "", self._del_non_letters)
+ )
@property
- def string_gene_id(self) -> bytes:
+ def string_gene_id(self) -> str:
"""ENSEMBL gene id, including organism prefix."""
- return self.attribute(b'gene_id')
+ return self.attribute("gene_id")
@staticmethod
- def int2str_gene_id(integer_id: int, organism_prefix: bytes) -> bytes:
+ def int2str_gene_id(integer_id: int, organism_prefix: str) -> str:
"""
converts an integer gene id (suffix) to a string gene id (including organism-
specific prefix)
- :param organism_prefix: bytes
+ :param organism_prefix: str
:param integer_id: int
"""
- bytestring = str(integer_id).encode()
+ bytestring = str(integer_id)
diff = 11 - len(bytestring)
- return organism_prefix + (b'0' * diff) + bytestring
+ return organism_prefix + ("0" * diff) + bytestring
def __eq__(self, other):
"""equivalent to testing if start, end, chrom and strand are the same."""
@@ -163,7 +167,9 @@ def __init__(self, gtf: str, max_transcript_length=1000):
transcript sizes indicates that the majority of non-erroneous fragments of
mRNA molecules should align within this region.
"""
- self._chromosomes_to_genes = self.construct_translator(gtf, max_transcript_length)
+ self._chromosomes_to_genes = self.construct_translator(
+ gtf, max_transcript_length
+ )
@staticmethod
def iterate_adjusted_exons(exons, strand, max_transcript_length):
@@ -180,7 +186,7 @@ def iterate_adjusted_exons(exons, strand, max_transcript_length):
start, end = int(exon[3]), int(exon[4])
size = end - start
if size >= max_transcript_length:
- if strand == '+':
+ if strand == "+":
yield end - max_transcript_length, end
else:
yield start, start + max_transcript_length
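+ # note: the retained window appears to hug the transcript's 3' end for
+ # 3'-biased protocols: coordinate `end` on the '+' strand, `start` on '-'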
@@ -255,18 +261,23 @@ def construct_translator(self, gtf, max_transcript_length):
chromosome -> strand -> position which returns a gene.
"""
results_dictionary = defaultdict(dict)
- for (tx_chromosome, tx_strand, gene_id), exons in Reader(gtf).iter_transcripts():
+ for (tx_chromosome, tx_strand, gene_id), exons in Reader(
+ gtf
+ ).iter_transcripts():
for start, end in self.iterate_adjusted_exons(
- exons, tx_strand, max_transcript_length):
+ exons, tx_strand, max_transcript_length
+ ):
if start == end:
continue # zero-length exons apparently occur in the gtf
try:
results_dictionary[tx_chromosome][tx_strand].addi(
- start, end, gene_id)
+ start, end, gene_id
+ )
except KeyError:
results_dictionary[tx_chromosome][tx_strand] = IntervalTree()
results_dictionary[tx_chromosome][tx_strand].addi(
- start, end, gene_id)
+ start, end, gene_id
+ )
return dict(results_dictionary)
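+ # hedged usage sketch (file name and coordinates are illustrative):
+ # translator = GeneIntervals("annotations.gtf", max_transcript_length=1000)
+ # gene_id = translator.translate("chr19", "+", 582951)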
def translate(self, chromosome, strand, pos):
@@ -275,16 +286,17 @@ def translate(self, chromosome, strand, pos):
Uses the IntervalTree data structure to rapidly search for the corresponding
identifier.
- :param bytes chromosome: chromosome for this alignment
- :param bytes strand: strand for this alignment (one of ['+', '-'])
+ :param str chromosome: chromosome for this alignment
+ :param str strand: strand for this alignment (one of ['+', '-'])
:param int pos: position of the alignment within the chromosome
:return int|None: Returns either an integer gene_id if a unique gene was found
at the specified position, or None otherwise
"""
# todo remove duplicate exons during construction to save time
try:
- result = set(x.data for x in
- self._chromosomes_to_genes[chromosome][strand][pos])
+ result = set(
+ x.data for x in self._chromosomes_to_genes[chromosome][strand][pos]
+ )
if len(result) == 1:
return first(result) # just right
else:
@@ -299,49 +311,71 @@ class Reader(reader.Reader):
methods.
:method __iter__: Iterator over all non-header records in gtf; yields Record objects.
- :method iter_genes: Iterator over all genes in gtf; yields Gene objects.
+ :method iter_transcripts: Iterate over transcripts in a gtf file, yielding each transcript's
+ chromosome, strand, gene_id, and a list of tab-split exon records
"""
def __iter__(self):
"""return an iterator over all non-header records in gtf"""
- hook = fileinput.hook_compressed
- with fileinput.input(self._files, openhook=hook, mode='r') as f:
+
+ # fixme: workaround for https://bugs.python.org/issue36865 (as of 2019-10-24)
+ # force to "rt" instead of using `mode` being passed
+ # this will let us avoid using `.decode()` all over the place
+ def hook_compressed(filename, mode):
+ ext = os.path.splitext(filename)[1]
+ if ext == ".gz":
+ import gzip
+
+ return gzip.open(filename, "rt")
+ elif ext == ".bz2":
+ import bz2
+
+ return bz2.open(filename, "rt")  # BZ2File rejects text modes like "rt"
+ else:
+ return open(filename, mode)
+
+ with fileinput.input(self._files, openhook=hook_compressed, mode="r") as f:
# get rid of header lines
file_iterator = iter(f)
first_record = next(file_iterator)
- while first_record.startswith('#'):
+ while first_record.startswith("#"):
first_record = next(file_iterator)
- yield first_record.split('\t') # avoid loss of first non-comment line
+ # avoid loss of first non-comment line
+ yield first_record.split("\t")
for record in file_iterator: # now, run to exhaustion
- yield record.split('\t')
+ yield record.split("\t")
@staticmethod
def strip_gene_num(attribute_str):
try:
- gene_start = attribute_str.index('gene_id')
+ gene_start = attribute_str.index("gene_id")
except ValueError:
raise ValueError(
- 'Gene_id field is missing in annotations file: {}'.format(attribute_str))
+ "Gene_id field is missing in annotations file: {}".format(attribute_str)
+ )
try:
gene_end = attribute_str.index('";', gene_start)
except ValueError:
raise ValueError(
'no "; in gene_id attribute, gtf file might be corrupted: {}'.format(
- attribute_str))
+ attribute_str
+ )
+ )
try:
- id_start = attribute_str.index('0', gene_start)
+ id_start = attribute_str.index("0", gene_start)
except ValueError:
raise ValueError(
- 'Corrupt gene_id field in annotations file - {}'.format(attribute_str))
+ "Corrupt gene_id field in annotations file - {}".format(attribute_str)
+ )
# ignore the gene version, which is located after a decimal in some gtf files
try:
- gene_end = attribute_str.index('.', id_start, gene_end)
+ gene_end = attribute_str.index(".", id_start, gene_end)
except ValueError:
pass
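+ # hedged walk-through: in '... gene_id "ENSG00000243485.5"; ...', id_start lands
+ # on the first "0" and gene_end moves back to the ".", isolating "00000243485"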
@@ -357,7 +391,7 @@ def iter_transcripts(self):
record = next(iterator)
# skip to first transcript record and store chromosome and strand
- while record[2] != 'transcript':
+ while record[2] != "transcript":
record = next(iterator)
transcript_chromosome = record[0]
transcript_strand = record[6]
@@ -384,13 +418,15 @@ def iter_transcripts(self):
exons = []
for record in iterator:
- if record[2] == 'exon':
+ if record[2] == "exon":
exons.append(record)
- elif record[2] == 'transcript':
+ elif record[2] == "transcript":
# we want exons in inverse order
exons = exons[::-1]
yield (
- (transcript_chromosome, transcript_strand, transcript_gene_id), exons)
+ (transcript_chromosome, transcript_strand, transcript_gene_id),
+ exons,
+ )
exons = []
transcript_chromosome = record[0]
transcript_strand = record[6]
@@ -411,23 +447,24 @@ def create_phix_annotation(phix_fasta):
"""
import numpy as np
- with open(phix_fasta, 'r') as f:
+ with open(phix_fasta, "r") as f:
header = f.readline() # phiX has only one chromosome
data = f.readlines()
# concatenate data
- contig = ''
+ contig = ""
for line in data:
contig += line.strip()
# get chromosome
- chromosome = header.split()[0].strip('>')
- source = 'seqc'
- score = '.'
- frame = '.'
+ chromosome = header.split()[0].strip(">")
+ source = "seqc"
+ score = "."
+ frame = "."
gene_meta = 'gene_id "PHIXG00{NUM}"; gene_name "PHIX{NAME!s}";'
- exon_meta = ('gene_id "PHIXG00{NUM}"; gene_name "PHIX{NAME!s}"; '
- 'exon_id "PHIX{NAME!s}";')
+ exon_meta = (
+ 'gene_id "PHIXG00{NUM}"; gene_name "PHIX{NAME!s}"; ' 'exon_id "PHIX{NAME!s}";'
+ )
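+ # hedged example of the rendered template for i == 0:
+ # gene_meta.format(NUM=str(1) * 9, NAME=1)
+ # -> 'gene_id "PHIXG00111111111"; gene_name "PHIX1";'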
# SEQC truncates genes at 1000b from the end of each transcript. However, phiX DNA
# that is spiked into an experiment is not subject to library construction. Thus,
@@ -438,24 +475,60 @@ def create_phix_annotation(phix_fasta):
transcript_starts = np.arange(length // 1000 + 1) * 1000
transcript_ends = np.array([min(s + 1000, length) for s in transcript_starts])
- phix_gtf = phix_fasta.replace('.fa', '.gtf')
+ phix_gtf = phix_fasta.replace(".fa", ".gtf")
- with open(phix_gtf, 'w') as f:
+ with open(phix_gtf, "w") as f:
for i, (s, e) in enumerate(zip(transcript_starts, transcript_ends)):
# add forward strand gene
- gene = [chromosome, source, 'gene', str(s), str(e), score, '+', frame,
- gene_meta.format(NUM=str(i + 1) * 9, NAME=i + 1)]
- f.write('\t'.join(gene) + '\n')
- exon = [chromosome, source, 'exon', str(s), str(e), score, '+', frame,
- exon_meta.format(NUM=str(i + 1) * 9, NAME=i + 1)]
- f.write('\t'.join(exon) + '\n')
+ gene = [
+ chromosome,
+ source,
+ "gene",
+ str(s),
+ str(e),
+ score,
+ "+",
+ frame,
+ gene_meta.format(NUM=str(i + 1) * 9, NAME=i + 1),
+ ]
+ f.write("\t".join(gene) + "\n")
+ exon = [
+ chromosome,
+ source,
+ "exon",
+ str(s),
+ str(e),
+ score,
+ "+",
+ frame,
+ exon_meta.format(NUM=str(i + 1) * 9, NAME=i + 1),
+ ]
+ f.write("\t".join(exon) + "\n")
# add reverse strand gene
- gene = [chromosome, source, 'gene', str(s), str(e), score, '-', frame,
- gene_meta.format(NUM=str(i + 1) * 9, NAME=i + 1)]
- f.write('\t'.join(gene) + '\n')
- exon = [chromosome, source, 'exon', str(s), str(e), score, '-', frame,
- exon_meta.format(NUM=str(i + 1) * 9, NAME=i + 1)]
- f.write('\t'.join(exon) + '\n')
+ gene = [
+ chromosome,
+ source,
+ "gene",
+ str(s),
+ str(e),
+ score,
+ "-",
+ frame,
+ gene_meta.format(NUM=str(i + 1) * 9, NAME=i + 1),
+ ]
+ f.write("\t".join(gene) + "\n")
+ exon = [
+ chromosome,
+ source,
+ "exon",
+ str(s),
+ str(e),
+ score,
+ "-",
+ frame,
+ exon_meta.format(NUM=str(i + 1) * 9, NAME=i + 1),
+ ]
+ f.write("\t".join(exon) + "\n")
def create_gene_id_to_official_gene_symbol_map(gtf: str):
@@ -466,16 +539,17 @@ def create_gene_id_to_official_gene_symbol_map(gtf: str):
:param gtf: str, filename of gtf file from which to create the map.
"""
pattern = re.compile(
- r'(^.*?gene_id "[^0-9]*)([0-9]*)(\.?.*?gene_name ")(.*?)(".*?$)')
+ r'(^.*?gene_id "[^0-9]*)([0-9]*)(\.?.*?gene_name ")(.*?)(".*?$)'
+ )
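+ # hedged example: on a gene line containing
+ # ... gene_id "ENSG00000243485.5"; ... gene_name "MIR1302-2HG"; ...
+ # group(2) captures "00000243485" and group(4) captures "MIR1302-2HG"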
gene_id_map = defaultdict(set)
- with open(gtf, 'r') as f:
+ with open(gtf, "r") as f:
for line in f:
# Skip comment lines
- if line.startswith('#'):
+ if line.startswith("#"):
continue
- fields = line.split('\t') # speed-up, only run regex on gene lines
- if fields[2] != 'gene':
+ fields = line.split("\t") # speed-up, only run regex on gene lines
+ if fields[2] != "gene":
continue
match = re.match(pattern, line) # run regex
@@ -493,4 +567,5 @@ def ensembl_gene_id_to_official_gene_symbol(ids, gene_id_map):
objects, it is much faster to only construct the map a single time.
:return list: converted ids
"""
- return ['-'.join(gene_id_map[i]) for i in ids]
+ return ["-".join(gene_id_map[i]) for i in ids]
+
diff --git a/src/seqc/sequence/index.py b/src/seqc/sequence/index.py
index 23b2c72..b3fa7b7 100644
--- a/src/seqc/sequence/index.py
+++ b/src/seqc/sequence/index.py
@@ -6,11 +6,11 @@
from seqc.sequence import gtf
from seqc.alignment import star
from seqc.io import S3
+from seqc import log
class Index:
-
- def __init__(self, organism, additional_id_types=None, index_folder_name='.'):
+ def __init__(self, organism, additional_id_types=None, index_folder_name="."):
"""Create an Index object for organism, requiring that a valid annotation have
both an ENSEMBL id and at least one additional id provided by an
additional_id_field (if provided)
@@ -42,20 +42,25 @@ def __init__(self, organism, additional_id_types=None, index_folder_name='.'):
# check organism input
if not organism:
raise ValueError(
- 'organism must be formatted as genus_species in all lower case')
+ "organism must be formatted as genus_species in all lower case"
+ )
elif not isinstance(organism, str):
- raise TypeError('organism must be a string')
- elif any([('_' not in organism) or (organism.lower() != organism)]):
+ raise TypeError("organism must be a string")
+ elif any([("_" not in organism) or (organism.lower() != organism)]):
raise ValueError(
- 'organism must be formatted as genus_species in all lower case')
+ "organism must be formatted as genus_species in all lower case"
+ )
self._organism = organism
# check additional_id_fields argument
- if not (isinstance(additional_id_types, (list, tuple, np.ndarray)) or
- additional_id_types is None):
+ if not (
+ isinstance(additional_id_types, (list, tuple, np.ndarray))
+ or additional_id_types is None
+ ):
raise TypeError(
- 'if provided, additional id fields must be a list, tuple, or numpy '
- 'array')
+ "if provided, additional id fields must be a list, tuple, or numpy "
+ "array"
+ )
if additional_id_types:
self._additional_id_types = additional_id_types
else:
@@ -64,6 +69,11 @@ def __init__(self, organism, additional_id_types=None, index_folder_name='.'):
# todo type checks
self.index_folder_name = index_folder_name
+ if self.index_folder_name != ".":
+ os.makedirs(
+ os.path.join(self.index_folder_name, self.organism), exist_ok=True
+ )
+
@property
def organism(self) -> str:
return self._organism
@@ -77,20 +87,22 @@ def _converter_xml(self) -> str:
"""Generate The xml query to download an ENSEMBL BioMART file mapping
ENSEMBL gene ids to any identifiers implemented in self.additional_id_fields
"""
- attributes = ''.join(
- '<Attribute name = "%s" />' % f for f in self.additional_id_types)
- genus, species = self.organism.split('_')
+ attributes = "".join(
+ '<Attribute name = "%s" />' % f for f in self.additional_id_types
+ )
+ genus, species = self.organism.split("_")
genome_name = genus[0] + species
xml = (
'<?xml version="1.0" encoding="UTF-8"?>'
- '<!DOCTYPE Query>'
+ "<!DOCTYPE Query>"
'<Query virtualSchemaName = "default" formatter = "CSV" header = "0" '
'uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >'
'<Dataset name = "{genome}_gene_ensembl" interface = "default" >'
'<Attribute name = "ensembl_gene_id" />'
- '{attr}'
- '</Dataset></Query>'
- '\''.format(genome=genome_name, attr=attributes))
+ "{attr}"
+ "</Dataset></Query>"
+ "'".format(genome=genome_name, attr=attributes)
+ )
return xml
@staticmethod
@@ -102,20 +114,23 @@ def _identify_genome_file(files: [str]) -> str:
:param files: list of fasta files obtained from the ENSEMBL ftp server
:return str: name of the correct genome file"""
for f in files:
- if '.dna_sm.primary_assembly' in f:
+ if ".dna_sm.primary_assembly" in f:
return f
for f in files:
- if f.endswith('.dna_sm.toplevel.fa.gz'):
+ if f.endswith(".dna_sm.toplevel.fa.gz"):
return f
- raise FileNotFoundError('could not find the correct fasta file in %r' % files)
+ raise FileNotFoundError("could not find the correct fasta file in %r" % files)
@staticmethod
- def _identify_gtf_file(files: [str], newest: int) -> str:
+ def _identify_gtf_file(files: [str], release_num: int) -> str:
"""Identify and return the basic gtf file from a list of annotation files"""
+ search_pattern = ".%d.chr.gtf.gz" % release_num
for f in files:
- if f.endswith('.%d.gtf.gz' % newest):
+ if f.endswith(search_pattern):
return f
+ raise FileNotFoundError("Unable to find *.{}".format(search_pattern))
+
@staticmethod
def _identify_newest_release(open_ftp: FTP) -> int:
"""Identify the most recent genome release given an open link to ftp.ensembl.org
@@ -125,34 +140,56 @@ def _identify_newest_release(open_ftp: FTP) -> int:
:param FTP open_ftp: open FTP link to ftp.ensembl.org
"""
- open_ftp.cwd('/pub')
- releases = [f for f in open_ftp.nlst() if 'release' in f]
- newest = max(int(r[r.find('-') + 1:]) for r in releases)
+ open_ftp.cwd("/pub")
+ releases = [f for f in open_ftp.nlst() if f.startswith("release-")]
+ newest = max(int(r[r.find("-") + 1 :]) for r in releases)
+
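+ # assumption: step one release back from the latest listing, since the newest
+ # release may not yet be fully mirrored on the ftp server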
return newest - 1
- def _download_fasta_file(self, ftp: FTP, download_name: str) -> None:
+ def _download_fasta_file(
+ self, ftp: FTP, download_name: str, ensemble_release: int
+ ) -> None:
"""download the fasta file for cls.organism from ftp, an open Ensembl FTP server
:param FTP ftp: open FTP link to ENSEMBL
:param str download_name: filename for downloaded fasta file
"""
- newest = self._identify_newest_release(ftp)
- ftp.cwd('/pub/release-%d/fasta/%s/dna' % (newest, self.organism))
+
+ release_num = (
+ ensemble_release if ensemble_release else self._identify_newest_release(ftp)
+ )
+ work_dir = "/pub/release-%d/fasta/%s/dna" % (release_num, self.organism)
+ ftp.cwd(work_dir)
ensembl_fasta_filename = self._identify_genome_file(ftp.nlst())
- with open(download_name, 'wb') as f:
- ftp.retrbinary('RETR %s' % ensembl_fasta_filename, f.write)
- def _download_gtf_file(self, ftp, download_name) -> None:
+ log.info("FASTA Ensemble Release {}".format(release_num))
+ log.info("ftp://{}{}/{}".format(ftp.host, work_dir, ensembl_fasta_filename))
+
+ with open(download_name, "wb") as f:
+ ftp.retrbinary("RETR %s" % ensembl_fasta_filename, f.write)
+
+ def _download_gtf_file(
+ self, ftp, download_name: str, ensemble_release: int
+ ) -> None:
"""download the gtf file for cls.organism from ftp, an open Ensembl FTP server
:param FTP ftp: open FTP link to ENSEMBL
:param str download_name: filename for downloaded gtf file
"""
- newest = self._identify_newest_release(ftp)
- ftp.cwd('/pub/release-%d/gtf/%s/' % (newest, self.organism))
- ensembl_gtf_filename = self._identify_gtf_file(ftp.nlst(), newest)
- with open(download_name, 'wb') as f:
- ftp.retrbinary('RETR %s' % ensembl_gtf_filename, f.write)
+ release_num = (
+ ensemble_release if ensemble_release else self._identify_newest_release(ftp)
+ )
+ work_dir = "/pub/release-%d/gtf/%s/" % (release_num, self.organism)
+ ftp.cwd(work_dir)
+ ensembl_gtf_filename = self._identify_gtf_file(ftp.nlst(), release_num)
+
+ log.info("GTF Ensemble Release {}".format(release_num))
+ log.info(
+ "ftp://{}{}".format(ftp.host, os.path.join(work_dir, ensembl_gtf_filename))
+ )
+
+ with open(download_name, "wb") as f:
+ ftp.retrbinary("RETR %s" % ensembl_gtf_filename, f.write)
# todo remove wget dependency
def _download_conversion_file(self, download_name: str) -> None:
@@ -161,44 +198,54 @@ def _download_conversion_file(self, download_name: str) -> None:
:param download_name: name for the downloaded file
"""
- cmd = ('wget -O %s \'http://www.ensembl.org/biomart/martservice?query=%s > '
- '/dev/null 2>&1' % (download_name, self._converter_xml))
+ cmd = (
+ "wget -O %s 'http://www.ensembl.org/biomart/martservice?query=%s > "
+ "/dev/null 2>&1" % (download_name, self._converter_xml)
+ )
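+ # NB: the closing quote for the URL is supplied by the trailing "'" emitted
+ # at the end of self._converter_xml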
err = check_call(cmd, shell=True)
if err:
- raise ChildProcessError('conversion file download failed: %s' % err)
+ raise ChildProcessError("conversion file download failed: %s" % err)
def _download_ensembl_files(
- self, fasta_name: str=None, gtf_name: str=None,
- conversion_name: str=None) -> None:
+ self,
+ ensemble_release: int,
+ fasta_name: str = None,
+ gtf_name: str = None,
+ conversion_name: str = None,
+ ) -> None:
"""download the fasta, gtf, and id_mapping file for the organism defined in
cls.organism
+ :param ensemble_release: Ensemble release number
:param fasta_name: name for the downloaded fasta file
:param gtf_name: name for the downloaded gtf file
:param conversion_name: name for the downloaded conversion file
"""
if fasta_name is None:
- fasta_name = '%s/%s.fa.gz' % (self.index_folder_name, self.organism)
+ fasta_name = "%s/%s.fa.gz" % (self.index_folder_name, self.organism)
if gtf_name is None:
- gtf_name = '%s/%s.gtf.gz' % (self.index_folder_name, self.organism)
+ gtf_name = "%s/%s.gtf.gz" % (self.index_folder_name, self.organism)
if conversion_name is None:
- conversion_name = '%s/%s_ids.csv' % (self.index_folder_name, self.organism)
+ conversion_name = "%s/%s_ids.csv" % (self.index_folder_name, self.organism)
- with FTP(host='ftp.ensembl.org') as ftp:
+ ensemble_ftp_address = "ftp.ensembl.org"
+
+ with FTP(host=ensemble_ftp_address) as ftp:
ftp.login()
- self._download_fasta_file(ftp, fasta_name)
- self._download_gtf_file(ftp, gtf_name)
+ self._download_fasta_file(ftp, fasta_name, ensemble_release)
+ self._download_gtf_file(ftp, gtf_name, ensemble_release)
self._download_conversion_file(conversion_name)
def _subset_genes(
- self,
- conversion_file: str=None,
- gtf_file: str=None,
- truncated_annotation: str=None,
- valid_biotypes=(b'protein_coding', b'lincRNA')):
+ self,
+ conversion_file: str = None,
+ gtf_file: str = None,
+ truncated_annotation: str = None,
+ valid_biotypes=("protein_coding", "lincRNA"),
+ ):
"""
Remove any annotation from the annotation_file that is not also defined by at
least one additional identifier present in the conversion file.
@@ -219,47 +266,66 @@ def _subset_genes(
:param conversion_file: file location of the conversion file
:param gtf_file: file location of the annotation file
:param truncated_annotation: name for the generated output file
- :param list(bytes) valid_biotypes: only accept genes of this biotype.
+ :param list(str) valid_biotypes: only accept genes of this biotype.
"""
- if not (self.additional_id_types or valid_biotypes): # nothing to be done
- return
-
- # change to set for efficiency
- if all(isinstance(t, str) for t in valid_biotypes):
- valid_biotypes = set((t.encode() for t in valid_biotypes))
- elif all(isinstance(t, bytes) for t in valid_biotypes):
- valid_biotypes = set(valid_biotypes)
- else:
- raise TypeError('mixed-type biotypes detected. Please pass valid_biotypes '
- 'as strings or bytes objects (but not both).')
if gtf_file is None:
- gtf_file = '%s/%s.gtf.gz' % (self.index_folder_name, self.organism)
+ gtf_file = os.path.join(
+ self.index_folder_name, "{}.gtf.gz".format(self.organism)
+ )
if conversion_file is None:
- conversion_file = '%s/%s_ids.csv' % (self.index_folder_name, self.organism)
+ conversion_file = os.path.join(
+ self.index_folder_name, "{}_ids.csv".format(self.organism)
+ )
if truncated_annotation is None:
- truncated_annotation = '%s/%s_multiconsortia.gtf' % (
- self.index_folder_name, self.organism)
+ truncated_annotation = os.path.join(
+ self.index_folder_name, self.organism, "annotations.gtf"
+ )
+
+ if not (self.additional_id_types or valid_biotypes): # nothing to be done
+ # no need to truncate the annotation file
+ # let's just make a copy of the original file so that it can be added to the final output directory
+ cmd = "gunzip -c {} > {}".format(gtf_file, truncated_annotation)
+ err = check_call(cmd, shell=True)
+ if err:
+ raise ChildProcessError("conversion file download failed: %s" % err)
+ return
+
+ # change to set for efficiency
+ valid_biotypes = set(valid_biotypes)
# extract valid ensembl ids from the conversion file
c = pd.read_csv(conversion_file, index_col=[0])
- valid_ensembl_ids = set(c[np.any(~c.isnull().values, axis=1)].index)
+
+ if c.shape[1] == 1:
+ # index == ensembl_gene_id & col 1 == hgnc_symbol
+ valid_ensembl_ids = set(c[np.any(~c.isnull().values, axis=1)].index)
+ elif c.shape[1] == 0:
+ # index == ensembl_gene_id & no columns
+ # set to none to take all IDs
+ valid_ensembl_ids = None
+ else:
+ raise Exception("Not implemented/supported shape={}".format(c.shape))
# remove any invalid ids from the annotation file
gr = gtf.Reader(gtf_file)
- with open(truncated_annotation, 'wb') as f:
+ with open(truncated_annotation, "wt") as f:
for line_fields in gr:
record = gtf.Record(line_fields)
- if (record.attribute(b'gene_id').decode() in valid_ensembl_ids and
- record.attribute(b'gene_biotype') in valid_biotypes):
- f.write(bytes(record))
+ # include only biotypes of interest
+ if record.attribute("gene_biotype") in valid_biotypes:
+ if (valid_ensembl_ids is None) or (
+ record.attribute("gene_id") in valid_ensembl_ids
+ ):
+ f.write("\t".join(line_fields))
def _create_star_index(
- self,
- fasta_file: str=None,
- gtf_file: str=None,
- genome_dir: str=None,
- read_length: int=75) -> None:
+ self,
+ fasta_file: str = None,
+ gtf_file: str = None,
+ genome_dir: str = None,
+ read_length: int = 75,
+ ) -> None:
"""Create a new STAR index for the associated genome
:param fasta_file:
@@ -269,16 +335,23 @@ def _create_star_index(
:return:
"""
if fasta_file is None:
- fasta_file = '%s/%s.fa.gz' % (self.index_folder_name, self.organism)
+ fasta_file = os.path.join(
+ self.index_folder_name, "{}.fa.gz".format(self.organism)
+ )
if gtf_file is None:
- if os.path.isfile('%s/%s_multiconsortia.gtf' % (
- self.index_folder_name, self.organism)):
- gtf_file = '%s/%s_multiconsortia.gtf' % (
- self.index_folder_name, self.organism)
+ if os.path.isfile(
+ os.path.join(self.index_folder_name, self.organism, "annotations.gtf")
+ ):
+ gtf_file = os.path.join(
+ self.index_folder_name, self.organism, "annotations.gtf"
+ )
else:
- gtf_file = '%s/%s.gtf.gz' % (self.index_folder_name, self.organism)
+ gtf_file = os.path.join(
+ self.index_folder_name, "{}.gtf.gz".format(self.organism)
+ )
if genome_dir is None:
- genome_dir = '%s/%s' % (self.index_folder_name, self.organism)
+ genome_dir = os.path.join(self.index_folder_name, self.organism)
+
star.create_index(fasta_file, gtf_file, genome_dir, read_length)
@staticmethod
@@ -288,16 +361,23 @@ def _upload_index(index_directory: str, s3_upload_location: str) -> None:
:param index_directory: folder containing index
:param s3_upload_location: location to upload index on s3
"""
- if not index_directory.endswith('/'):
- index_directory += '/'
- if not s3_upload_location.endswith('/'):
- s3_upload_location += '/'
- bucket, *dirs = s3_upload_location.replace('s3://', '').split('/')
- key_prefix = '/'.join(dirs)
- S3.upload_files(file_prefix=index_directory, bucket=bucket, key_prefix=key_prefix)
+ if not index_directory.endswith("/"):
+ index_directory += "/"
+ if not s3_upload_location.endswith("/"):
+ s3_upload_location += "/"
+ bucket, *dirs = s3_upload_location.replace("s3://", "").split("/")
+ key_prefix = "/".join(dirs)
+ S3.upload_files(
+ file_prefix=index_directory, bucket=bucket, key_prefix=key_prefix
+ )
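+
+ # hedged usage sketch (bucket and paths are illustrative):
+ # Index._upload_index("./homo_sapiens/", "s3://my-bucket/genomes/homo_sapiens/")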
def create_index(
- self, valid_biotypes=('protein_coding', 'lincRNA'), s3_location: str=None):
+ self,
+ ensemble_release: int,
+ read_length: int,
+ valid_biotypes=("protein_coding", "lincRNA"),
+ s3_location: str = None,
+ ):
"""create an optionally upload an index
:param valid_biotypes: gene biotypes that do not match values in this list will
@@ -305,11 +385,19 @@ def create_index(
:param s3_location: optional, s3 location to upload the index to.
:return:
"""
- if self.index_folder_name is not '.':
- os.makedirs(self.index_folder_name, exist_ok=True)
- self._download_ensembl_files()
+
+ log.info("Downloading Ensemble files...")
+ self._download_ensembl_files(ensemble_release)
+
+ log.info("Subsetting genes...")
self._subset_genes(valid_biotypes=valid_biotypes)
- self._create_star_index()
+
+ log.info("Creating STAR index...")
+ self._create_star_index(read_length=read_length)
+
if s3_location:
- self._upload_index('%s/%s' % (self.index_folder_name, self.organism),
- s3_location)
+ log.info("Uploading...")
+ self._upload_index(
+ "%s/%s" % (self.index_folder_name, self.organism), s3_location
+ )
+
diff --git a/src/seqc/sparse_frame.py b/src/seqc/sparse_frame.py
index 4aecb79..d0b82e1 100644
--- a/src/seqc/sparse_frame.py
+++ b/src/seqc/sparse_frame.py
@@ -7,7 +7,6 @@
class SparseFrame:
-
def __init__(self, data, index, columns):
"""
lightweight wrapper of scipy.stats.coo_matrix to provide pd.DataFrame-like access
@@ -25,11 +24,11 @@ def __init__(self, data, index, columns):
"""
if not isinstance(data, coo_matrix):
- raise TypeError('data must be type coo_matrix')
+ raise TypeError("data must be type coo_matrix")
if not isinstance(index, np.ndarray):
- raise TypeError('index must be type np.ndarray')
+ raise TypeError("index must be type np.ndarray")
if not isinstance(columns, np.ndarray):
- raise TypeError('columns must be type np.ndarray')
+ raise TypeError("columns must be type np.ndarray")
self._data = data
self._index = index
@@ -42,7 +41,7 @@ def data(self):
@data.setter
def data(self, item):
if not isinstance(item, coo_matrix):
- raise TypeError('data must be type coo_matrix')
+ raise TypeError("data must be type coo_matrix")
self._data = item
@property
@@ -54,7 +53,7 @@ def index(self, item):
try:
self._index = np.array(item)
except:
- raise TypeError('self.index must be convertible into a np.array object')
+ raise TypeError("self.index must be convertible into a np.array object")
@property
def columns(self):
@@ -65,7 +64,7 @@ def columns(self, item):
try:
self._columns = np.array(item)
except:
- raise TypeError('self.columns must be convertible into a np.array object')
+ raise TypeError("self.columns must be convertible into a np.array object")
@property
def shape(self):
@@ -106,18 +105,22 @@ def from_dict(cls, dictionary, genes_to_symbols=False):
i_inds = np.fromiter((imap[v] for v in i), dtype=int)
j_inds = np.fromiter((jmap[v] for v in j), dtype=int)
- coo = coo_matrix((data, (i_inds, j_inds)), shape=(len(imap), len(jmap)),
- dtype=np.int32)
+ coo = coo_matrix(
+ (data, (i_inds, j_inds)), shape=(len(imap), len(jmap)), dtype=np.int32
+ )
index = np.fromiter(imap.keys(), dtype=int)
columns = np.fromiter(jmap.keys(), dtype=int)
if genes_to_symbols:
if not os.path.isfile(genes_to_symbols):
- raise ValueError('genes_to_symbols argument %s is not a valid annotation '
- 'file' % repr(genes_to_symbols))
+ raise ValueError(
+ "genes_to_symbols argument %s is not a valid annotation "
+ "file" % repr(genes_to_symbols)
+ )
gmap = create_gene_id_to_official_gene_symbol_map(genes_to_symbols)
- columns = np.array(ensembl_gene_id_to_official_gene_symbol(
- columns, gene_id_map=gmap))
+ columns = np.array(
+ ensembl_gene_id_to_official_gene_symbol(columns, gene_id_map=gmap)
+ )
return cls(coo, index, columns)
diff --git a/src/seqc/summary/summary.py b/src/seqc/summary/summary.py
index a3ac004..9bc59e0 100644
--- a/src/seqc/summary/summary.py
+++ b/src/seqc/summary/summary.py
@@ -98,25 +98,25 @@ def from_status_filters(cls, ra, filename):
:param str filename: html file name for this section
:return cls: Section containing initial filtering results
"""
- # todo replace whitespace characters with html equiv, add space b/w lines
+
description = (
- 'Initial filters are run over the sam file while our ReadArray database is '
+ 'Initial filters are run over the sam file while our ReadArray database is '
'being constructed. These filters indicate heuristic reasons why reads '
- 'should be omitted from downstream operations:<br>'
- 'no gene: Regardless of the read\'s genomic alignment status, there was no '
- 'transcriptomic alignment for this read.<br>'
- 'gene not unique: this indicates that more than one alignment was recovered '
- 'for this read. We attempt to resolve these multi-alignments downstream.<br>'
- 'primer missing: This is an in-drop specific filter, it indices that the '
+ 'should be omitted from downstream operations:<br>'
+ '- no gene: Regardless of the read\'s genomic alignment status, there was no '
+ 'transcriptomic alignment for this read.<br>'
+ '- gene not unique: This indicates that more than one alignment was recovered '
+ 'for this read. We attempt to resolve these multi-alignments downstream.<br>'
+ '- primer missing: This is an in-drop specific filter; it indicates that the '
'spacer sequence could not be identified, and thus neither a cell barcode '
- 'nor an rmt were recorded for this read.<br>'
- 'low poly t: the primer did not display enough t-sequence in the primer '
+ 'nor an rmt were recorded for this read.<br>'
+ '- low poly t: The primer did not display enough t-sequence in the primer '
'tail, where these nucleotides are expected. This indicates an increased '
'probability that this primer randomly primed, instead of hybridizing with '
- 'the poly-a tail of an mRNA molecule.')
+ 'the poly-a tail of an mRNA molecule.<br>')
description_section = TextContent(description)
- # Get counts
+ # Get counts
no_gene = np.sum(ra.data['status'] & ra.filter_codes['no_gene'] > 0)
gene_not_unique = np.sum(ra.data['status'] & ra.filter_codes['gene_not_unique'] > 0)
primer_missing = np.sum(ra.data['status'] & ra.filter_codes['primer_missing'] > 0)
@@ -209,10 +209,23 @@ def from_cell_filtering(cls, figure_path, filename):
:param str filename: html file name for this section
:return:
"""
- description = 'description for cell filtering' # todo implement
+ description = [
+ "Top Left: Cells whose molecule counts are below the inflection point of an ecdf constructed from cell molecule counts",
+ "Top Right: Fits a two-component gaussian mixture model to the data. If a component is found to fit a low-coverage fraction of the data, this fraction is set as invalid.",
+ "Bottom Left: Sets any cell with a fraction of mitochondrial mRNA greater than 20% to invalid.",
+ "Bottom Right: Fits a linear model to the relationship between number of genes detected and number of molecules detected. Cells with a lower than expected number of detected genes are set as invalid."
+ ]
+
+ description = list(map(lambda text: f"<li>{text}</li>", description))
+ description = "<ul>" + "".join(description) + "</ul>"
description_section = TextContent(description)
- image_legend = 'image legend' # todo implement
- image_section = ImageContent(figure_path, 'cell filtering figure', image_legend)
+ image_legend = "Cell Filtering"
+ # use basename in the HTML file
+ image_section = ImageContent(
+ os.path.basename(figure_path),
+ 'cell filtering figure',
+ image_legend
+ )
return cls(
'Cell Filtering',
{'Description': description_section, 'Results': image_section},
@@ -229,8 +242,9 @@ def from_final_matrix(cls, counts_matrix, figure_path, filename):
:param pd.DataFrame counts_matrix:
:return:
"""
+ # use full path to generate an image
plot.Diagnostics.cell_size_histogram(counts_matrix, save=figure_path)
-
+
# Number of cells and molecule count distributions
image_legend = "Number of cells: {}
".format(counts_matrix.shape[0])
ms = counts_matrix.sum(axis=1)
@@ -239,7 +253,12 @@ def from_final_matrix(cls, counts_matrix, figure_path, filename):
image_legend += '{}th percentile: {}<br>'.format(prctile, np.percentile(ms, prctile))
image_legend += "Max number of molecules: {}<br>".format(ms.max())
- image_section = ImageContent(figure_path, 'cell size figure', image_legend)
+ # use basename in the HTML file
+ image_section = ImageContent(
+ os.path.basename(figure_path),
+ 'cell size figure',
+ image_legend
+ )
return cls('Cell Summary',
{'Library Size Distribution': image_section},
filename)
@@ -320,26 +339,33 @@ def render(self):
def compress_archive(self):
root_dir, _, base_dir = self.archive_name.rpartition('/')
+ if root_dir == "":
+ # make_archive doesn't like an empty string, should be None
+ root_dir = None
shutil.make_archive(
- self.archive_name, 'gztar', root_dir, base_dir)
+ self.archive_name, 'gztar', root_dir, base_dir
+ )
return self.archive_name + '.tar.gz'
class MiniSummary:
- def __init__(self, output_prefix, mini_summary_d, alignment_summary_file, filter_fig, cellsize_fig):
+ def __init__(self, output_dir, output_prefix, mini_summary_d, alignment_summary_file, filter_fig, cellsize_fig):
"""
+ :param output_dir: directory in which output files are written
+ :param output_prefix: prefix for output file names
:param mini_summary_d: dictionary containing output parameters
- :param count_mat: count matrix after filtered
+ :param alignment_summary_file: alignment summary file
:param filter_fig: filtering figure
:param cellsize_fig: cell size figure
"""
+ self.output_dir = output_dir
self.output_prefix = output_prefix
self.mini_summary_d = mini_summary_d
self.alignment_summary_file = alignment_summary_file
+
+ # use the full path for figures
self.filter_fig = filter_fig
self.cellsize_fig = cellsize_fig
- self.pca_fig = output_prefix+"_pca.png"
- self.tsne_and_phenograph_fig = output_prefix+"_phenograph.png"
+ self.pca_fig = os.path.join(output_dir, output_prefix + "_pca.png")
+ self.tsne_and_phenograph_fig = os.path.join(output_dir, output_prefix + "_phenograph.png")
def compute_summary_fields(self, read_array, count_mat):
self.count_mat = pd.DataFrame(count_mat)
@@ -384,7 +410,7 @@ def compute_summary_fields(self, read_array, count_mat):
# Doing PCA transformation
pcaModel = PCA(n_components=min(20, counts_normalized.shape[1]))
- counts_pca_reduced = pcaModel.fit_transform(counts_normalized.as_matrix())
+ counts_pca_reduced = pcaModel.fit_transform(counts_normalized.values)
# taking at most 20 components or total variance is greater than 80%
num_comps = 0
@@ -396,7 +422,7 @@ def compute_summary_fields(self, read_array, count_mat):
self.counts_after_pca = counts_pca_reduced[:, :num_comps]
self.explained_variance_ratio = pcaModel.explained_variance_ratio_
- # regressed library size out of principal components
+ # regress library size out of the principal components
for c in range(num_comps):
lm = LinearRegression(normalize=False)
X = self.counts_filtered.sum(1).values.reshape(len(self.counts_filtered), 1)
@@ -424,7 +450,7 @@ def get_counts_filtered(self):
def render(self):
plot.Diagnostics.pca_components(self.pca_fig, self.explained_variance_ratio, self.counts_after_pca)
- plot.Diagnostics.phenograph_clustering(self.tsne_and_phenograph_fig, self.counts_filtered.sum(1),
+ plot.Diagnostics.phenograph_clustering(self.tsne_and_phenograph_fig, self.counts_filtered.sum(1),
self.clustering_communities, self.counts_after_tsne)
self.mini_summary_d['seq_sat_rate'] = ((self.mini_summary_d['avg_reads_per_molc'] - 1.0) * 100.0
@@ -438,22 +464,39 @@ def render(self):
warning_d["High percentage of cell death"] = "No"
warning_d["Noisy first few principle components"] = "Yes" if (self.explained_variance_ratio[0]<=0.05) else "No"
if self.mini_summary_d['seq_sat_rate'] <= 5.00:
- warning_d["Low sequencing saturation rate"] = ("Yes (%.2f%%)" % (self.mini_summary_d['seq_sat_rate']))
+ warning_d["Low sequencing saturation rate"] = ("Yes (%.2f%%)" % (self.mini_summary_d['seq_sat_rate']))
else:
warning_d["Low sequencing saturation rate"] = "No"
env = Environment(loader=PackageLoader('seqc.summary', 'templates'))
section_template = env.get_template('mini_summary_base.html')
- rendered_section = section_template.render(output_prefix = self.output_prefix, warning_d = warning_d,
- mini_summary_d = self.mini_summary_d, cellsize_fig = self.cellsize_fig,
- pca_fig = self.pca_fig, filter_fig = self.filter_fig,
- tsne_and_phenograph_fig = self.tsne_and_phenograph_fig)
- with open(self.output_prefix + "_mini_summary.html", 'w') as f:
+
+ # use the basename (i.e. not full path) when rendering figures
+ rendered_section = section_template.render(
+ output_prefix=self.output_prefix,
+ warning_d=warning_d,
+ mini_summary_d=self.mini_summary_d,
+ cellsize_fig=os.path.basename(self.cellsize_fig),
+ pca_fig=os.path.basename(self.pca_fig),
+ filter_fig=os.path.basename(self.filter_fig),
+ tsne_and_phenograph_fig=os.path.basename(self.tsne_and_phenograph_fig),
+ )
+
+ # construct path for mini summary in HTML, JSON, and PDF
+ path_mini_html = os.path.join(self.output_dir, self.output_prefix + "_mini_summary.html")
+ path_mini_json = os.path.join(self.output_dir, self.output_prefix + "_mini_summary.json")
+ path_mini_pdf = os.path.join(self.output_dir, self.output_prefix + "_mini_summary.pdf")
+
+ # save html
+ with open(path_mini_html, 'w') as f:
f.write(rendered_section)
- HTML(self.output_prefix + "_mini_summary.html").write_pdf(self.output_prefix + "_mini_summary.pdf")
+ # save pdf
+ HTML(path_mini_html).write_pdf(path_mini_pdf)
- with open(self.output_prefix + "_mini_summary.json","w") as f:
+ # save json
+ with open(path_mini_json, "w") as f:
json.dump(self.mini_summary_d, f)
- return self.output_prefix + "_mini_summary.json", self.output_prefix + "_mini_summary.pdf"
\ No newline at end of file
+ # return path to mini summary in JSON & PDF
+ return path_mini_json, path_mini_pdf
diff --git a/src/seqc/test.py b/src/seqc/test.py
deleted file mode 100644
index b9e68af..0000000
--- a/src/seqc/test.py
+++ /dev/null
@@ -1,388 +0,0 @@
-import os
-import unittest
-import gzip
-import pandas as pd
-import numpy as np
-import ftplib
-import nose2
-from nose2.tools import params
-from seqc.sequence import index, gtf
-from seqc.core import main
-from seqc.read_array import ReadArray
-import seqc
-import logging
-
-logging.basicConfig()
-
-seqc_dir = '/'.join(seqc.__file__.split('/')[:-3]) + '/'
-
-# fill and uncomment these variables to avoid having to provide input to tests
-TEST_BUCKET = "seqc-public" # None
-EMAIL = None
-# RSA_KEY = None
-
-# define some constants for testing
-BARCODE_FASTQ = 's3://seqc-public/test/%s/barcode/' # platform
-GENOMIC_FASTQ = 's3://seqc-public/test/%s/genomic/' # platform
-MERGED = 's3://seqc-public/test/%s/%s_merged.fastq.gz' # platform, platform
-SAMFILE = 's3://seqc-public/test/%s/Aligned.out.bam' # platform
-INDEX = 's3://seqc-public/genomes/hg38_chr19/'
-LOCAL_OUTPUT = os.environ['TMPDIR'] + 'seqc/%s/test' # test_name
-REMOTE_OUTPUT = './test'
-UPLOAD = 's3://%s/seqc_test/%s/' # bucket_name, test_folder
-PLATFORM_BARCODES = 's3://seqc-public/barcodes/%s/flat/' # platform
-
-
-def makedirs(output):
- """make directories based on OUTPUT"""
- stem, prefix = os.path.split(output)
- os.makedirs(stem, exist_ok=True)
-
-
-class TestSEQC(unittest.TestCase):
-
- email = globals()['EMAIL'] if 'EMAIL' in globals() else None
- bucket = globals()['TEST_BUCKET'] if 'TEST_BUCKET' in globals() else None
- rsa_key = globals()['RSA_KEY'] if 'RSA_KEY' in globals() else None
- if rsa_key is None:
- try:
- rsa_key = os.environ['AWS_RSA_KEY']
- except KeyError:
- pass
-
- def check_parameters(self):
- if self.email is None:
- self.email = input('please provide an email address for SEQC to mail '
- 'results: ')
- if self.bucket is None:
- self.bucket = input('please provide an amazon s3 bucket to upload test '
- 'results: ')
- self.bucket.rstrip('/')
- if self.rsa_key is None:
- self.rsa_key = input('please provide an RSA key with permission to create '
- 'aws instances: ')
-
- # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
- def test_local(self, platform='in_drop_v2'):
- """test seqc after pre-downloading all files"""
- with open('seqc_log.txt', 'w') as f:
- f.write('Dummy log. nose2 captures input, so no log is produced. This causes '
- 'pipeline errors.\n')
- test_name = 'test_no_aws_%s' % platform
- makedirs(LOCAL_OUTPUT % test_name)
- if self.email is None:
- self.email = input('please provide an email address for SEQC to mail results: ')
- argv = [
- 'run',
- platform,
- '-o', test_name,
- '-i', INDEX,
- '-b', BARCODE_FASTQ % platform,
- '-g', GENOMIC_FASTQ % platform,
- '--barcode-files', PLATFORM_BARCODES % platform,
- '-e', self.email,
- '--local']
- main.main(argv)
- os.remove('./seqc_log.txt') # clean up the dummy log we made.
-
- # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
- def test_remote_from_raw_fastq(self, platform='ten_x_v2'):
- test_name = 'test_remote_%s' % platform
- self.check_parameters()
- argv = [
- 'run',
- platform,
- '-o', REMOTE_OUTPUT,
- '-u', UPLOAD % (self.bucket, test_name),
- '-i', INDEX,
- '-e', self.email,
- '-b', BARCODE_FASTQ % platform,
- '-g', GENOMIC_FASTQ % platform,
- '--instance-type', 'c4.large',
- '--spot-bid', '1.0',
- '-k', self.rsa_key, '--debug']
- if platform != 'drop_seq':
- argv += ['--barcode-files', PLATFORM_BARCODES % platform]
- main.main(argv)
-
- # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
- def test_remote_from_merged(self, platform='in_drop_v2'):
- test_name = 'test_remote_%s' % platform
- self.check_parameters()
- argv = [
- 'run',
- platform,
- '-o', REMOTE_OUTPUT,
- '-u', UPLOAD % (self.bucket, test_name),
- '-i', INDEX,
- '-e', self.email,
- '-m', MERGED % (platform, platform),
- '-k', self.rsa_key,
- '--instance-type', 'c4.large',
- # '--spot-bid', '1.0'
- ]
- if platform != 'drop_seq':
- argv += ['--barcode-files', PLATFORM_BARCODES % platform]
- main.main(argv)
-
- # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
- def test_remote_from_samfile(self, platform='in_drop_v2'):
- test_name = 'test_remote_%s' % platform
- self.check_parameters()
- argv = [
- 'run',
- platform,
- '-o', REMOTE_OUTPUT,
- '-u', UPLOAD % (self.bucket, test_name),
- '-i', INDEX,
- '-e', self.email,
- '-a', SAMFILE % platform,
- '-k', self.rsa_key,
- '--instance-type', 'r5.2xlarge',
- '--debug',
- # '--spot-bid', '1.0'
- ]
- if platform != 'drop_seq':
- argv += ['--barcode-files', PLATFORM_BARCODES % platform]
- main.main(argv)
-
-
-class TestIndex(unittest.TestCase):
-
- @classmethod
- def setUpClass(cls):
- cls.outdir = os.environ['TMPDIR']
-
- def test_Index_raises_ValueError_when_organism_is_not_provided(self):
- self.assertRaises(ValueError, index.Index, organism='', additional_id_fields=[])
-
- def test_Index_raises_ValueError_when_organism_isnt_lower_case(self):
- self.assertRaises(ValueError, index.Index, organism='Homo_sapiens',
- additional_id_fields=[])
- self.assertRaises(ValueError, index.Index, organism='Homo_Sapiens',
- additional_id_fields=[])
- self.assertRaises(ValueError, index.Index, organism='hoMO_Sapiens',
- additional_id_fields=[])
-
- def test_Index_raises_ValueError_when_organism_has_no_underscore(self):
- self.assertRaises(ValueError, index.Index, organism='homosapiens',
- additional_id_fields=[])
-
- def test_Index_raises_TypeError_when_additional_id_fields_is_not_correct_type(self):
- self.assertRaises(TypeError, index.Index, organism='homo_sapiens',
- additional_id_fields='not_an_array_tuple_or_list')
- self.assertRaises(TypeError, index.Index, organism='homo_sapiens',
- additional_id_fields='')
-
- def test_False_evaluating_additional_id_fields_are_accepted_but_set_empty_list(self):
- idx = index.Index('homo_sapiens', [])
- self.assertEqual(idx.additional_id_types, [])
- idx = index.Index('homo_sapiens', tuple())
- self.assertEqual(idx.additional_id_types, [])
- idx = index.Index('homo_sapiens', np.array([]))
- self.assertEqual(idx.additional_id_types, [])
-
- def test_converter_xml_contains_one_attribute_line_per_gene_list(self):
- idx = index.Index('homo_sapiens', ['hgnc_symbol', 'mgi_symbol'])
- self.assertEqual(idx._converter_xml.count('Attribute name'), 3)
- idx = index.Index('homo_sapiens', [])
- self.assertEqual(idx._converter_xml.count('Attribute name'), 1)
-
- def test_converter_xml_formats_genome_as_first_initial_plus_species(self):
- idx = index.Index('homo_sapiens', ['hgnc_symbol', 'mgi_symbol'])
- self.assertIn('hsapiens', idx._converter_xml)
- idx = index.Index('mus_musculus')
- self.assertIn('mmusculus', idx._converter_xml)
-
- def test_can_login_to_ftp_ensembl(self):
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
-
- def test_download_converter_gets_output_and_is_pandas_loadable(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- filename = self.outdir + 'ci.csv'
- idx._download_conversion_file(filename)
- converter = pd.read_csv(filename, index_col=0)
- self.assertGreaterEqual(len(converter), 10)
- self.assertEqual(converter.shape[1], 1)
- os.remove(filename) # cleanup
-
- def test_identify_newest_release_finds_a_release_which_is_gt_eq_85(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- newest = idx._identify_newest_release(ftp)
- self.assertGreaterEqual(int(newest), 85) # current=85, they only get bigger
-
- def test_identify_genome_file_finds_primary_assembly_when_present(self):
- idx = index.Index('homo_sapiens', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- newest = idx._identify_newest_release(ftp)
- ftp.cwd('/pub/release-%d/fasta/%s/dna' % (newest, idx.organism))
- filename = idx._identify_genome_file(ftp.nlst())
- self.assertIn('primary_assembly', filename)
-
- def test_identify_genome_file_defaults_to_toplevel_when_no_primary_assembly(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- newest = idx._identify_newest_release(ftp)
- ftp.cwd('/pub/release-%d/fasta/%s/dna' % (newest, idx.organism))
- filename = idx._identify_genome_file(ftp.nlst())
- self.assertIn('toplevel', filename)
-
- def test_download_fasta_file_gets_a_properly_formatted_file(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- filename = self.outdir + 'ci.fa.gz'
- idx._download_fasta_file(ftp, filename)
- with gzip.open(filename, 'rt') as f:
- self.assertIs(f.readline()[0], '>') # starting character for genome fa record
- os.remove(filename)
-
- def test_identify_annotation_file_finds_a_gtf_file(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- newest = idx._identify_newest_release(ftp)
- ftp.cwd('/pub/release-%d/gtf/%s/' % (newest, idx.organism))
- filename = idx._identify_gtf_file(ftp.nlst(), newest)
- self.assertIsNotNone(filename)
-
- def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self):
- idx = index.Index('ciona_intestinalis', ['entrezgene'])
- with ftplib.FTP(host='ftp.ensembl.org') as ftp:
- ftp.login()
- filename = self.outdir + 'ci.gtf.gz'
- idx._download_gtf_file(ftp, filename)
- rd = gtf.Reader(filename)
- rc = next(rd.iter_genes())
- self.assertIsInstance(rc, gtf.Gene)
- os.remove(filename)
-
- def test_subset_genes_does_nothing_if_no_additional_fields_or_valid_biotypes(self):
- idx = index.Index('ciona_intestinalis')
- fasta_name = self.outdir + 'ci.fa.gz'
- gtf_name = self.outdir + 'ci.gtf.gz'
- conversion_name = self.outdir + 'ci_ids.csv'
- idx._download_ensembl_files(fasta_name, gtf_name, conversion_name)
- idx._subset_genes(conversion_name, gtf_name, self.outdir + 'test.csv',
- valid_biotypes=None)
- self.assertFalse(os.path.isfile(self.outdir))
-
- def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(self):
- organism = 'ciona_intestinalis'
- idx = index.Index(organism, ['entrezgene'])
- os.chdir(self.outdir)
- idx._download_ensembl_files()
- self.assertTrue(os.path.isfile('%s.fa.gz' % organism), 'fasta file not found')
- self.assertTrue(os.path.isfile('%s.gtf.gz' % organism), 'gtf file not found')
- self.assertTrue(os.path.isfile('%s_ids.csv' % organism), 'id file not found')
-
- idx._subset_genes()
- self.assertTrue(os.path.isfile('%s_multiconsortia.gtf' % organism))
- gr_subset = gtf.Reader('%s_multiconsortia.gtf' % organism)
- gr_complete = gtf.Reader('%s.gtf.gz' % organism)
- self.assertLess(
- len(gr_subset), len(gr_complete),
- 'Subset annotation was not smaller than the complete annotation')
-
- # make sure only valid biotypes are returned
- complete_invalid = False
- valid_biotypes = {b'protein_coding', b'lincRNA'}
- for r in gr_complete.iter_genes():
- if r.attribute(b'gene_biotype') not in valid_biotypes:
- complete_invalid = True
- break
- self.assertTrue(complete_invalid)
- subset_invalid = False
- for r in gr_subset.iter_genes():
- if r.attribute(b'gene_biotype') not in valid_biotypes:
- subset_invalid = True
- break
- self.assertFalse(subset_invalid)
- self.assertGreater(len(gr_subset), 0)
-
- def test_create_star_index_produces_an_index(self):
- organism = 'ciona_intestinalis'
- idx = index.Index(organism, ['entrezgene'])
- os.chdir(self.outdir)
- idx._download_ensembl_files()
- idx._subset_genes()
- print(os.getcwd())
- idx._create_star_index()
- self.assertTrue(os.path.isfile('{outdir}/{organism}/SAindex'.format(
- outdir=self.outdir, organism=organism)))
-
- def test_upload_star_index_correctly_places_index_on_s3(self):
- os.chdir(self.outdir)
- if 'TEST_BUCKET' in globals():
- bucket = globals()['TEST_BUCKET']
- else:
- bucket = input('please provide an amazon s3 bucket to upload test results: ')
- organism = 'ciona_intestinalis'
- idx = index.Index(organism, ['entrezgene'])
- index_directory = organism + '/'
- idx._download_ensembl_files()
- idx._subset_genes()
- idx._create_star_index()
- idx._upload_index(index_directory, 's3://%s/genomes/ciona_intestinalis/' % bucket)
-
- def test_create_index_produces_and_uploads_an_index(self):
- if 'TEST_BUCKET' in globals():
- bucket = globals()['TEST_BUCKET']
- else:
- bucket = input('please provide an amazon s3 bucket to upload test results: ')
- organism = 'ciona_intestinalis'
- idx = index.Index(organism, ['entrezgene'], self.outdir)
- idx.create_index(s3_location='s3://%s/genomes/%s/' % (bucket, idx.organism))
-
-
-class TestReadArrayCreation(unittest.TestCase):
-
- @classmethod
- def setUpClass(cls):
- platform = 'in_drop_v2'
- # cls.bamfile = LOCAL_OUTPUT % platform + '_bamfile.bam'
- # cls.annotation = LOCAL_OUTPUT % platform + '_annotations.gtf'
- # S3.download(SAMFILE % platform, cls.bamfile, recursive=False)
- # S3.download(INDEX + 'annotations.gtf', cls.annotation, recursive=False)
-
- cls.bamfile = os.path.expanduser('~/Downloads/mm_test_short.bam')
- cls.annotation = os.path.expanduser('~/Downloads/annotations.gtf')
- cls.summary = os.path.expanduser('~/Downloads/mm_test_summary.txt')
- cls.total_input_reads = 12242659
- cls.translator = gtf.GeneIntervals(cls.annotation, 10000)
-
- def test_read_array_creation(self):
- ra = ReadArray.from_alignment_file(self.bamfile, self.translator,
- required_poly_t=0)
- print(repr(ra.data.shape[0]))
- print(repr(ra.genes))
- ra.save(os.path.expanduser('~/Downloads/test.ra'))
-
-
-class TestTranslator(unittest.TestCase):
-
- @classmethod
- def setUpClass(cls):
- cls.annotation = os.path.expanduser('~/Downloads/annotations.gtf')
-
- def test_construct_translator(self):
- translator = gtf.GeneIntervals(self.annotation)
- print(len(translator._chromosomes_to_genes))
-
- def get_length_of_gtf(self):
- rd = gtf.Reader(self.annotation)
- # print(len(rd))
- print(sum(1 for _ in rd.iter_transcripts()))
-
-
-#########################################################################################
-
-if __name__ == "__main__":
- nose2.main()
-
-#########################################################################################
diff --git a/src/seqc/tests/__init__.py b/src/seqc/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/seqc/tests/test_args.py b/src/seqc/tests/test_args.py
new file mode 100644
index 0000000..de7f96f
--- /dev/null
+++ b/src/seqc/tests/test_args.py
@@ -0,0 +1,78 @@
+import nose2
+import unittest
+
+import seqc
+from seqc.core import main
+
+
+# class TestSEQC(unittest.TestCase):
+# def setUp(self):
+# pass
+
+# def tearDown(self):
+# pass
+
+# def test_args(self):
+
+# argv = ["start", "-k", "/Users/dchun/dpeerlab-chunj.pem", "-t", "t2.micro"]
+
+# self.assertRaises(ValueError, lambda: main.main(argv))
+
+# class MyUnitTest(unittest.TestCase):
+# def setUp(self):
+# pass
+
+# def tearDown(self):
+# pass
+
+# def test_args(self):
+
+# # argv = [
+# # "run", "ten_x_v2", "--local",
+# # "--index", "s3://seqc-public/genomes/hg38_chr19/",
+# # "--barcode-files", "s3://seqc-public/barcodes/ten_x_v2/flat/",
+# # "--genomic-fastq", "./test-data/genomic/",
+# # "--barcode-fastq", "./test-data/barcode/",
+# # "--output-prefix", "./test-data/seqc-results/",
+# # "--email", "jaeyoung.chun@gmail.com",
+# # "--star-args", "\"runRNGseed=0\""
+# # ]
+
+# argv = [
+# "run"
+# ]
+
+# try:
+# main.main(argv)
+# # self.assertRaises(BaseException, lambda: main.main(argv))
+# except:
+# pass
+# # self.assertRaises(ValueError, lambda: main.main(argv))
+
+
+# class TestSEQC(unittest.TestCase):
+# def setUp(self):
+# pass
+
+# def tearDown(self):
+# pass
+
+# def test_args(self):
+
+# from seqc.sequence import gtf
+
+# # remove any invalid ids from the annotation file
+# gr = gtf.Reader("./test-data/homo_sapiens.gtf.gz")
+
+# for line_fields in gr:
+# record = gtf.Record(line_fields)
+# print(record)
+# biotype = record.attribute("gene_biotype")
+# print(biotype)
+
+# # self.assertRaises(ValueError, lambda: main.main(argv))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/seqc/tests/test_dataset.py b/src/seqc/tests/test_dataset.py
new file mode 100644
index 0000000..d269b2b
--- /dev/null
+++ b/src/seqc/tests/test_dataset.py
@@ -0,0 +1,24 @@
+from collections import namedtuple
+
+TestDataset = namedtuple(
+    "TestDataset",
+    ["barcode_fastq", "genomic_fastq", "merged_fastq", "bam", "index", "barcodes"],
+)
+
+dataset_s3 = TestDataset(
+ barcode_fastq="s3://seqc-public/test/%s/barcode/", # platform
+ genomic_fastq="s3://seqc-public/test/%s/genomic/", # platform
+ merged_fastq="s3://seqc-public/test/%s/%s_merged.fastq.gz", # platform, platform
+ bam="s3://seqc-public/test/%s/Aligned.out.bam", # platform
+ index="s3://seqc-public/genomes/hg38_chr19/",
+ barcodes="s3://seqc-public/barcodes/%s/flat/", # platform
+)
+
+dataset_local = TestDataset(
+ barcode_fastq="test-data/datasets/%s/barcode/", # platform
+ genomic_fastq="test-data/datasets/%s/genomic/", # platform
+ merged_fastq=None,
+ bam="test-data/datasets/%s/Aligned.out.bam", # platform
+ index="test-data/datasets/genomes/hg38_chr19/",
+ barcodes="test-data/datasets/barcodes/%s/flat/", # platform
+)
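+
+# usage sketch: fields carrying a "%s" placeholder are interpolated with the
+# platform name at the call site, e.g.
+#   dataset_local.bam % "ten_x_v2"  ->  "test-data/datasets/ten_x_v2/Aligned.out.bam"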
diff --git a/src/seqc/tests/test_index.py b/src/seqc/tests/test_index.py
new file mode 100644
index 0000000..b45f58f
--- /dev/null
+++ b/src/seqc/tests/test_index.py
@@ -0,0 +1,393 @@
+import os
+import shutil
+import uuid
+import unittest
+import gzip
+import pandas as pd
+import numpy as np
+import ftplib
+import nose2
+from nose2.tools import params
+import seqc
+from seqc.sequence import index, gtf
+from seqc import io
+
+
+def expected_output_files():
+
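+    # the standard STAR genome-index outputs, plus the subset annotations.gtf
+    # that the index build step keeps alongside them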
+ files = set(
+ [
+ "Genome",
+ "SA",
+ "SAindex",
+ "annotations.gtf",
+ "chrLength.txt",
+ "chrName.txt",
+ "chrNameLength.txt",
+ "chrStart.txt",
+ "exonGeTrInfo.tab",
+ "exonInfo.tab",
+ "geneInfo.tab",
+ "genomeParameters.txt",
+ "sjdbInfo.txt",
+ "sjdbList.fromGTF.out.tab",
+ "sjdbList.out.tab",
+ "transcriptInfo.tab",
+ ]
+ )
+
+ return files
+
+
+class TestIndexRemote(unittest.TestCase):
+
+ s3_bucket = "dp-lab-cicd"
+
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.outdir = os.path.join(os.environ["TMPDIR"], "seqc-test", self.test_id)
+        os.makedirs(self.outdir, exist_ok=True)
+
+    def tearDown(self):
+ if os.path.isdir(self.outdir):
+ shutil.rmtree(self.outdir, ignore_errors=True)
+
+ def test_upload_star_index_correctly_places_index_on_s3(self):
+ organism = "ciona_intestinalis"
+ # must end with a slash
+ test_folder = f"seqc/index-{organism}-{self.test_id}/"
+
+ idx = index.Index(
+ organism, ["external_gene_name"], index_folder_name=self.outdir
+ )
+ index_directory = os.path.join(self.outdir, organism) + "/"
+ idx._download_ensembl_files(ensemble_release=None)
+ idx._subset_genes()
+ idx._create_star_index()
+ idx._upload_index(index_directory, f"s3://{self.s3_bucket}/{test_folder}")
+
+ # check files generated in S3
+ files = io.S3.listdir(self.s3_bucket, test_folder)
+
+ # extract only filenames (i.e. remove directory hierarchy)
+ # convert to a set for easy comparison
+ files = set(map(lambda filename: filename.replace(test_folder, ""), files))
+
+ # check for the exact same filenames
+ self.assertSetEqual(files, expected_output_files())
+
+ def test_create_index_produces_and_uploads_an_index(self):
+ organism = "ciona_intestinalis"
+ # must end with a slash
+ test_folder = f"seqc/index-{organism}-{self.test_id}/"
+
+ idx = index.Index(
+ organism, ["external_gene_name"], index_folder_name=self.outdir
+ )
+ idx.create_index(
+ s3_location=f"s3://{self.s3_bucket}/{test_folder}",
+ ensemble_release=None,
+ read_length=101,
+ )
+
+ # check files generated in S3
+ files = io.S3.listdir(self.s3_bucket, test_folder)
+
+ # extract only filenames (i.e. remove directory hierarchy)
+ # convert to a set for easy comparison
+ files = set(map(lambda filename: filename.replace(test_folder, ""), files))
+
+ # check for the exact same filenames
+ self.assertSetEqual(files, expected_output_files())
+
+
+class TestIndex(unittest.TestCase):
+
+ s3_bucket = "dp-lab-cicd"
+
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.outdir = os.path.join(os.environ["TMPDIR"], "seqc-test", self.test_id)
+        os.makedirs(self.outdir, exist_ok=True)
+
+    def tearDown(self):
+ if os.path.isdir(self.outdir):
+ shutil.rmtree(self.outdir, ignore_errors=True)
+
+ def test_Index_raises_ValueError_when_organism_is_not_provided(self):
+ self.assertRaises(ValueError, index.Index, organism="", additional_id_types=[])
+
+ def test_Index_raises_ValueError_when_organism_isnt_lower_case(self):
+ self.assertRaises(
+ ValueError, index.Index, organism="Homo_sapiens", additional_id_types=[]
+ )
+ self.assertRaises(
+ ValueError, index.Index, organism="Homo_Sapiens", additional_id_types=[]
+ )
+ self.assertRaises(
+ ValueError, index.Index, organism="hoMO_Sapiens", additional_id_types=[]
+ )
+
+ def test_Index_raises_ValueError_when_organism_has_no_underscore(self):
+ self.assertRaises(
+ ValueError, index.Index, organism="homosapiens", additional_id_types=[]
+ )
+
+ def test_Index_raises_TypeError_when_additional_id_fields_is_not_correct_type(self):
+ self.assertRaises(
+ TypeError,
+ index.Index,
+ organism="homo_sapiens",
+ additional_id_types="not_an_array_tuple_or_list",
+ )
+ self.assertRaises(
+ TypeError, index.Index, organism="homo_sapiens", additional_id_types=""
+ )
+
+ def test_False_evaluating_additional_id_fields_are_accepted_but_set_empty_list(
+ self,
+ ):
+ idx = index.Index("homo_sapiens", [])
+ self.assertEqual(idx.additional_id_types, [])
+ idx = index.Index("homo_sapiens", tuple())
+ self.assertEqual(idx.additional_id_types, [])
+ idx = index.Index("homo_sapiens", np.array([]))
+ self.assertEqual(idx.additional_id_types, [])
+
+ def test_converter_xml_contains_one_attribute_line_per_gene_list(self):
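+        # one base Attribute line (the gene id itself) plus one line per
+        # additional id type requested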
+ idx = index.Index("homo_sapiens", ["hgnc_symbol", "mgi_symbol"])
+ self.assertEqual(idx._converter_xml.count("Attribute name"), 3)
+ idx = index.Index("homo_sapiens", [])
+ self.assertEqual(idx._converter_xml.count("Attribute name"), 1)
+
+ def test_converter_xml_formats_genome_as_first_initial_plus_species(self):
+ idx = index.Index("homo_sapiens", ["hgnc_symbol", "mgi_symbol"])
+ self.assertIn("hsapiens", idx._converter_xml)
+ idx = index.Index("mus_musculus")
+ self.assertIn("mmusculus", idx._converter_xml)
+
+ def test_identify_gtf_file_should_return_correct_file(self):
+
+ files = [
+ "CHECKSUMS",
+ "Homo_sapiens.GRCh38.86.abinitio.gtf.gz",
+ "Homo_sapiens.GRCh38.86.chr.gtf.gz",
+ "Homo_sapiens.GRCh38.85.chr.gtf.gz",
+ "Homo_sapiens.GRCh38.86.chr_patch_hapl_scaff.gtf.gz",
+ "Homo_sapiens.GRCh38.86.gtf.gz",
+ "README",
+ ]
+ release_num = 86
+
+ filename = index.Index._identify_gtf_file(files, release_num)
+
+ self.assertEqual(filename, "Homo_sapiens.GRCh38.86.chr.gtf.gz")
+
+ def test_identify_gtf_file_should_throw_exception(self):
+
+ files = [
+ "CHECKSUMS",
+ "Homo_sapiens.GRCh38.86.abinitio.gtf.gz",
+ "Homo_sapiens.GRCh38.86.chr_patch_hapl_scaff.gtf.gz",
+ "Homo_sapiens.GRCh38.86.gtf.gz",
+ "README",
+ ]
+ release_num = 86
+
+ self.assertRaises(
+ FileNotFoundError,
+ index.Index._identify_gtf_file,
+ files=files,
+ release_num=release_num,
+ )
+
+ def test_can_login_to_ftp_ensembl(self):
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+
+ def test_download_converter_gets_output_and_is_pandas_loadable(self):
+ idx = index.Index("ciona_intestinalis", ["external_gene_name"])
+ filename = os.path.join(self.outdir, "ci.csv")
+ idx._download_conversion_file(filename)
+ converter = pd.read_csv(filename, index_col=0)
+ self.assertGreaterEqual(len(converter), 10)
+ self.assertEqual(converter.shape[1], 1)
+
+ def test_identify_newest_release_finds_a_release_which_is_gt_eq_85(self):
+ idx = index.Index("ciona_intestinalis", ["external_gene_name"])
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+ newest = idx._identify_newest_release(ftp)
+ self.assertGreaterEqual(int(newest), 85) # current=85, they only get bigger
+
+ def test_identify_genome_file_finds_primary_assembly_when_present(self):
+ idx = index.Index("homo_sapiens", ["entrezgene"])
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+ newest = idx._identify_newest_release(ftp)
+ ftp.cwd("/pub/release-%d/fasta/%s/dna" % (newest, idx.organism))
+ filename = idx._identify_genome_file(ftp.nlst())
+ self.assertIn("primary_assembly", filename)
+
+ def test_identify_genome_file_defaults_to_toplevel_when_no_primary_assembly(self):
+ idx = index.Index("ciona_intestinalis", ["entrezgene"])
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+ newest = idx._identify_newest_release(ftp)
+ ftp.cwd("/pub/release-%d/fasta/%s/dna" % (newest, idx.organism))
+ filename = idx._identify_genome_file(ftp.nlst())
+ self.assertIn("toplevel", filename)
+
+ def test_download_fasta_file_gets_a_properly_formatted_file(self):
+ idx = index.Index(
+ "ciona_intestinalis", ["external_gene_name"], index_folder_name=self.outdir
+ )
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+ filename = os.path.join(self.outdir, "ci.fa.gz")
+ idx._download_fasta_file(ftp, filename, ensemble_release=None)
+ with gzip.open(filename, "rt") as f:
+            # ">" is the first character of a fasta record header
+            self.assertEqual(f.readline()[0], ">")
+
+ def test_identify_annotation_file_finds_a_gtf_file(self):
+ idx = index.Index("ciona_intestinalis", ["external_gene_name"])
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+ newest = idx._identify_newest_release(ftp)
+ ftp.cwd("/pub/release-%d/gtf/%s/" % (newest, idx.organism))
+ filename = idx._identify_gtf_file(ftp.nlst(), newest)
+ self.assertIsNotNone(filename)
+
+ def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self):
+
+ idx = index.Index("ciona_intestinalis", ["entrezgene"])
+
+ with ftplib.FTP(host="ftp.ensembl.org") as ftp:
+ ftp.login()
+            filename = os.path.join(self.outdir, "ci.gtf.gz")
+ idx._download_gtf_file(ftp, filename, ensemble_release=99)
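+            # the release is pinned to 99 so the record asserted below stays
+            # stable (ensemble_release=None elsewhere appears to mean "newest")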
+
+ rd = gtf.Reader(filename)
+ (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next(
+ rd.iter_transcripts()
+ )
+
+ # (('1', '+', 17842), [['1', 'ensembl', 'exon', '1636', '1902', '.', '+', '.', 'gene_id "ENSCING00000017842"; gene_version "1"; transcript_id "ENSCINT00000030147"; transcript_version "1"; exon_number "1"; gene_name "RNaseP_nuc"; gene_source "ensembl"; gene_biotype "misc_RNA"; transcript_name "RNaseP_nuc-201"; transcript_source "ensembl"; transcript_biotype "misc_RNA"; exon_id "ENSCINE00000207263"; exon_version "1";\n']])
+ self.assertEqual(transcript_chromosome, "1")
+ self.assertEqual(transcript_strand, "+")
+ self.assertEqual(transcript_gene_id, 17842)
+ self.assertEqual(len(exons), 1)
+
+ def test_subset_genes_should_returns_original_if_no_additional_fields_or_valid_biotypes(
+ self,
+ ):
+
+ fasta_name = os.path.join(self.outdir, "ci.fa.gz")
+ gtf_name = os.path.join(self.outdir, "ci.gtf.gz")
+ conversion_name = os.path.join(self.outdir, "ci_ids.csv")
+
+ idx = index.Index("ciona_intestinalis", index_folder_name=self.outdir)
+
+ idx._download_ensembl_files(
+ ensemble_release=None,
+ fasta_name=fasta_name,
+ gtf_name=gtf_name,
+ conversion_name=conversion_name,
+ )
+ truncated_gtf = os.path.join(self.outdir, "test.gtf")
+ idx._subset_genes(conversion_name, gtf_name, truncated_gtf, valid_biotypes=None)
+
+        # expect the truncated gtf to contain the same records as the original
+ self.assertTrue(os.path.isfile(truncated_gtf))
+
+ # the current implementation of GTF Reader doesn't allow this:
+ # for gr1, gr2 in zip(gtf.Reader(gtf_name), gtf.Reader(truncated_gtf)):
+
+ records = []
+ for gr in gtf.Reader(gtf_name):
+ records.append(gtf.Record(gr))
+
+ for i, gr in enumerate(gtf.Reader(truncated_gtf)):
+ rec1 = records[i]
+ rec2 = gtf.Record(gr)
+ self.assertEqual(rec1, rec2)
+
+ def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(self):
+ organism = "ciona_intestinalis"
+ idx = index.Index(
+ organism, ["external_gene_name"], index_folder_name=self.outdir
+ )
+ idx._download_ensembl_files(ensemble_release=None)
+ self.assertTrue(
+ os.path.isfile(os.path.join(self.outdir, "%s.fa.gz" % organism)),
+ "fasta file not found",
+ )
+ self.assertTrue(
+ os.path.isfile(os.path.join(self.outdir, "%s.gtf.gz" % organism)),
+ "gtf file not found",
+ )
+ self.assertTrue(
+ os.path.isfile(os.path.join(self.outdir, "%s_ids.csv" % organism)),
+ "id file not found",
+ )
+
+ valid_biotypes = {"protein_coding", "lincRNA"}
+ idx._subset_genes(valid_biotypes=valid_biotypes)
+
+ self.assertTrue(
+ os.path.isfile(os.path.join(self.outdir, organism, "annotations.gtf"))
+ )
+ gr_subset = gtf.Reader(os.path.join(self.outdir, organism, "annotations.gtf"))
+ gr_complete = gtf.Reader(os.path.join(self.outdir, "%s.gtf.gz" % organism))
+ self.assertLess(
+ len(gr_subset),
+ len(gr_complete),
+ "Subset annotation was not smaller than the complete annotation",
+ )
+
+ # make sure only valid biotypes are returned
+ complete_invalid = False
+
+ for r in gr_complete:
+ record = gtf.Record(r)
+ if record.attribute("gene_biotype") not in valid_biotypes:
+ complete_invalid = True
+ break
+ self.assertTrue(complete_invalid)
+
+ subset_invalid = False
+ for r in gr_subset:
+ record = gtf.Record(r)
+ if record.attribute("gene_biotype") not in valid_biotypes:
+ subset_invalid = True
+ break
+ self.assertFalse(subset_invalid)
+ self.assertGreater(len(gr_subset), 0)
+
+ def test_create_star_index_produces_an_index(self):
+ organism = "ciona_intestinalis"
+ idx = index.Index(
+ organism, ["external_gene_name"], index_folder_name=self.outdir
+ )
+ idx._download_ensembl_files(ensemble_release=None)
+ idx._subset_genes()
+ idx._create_star_index()
+        expected_file = os.path.join(self.outdir, organism, "SAindex")
+ self.assertTrue(os.path.isfile(expected_file))
+
+
+#########################################################################################
+
+if __name__ == "__main__":
+ nose2.main()
+
+#########################################################################################
diff --git a/src/seqc/tests/test_run_e2e_local.py b/src/seqc/tests/test_run_e2e_local.py
new file mode 100644
index 0000000..0c7770a
--- /dev/null
+++ b/src/seqc/tests/test_run_e2e_local.py
@@ -0,0 +1,130 @@
+import unittest
+import os
+import uuid
+import shutil
+import subprocess
+import re
+from nose2.tools import params
+from seqc.core import main
+from test_dataset import dataset_local, dataset_s3
+
+
+def get_output_file_list(test_id, test_folder):
+
+ proc = subprocess.Popen(
+ ["find", test_folder, "-type", "f"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ stdout, _ = proc.communicate()
+ files = stdout.decode().splitlines()
+
+ # extract only filenames (i.e. remove directory hierarchy)
+ # convert to a set for easy comparison
+ files = set(map(lambda filename: filename.replace(test_folder + "/", ""), files))
+
+ return files
+
+
+def expected_output_files(file_prefix):
+
+ files = set(
+ [
+ f"{file_prefix}.h5",
+ f"{file_prefix}_alignment_summary.txt",
+ f"{file_prefix}_cell_filters.png",
+ f"{file_prefix}_de_gene_list.txt",
+ f"{file_prefix}_dense.csv",
+ f"{file_prefix}_merged.fastq.gz",
+ f"{file_prefix}_mini_summary.json",
+ f"{file_prefix}_mini_summary.pdf",
+ f"{file_prefix}_seqc_log.txt",
+ f"{file_prefix}_sparse_counts_barcodes.csv",
+ f"{file_prefix}_sparse_counts_genes.csv",
+ f"{file_prefix}_sparse_molecule_counts.mtx",
+ f"{file_prefix}_sparse_read_counts.mtx",
+ f"{file_prefix}_summary.tar.gz",
+ f"{file_prefix}_Aligned.out.bam",
+ ]
+ )
+
+ return files
+
+
+class TestRunLocal(unittest.TestCase):
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.path_temp = os.path.join(
+            os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4())
+        )
+        os.makedirs(self.path_temp, exist_ok=True)
+ with open("seqc_log.txt", "wt") as f:
+ f.write("Dummy log.\n")
+ f.write("nose2 captures input, so no log is produced.\n")
+ f.write("This causes pipeline errors.\n")
+
+    def tearDown(self):
+ if os.path.isdir(self.path_temp):
+ shutil.rmtree(self.path_temp, ignore_errors=True)
+
+ def test_using_dataset_in_s3(self, platform="ten_x_v2"):
+ # must NOT end with a slash
+ file_prefix = "test"
+ output_prefix = os.path.join(self.path_temp, file_prefix)
+
+ params = [
+ ("run", platform),
+ ("--local",),
+ ("--output-prefix", output_prefix),
+ ("--index", dataset_s3.index),
+ ("--barcode-files", dataset_s3.barcodes % platform),
+ ("--barcode-fastq", dataset_s3.barcode_fastq % platform),
+ ("--genomic-fastq", dataset_s3.genomic_fastq % platform),
+ ("--star-args", "runRNGseed=0"),
+ ]
+
+ argv = [element for tupl in params for element in tupl]
+
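+        # platforms other than drop_seq use predetermined barcode whitelists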
+ if platform != "drop_seq":
+ argv += ["--barcode-files", dataset_s3.barcodes % platform]
+
+ main.main(argv)
+
+ # get output file list
+ files = get_output_file_list(self.test_id, self.path_temp)
+
+ # check if each expected file is found in the list of files generated
+ for file in expected_output_files(file_prefix):
+ self.assertIn(file, files)
+
+ def test_using_local_dataset(self, platform="ten_x_v2"):
+ # must NOT end with a slash
+ file_prefix = "test"
+ output_prefix = os.path.join(self.path_temp, file_prefix)
+
+ params = [
+ ("run", platform),
+ ("--local",),
+ ("--output-prefix", output_prefix),
+ ("--index", dataset_local.index),
+ ("--barcode-files", dataset_local.barcodes % platform),
+ ("--barcode-fastq", dataset_local.barcode_fastq % platform),
+ ("--genomic-fastq", dataset_local.genomic_fastq % platform),
+ ("--star-args", "runRNGseed=0"),
+ ]
+
+ argv = [element for tupl in params for element in tupl]
+
+ if platform != "drop_seq":
+ argv += ["--barcode-files", dataset_local.barcodes % platform]
+
+ main.main(argv)
+
+ # get output file list
+ files = get_output_file_list(self.test_id, self.path_temp)
+
+ # check if each expected file is found in the list of files generated
+ for file in expected_output_files(file_prefix):
+ self.assertIn(file, files)
diff --git a/src/seqc/tests/test_run_e2e_remote.py b/src/seqc/tests/test_run_e2e_remote.py
new file mode 100644
index 0000000..f2d9bfd
--- /dev/null
+++ b/src/seqc/tests/test_run_e2e_remote.py
@@ -0,0 +1,269 @@
+import unittest
+import os
+import uuid
+import shutil
+import re
+from seqc.core import main
+from seqc import io
+import boto3
+from nose2.tools import params
+from test_dataset import dataset_s3
+
+
+def get_instance_by_test_id(test_id):
+
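+    # instances are located via the TestID tag that each test attaches with
+    # --user-tags (see the test methods below)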
+ ec2 = boto3.resource("ec2")
+ instances = ec2.instances.filter(
+ Filters=[{"Name": "tag:TestID", "Values": [test_id]}]
+ )
+ instances = list(instances)
+
+ if len(instances) != 1:
+ raise Exception("Test ID is not found or not unique!")
+
+ return instances[0]
+
+
+def expected_output_files(output_prefix):
+
+ files = set(
+ [
+ f"{output_prefix}.h5",
+ f"{output_prefix}_Aligned.out.bam",
+ f"{output_prefix}_alignment_summary.txt",
+ f"{output_prefix}_cell_filters.png",
+ f"{output_prefix}_de_gene_list.txt",
+ f"{output_prefix}_dense.csv",
+ f"{output_prefix}_merged.fastq.gz",
+ f"{output_prefix}_mini_summary.json",
+ f"{output_prefix}_mini_summary.pdf",
+ f"{output_prefix}_seqc_log.txt",
+ f"{output_prefix}_sparse_counts_barcodes.csv",
+ f"{output_prefix}_sparse_counts_genes.csv",
+ f"{output_prefix}_sparse_molecule_counts.mtx",
+ f"{output_prefix}_sparse_read_counts.mtx",
+ f"{output_prefix}_summary.tar.gz",
+ f"seqc_log.txt",
+ ]
+ )
+
+ return files
+
+
+def expected_output_files_run_from_merged(output_prefix):
+
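+    # starting from a merged fastq skips the merge stage, so no merged output
+    # is produced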
+ files = expected_output_files(output_prefix)
+
+ excludes = set([f"{output_prefix}_merged.fastq.gz"])
+
+ return files - excludes
+
+
+def expected_output_files_run_from_bam(output_prefix):
+
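+    # starting from an aligned BAM also skips alignment, so neither the BAM
+    # nor the alignment summary is produced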
+ files = expected_output_files(output_prefix)
+
+ excludes = set(
+ [
+ f"{output_prefix}_Aligned.out.bam",
+ f"{output_prefix}_alignment_summary.txt",
+ f"{output_prefix}_merged.fastq.gz",
+ ]
+ )
+
+ return files - excludes
+
+
+def get_output_file_list(test_id, s3_bucket, test_folder):
+
+ # get instance and wait until terminated
+ instance = get_instance_by_test_id(test_id)
+ instance.wait_until_terminated()
+
+ # check files generated in S3
+ files = io.S3.listdir(s3_bucket, test_folder)
+
+ # extract only filenames (i.e. remove directory hierarchy)
+ # convert to a set for easy comparison
+ files = set(map(lambda filename: filename.replace(test_folder, ""), files))
+
+ return files
+
+
+def check_for_success_msg(s3_seqc_log_uri, path_temp):
+
+ # download seqc_log.txt
+ io.S3.download(
+ link=s3_seqc_log_uri, prefix=path_temp, overwrite=True, recursive=False
+ )
+
+ # check if seqc_log.txt has a successful message
+ with open(os.path.join(path_temp, "seqc_log.txt"), "rt") as fin:
+ logs = fin.read()
+ match = re.search(r"Execution completed successfully", logs, re.MULTILINE)
+
+    return match is not None
+
+
+class TestRunRemote(unittest.TestCase):
+
+ email = os.environ["SEQC_TEST_EMAIL"]
+ rsa_key = os.environ["SEQC_TEST_RSA_KEY"]
+ ami_id = os.environ["SEQC_TEST_AMI_ID"]
+
+ s3_bucket = "dp-lab-cicd"
+
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.path_temp = os.path.join(
+            os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4())
+        )
+        os.makedirs(self.path_temp, exist_ok=True)
+
+    def tearDown(self):
+ if os.path.isdir(self.path_temp):
+ shutil.rmtree(self.path_temp, ignore_errors=True)
+
+ @params("in_drop_v2", "ten_x_v2")
+ def test_remote_from_raw_fastq(self, platform="ten_x_v2"):
+ output_prefix = "from-raw-fastq"
+ # must end with a slash
+ test_folder = f"seqc/run-{platform}-{self.test_id}/"
+
+ params = [
+ ("run", platform),
+ ("--output-prefix", "from-raw-fastq"),
+ ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"),
+ ("--index", dataset_s3.index),
+ ("--email", self.email),
+ ("--barcode-fastq", dataset_s3.barcode_fastq % platform),
+ ("--genomic-fastq", dataset_s3.genomic_fastq % platform),
+ ("--instance-type", "r5.2xlarge"),
+ ("--spot-bid", "1.0"),
+ ("--rsa-key", self.rsa_key),
+ ("--debug",),
+ ("--remote-update",),
+ ("--ami-id", self.ami_id),
+ ("--user-tags", f"TestID:{self.test_id}"),
+ ]
+
+ argv = [element for tupl in params for element in tupl]
+
+ if platform != "drop_seq":
+ argv += ["--barcode-files", dataset_s3.barcodes % platform]
+
+ main.main(argv)
+
+        # wait for the instance to terminate, then list the files uploaded to S3
+ files = get_output_file_list(self.test_id, self.s3_bucket, test_folder)
+
+ # check for the exact same filenames
+ self.assertSetEqual(files, expected_output_files(output_prefix))
+
+ # check for success message in seqc_log.txt
+ has_success_msg = check_for_success_msg(
+ s3_seqc_log_uri="s3://{}/{}".format(
+ self.s3_bucket, os.path.join(test_folder, "seqc_log.txt")
+ ),
+ path_temp=self.path_temp,
+ )
+
+ self.assertTrue(
+ has_success_msg, msg="Unable to find the success message in the log"
+ )
+
+ def test_remote_from_merged(self, platform="in_drop_v2"):
+ output_prefix = "from-merged"
+ # must end with a slash
+ test_folder = f"seqc/run-{platform}-{self.test_id}/"
+
+ params = [
+ ("run", platform),
+ ("--output-prefix", output_prefix),
+ ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"),
+ ("--index", dataset_s3.index),
+ ("--email", self.email),
+ ("--merged-fastq", dataset_s3.merged_fastq % (platform, platform)),
+ ("--rsa-key", self.rsa_key),
+ ("--instance-type", "r5.2xlarge"),
+ ("--ami-id", self.ami_id),
+ ("--remote-update",),
+ ("--user-tags", f"TestID:{self.test_id}")
+ # ('--spot-bid', '1.0')
+ ]
+
+ argv = [element for tupl in params for element in tupl]
+
+ if platform != "drop_seq":
+ argv += ["--barcode-files", dataset_s3.barcodes % platform]
+
+ main.main(argv)
+
+        # wait for the instance to terminate, then list the files uploaded to S3
+ files = get_output_file_list(self.test_id, self.s3_bucket, test_folder)
+
+ # check for the exact same filenames
+ self.assertSetEqual(files, expected_output_files_run_from_merged(output_prefix))
+
+ # check for success message in seqc_log.txt
+ has_success_msg = check_for_success_msg(
+ s3_seqc_log_uri="s3://{}/{}".format(
+ self.s3_bucket, os.path.join(test_folder, "seqc_log.txt")
+ ),
+ path_temp=self.path_temp,
+ )
+
+ self.assertTrue(
+ has_success_msg, msg="Unable to find the success message in the log"
+ )
+
+ def test_remote_from_bamfile(self, platform="in_drop_v2"):
+ output_prefix = "from-bamfile"
+ # must end with a slash
+ test_folder = f"seqc/run-{platform}-{self.test_id}/"
+
+ params = [
+ ("run", platform),
+ ("--output-prefix", output_prefix),
+ ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"),
+ ("--index", dataset_s3.index),
+ ("--email", self.email),
+ ("--alignment-file", dataset_s3.bam % platform),
+ ("--rsa-key", self.rsa_key),
+ ("--instance-type", "r5.2xlarge"),
+ ("--debug",),
+ ("--ami-id", self.ami_id),
+ ("--remote-update",),
+ ("--user-tags", f"TestID:{self.test_id}")
+ # ('--spot-bid', '1.0')
+ ]
+
+ argv = [element for tupl in params for element in tupl]
+
+ if platform != "drop_seq":
+ argv += ["--barcode-files", dataset_s3.barcodes % platform]
+
+ main.main(argv)
+
+        # wait for the instance to terminate, then list the files uploaded to S3
+ files = get_output_file_list(self.test_id, self.s3_bucket, test_folder)
+
+ # check for the exact same filenames
+ self.assertSetEqual(files, expected_output_files_run_from_bam(output_prefix))
+
+ # check for success message in seqc_log.txt
+ has_success_msg = check_for_success_msg(
+ s3_seqc_log_uri="s3://{}/{}".format(
+ self.s3_bucket, os.path.join(test_folder, "seqc_log.txt")
+ ),
+ path_temp=self.path_temp,
+ )
+
+ self.assertTrue(
+ has_success_msg, msg="Unable to find the success message in the log"
+ )
diff --git a/src/seqc/tests/test_run_gtf.py b/src/seqc/tests/test_run_gtf.py
new file mode 100644
index 0000000..78c5c7f
--- /dev/null
+++ b/src/seqc/tests/test_run_gtf.py
@@ -0,0 +1,66 @@
+from unittest import TestCase, mock
+import os
+import uuid
+import shutil
+import nose2
+from seqc.sequence import gtf
+from test_dataset import dataset_local
+
+
+class TestGtf(TestCase):
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.path_temp = os.path.join(
+            os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4())
+        )
+        self.annotation = os.path.join(dataset_local.index, "annotations.gtf")
+
+    def tearDown(self):
+ if os.path.isdir(self.path_temp):
+ shutil.rmtree(self.path_temp, ignore_errors=True)
+
+ def test_construct_translator(self):
+ translator = gtf.GeneIntervals(self.annotation)
+ self.assertIsNotNone(translator)
+
+ def test_num_of_transcripts(self):
+ rd = gtf.Reader(self.annotation)
+ num_transcripts = sum(1 for _ in rd.iter_transcripts())
+ # awk -F'\t' '$3=="transcript" { print $0 }' annotations.gtf | wc -l
+ self.assertEqual(num_transcripts, 12747)
+
+ def test_iter_transcripts(self):
+ rd = gtf.Reader(self.annotation)
+ (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next(
+ rd.iter_transcripts()
+ )
+
+        # this should give us the 3 exons of the first transcript of the first
+        # gene, in reverse order:
+ #
+ # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8";
+ # chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
+ # chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
+ # chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
+ # chr19 HAVANA exon 60951 61894 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
+
+ self.assertEqual(transcript_chromosome, "chr19")
+ self.assertEqual(transcript_strand, "-")
+ self.assertEqual(transcript_gene_id, 282458)
+ self.assertEqual(len(exons), 3)
+
+        # field index 8 (the GTF attribute column) contains the exon ID
+ self.assertIn("ENSE00003783010.1", exons[0][8]) # exon number 3
+ self.assertIn("ENSE00003783498.1", exons[1][8]) # exon number 2
+ self.assertIn("ENSE00003781173.1", exons[2][8]) # exon number 1
+
+ def test_translate(self):
+ translator = gtf.GeneIntervals(self.annotation)
+ # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8";
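+        # translate() maps (chromosome, strand, position) to the integer gene
+        # id, i.e. ENSG00000282458.1 with the prefix and version stripped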
+ gene_id = translator.translate("chr19", "-", 60951)
+ self.assertEqual(gene_id, 282458)
+
+
+if __name__ == "__main__":
+ nose2.main()
diff --git a/src/seqc/tests/test_run_readarray.py b/src/seqc/tests/test_run_readarray.py
new file mode 100644
index 0000000..417278d
--- /dev/null
+++ b/src/seqc/tests/test_run_readarray.py
@@ -0,0 +1,65 @@
+from unittest import TestCase, mock
+import os
+import uuid
+import shutil
+import nose2
+from test_dataset import dataset_local
+from seqc.sequence.encodings import DNA3Bit
+from seqc.read_array import ReadArray
+from seqc.sequence import gtf
+
+
+class TestReadArray(TestCase):
+    def setUp(self):
+        self.test_id = str(uuid.uuid4())
+        self.path_temp = os.path.join(
+            os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4())
+        )
+        self.annotation = os.path.join(dataset_local.index, "annotations.gtf")
+        self.translator = gtf.GeneIntervals(self.annotation, 10000)
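+        # (10000 above is the max transcript length for interval construction)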
+
+    def tearDown(self):
+ if os.path.isdir(self.path_temp):
+ shutil.rmtree(self.path_temp, ignore_errors=True)
+
+ def test_read_array_creation(self, platform="ten_x_v2"):
+ ra, _ = ReadArray.from_alignment_file(
+ dataset_local.bam % platform, self.translator, required_poly_t=0
+ )
+ self.assertIsNotNone(ra)
+
+ def test_read_array_rmt_decode_10x_v2(self):
+ platform = "ten_x_v2"
+
+ # create a readarray
+ ra, _ = ReadArray.from_alignment_file(
+ dataset_local.bam % platform, self.translator, required_poly_t=0
+ )
+
+ # see if we can decode numeric UMI back to nucleotide sequence
+ dna3bit = DNA3Bit()
+ for rmt in ra.data["rmt"]:
+ decoded = dna3bit.decode(rmt).decode()
+ # ten_x_v2 UMI length = 10 nt
+ self.assertEqual(len(decoded), 10)
+
+ def test_read_array_rmt_decode_10x_v3(self):
+ platform = "ten_x_v3"
+
+ # create a readarray
+ ra, _ = ReadArray.from_alignment_file(
+ dataset_local.bam % platform, self.translator, required_poly_t=0
+ )
+
+ # see if we can decode numeric UMI back to nucleotide sequence
+ dna3bit = DNA3Bit()
+ for rmt in ra.data["rmt"]:
+ decoded = dna3bit.decode(rmt).decode()
+ # ten_x_v3 UMI length = 12 nt
+ self.assertEqual(len(decoded), 12)
+
+
+if __name__ == "__main__":
+ nose2.main()
diff --git a/src/seqc/tests/test_run_rmt_correction.py b/src/seqc/tests/test_run_rmt_correction.py
new file mode 100644
index 0000000..294fcb1
--- /dev/null
+++ b/src/seqc/tests/test_run_rmt_correction.py
@@ -0,0 +1,97 @@
+from unittest import TestCase, mock
+import nose2
+import os
+import numpy as np
+from seqc.read_array import ReadArray
+from seqc import rmt_correction
+
+
+class TestRmtCorrection(TestCase):
+    def setUp(self):
+ # pre-allocate arrays
+ n_barcodes = 183416337
+ data = np.recarray((n_barcodes,), ReadArray._dtype)
+ genes = np.zeros(n_barcodes, dtype=np.int32)
+ positions = np.zeros(n_barcodes, dtype=np.int32)
+ self.ra = ReadArray(data, genes, positions)
+
+    def tearDown(self):
+ pass
+
+ def test_should_return_correct_ra_size(self):
+
+ ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes
+
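+        # 4,768,824,762 bytes = 183,416,337 records x 26 bytes each
+        # (two int32 arrays plus an 18-byte ReadArray._dtype record,
+        # as implied by the asserted total)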
+ self.assertEqual(4768824762, ra_size)
+
+ # 64GB
+ @mock.patch(
+ "seqc.rmt_correction._get_available_memory", return_value=50 * 1024 ** 3
+ )
+ def test_should_return_correct_max_workers(self, mock_mem):
+
+ n_workers = rmt_correction._calc_max_workers(self.ra)
+
+ self.assertEqual(n_workers, 7)
+
+    # ~1 TiB of available memory
+ @mock.patch("seqc.rmt_correction._get_available_memory", return_value=1079354630144)
+ def test_should_return_correct_max_workers2(self, mock_mem):
+
+ n_workers = rmt_correction._calc_max_workers(self.ra)
+
+ self.assertEqual(n_workers, 156)
+
+    # available memory smaller than the read array itself
+ @mock.patch("seqc.rmt_correction._get_available_memory")
+ def test_should_return_one_if_ra_larger_than_mem(self, mock_mem):
+
+ ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes
+
+ # assume the available memory is a half of ra
+ mock_mem.return_value = int(ra_size) / 2
+
+ n_workers = rmt_correction._calc_max_workers(self.ra)
+
+ self.assertEqual(n_workers, 1)
+
+
+class TestRmtCorrection2(TestCase):
+    def setUp(self):
+ # pre-allocate arrays
+ n_barcodes = 183416337
+ data = np.recarray((n_barcodes,), ReadArray._dtype)
+ genes = np.zeros(n_barcodes, dtype=np.int32)
+ positions = np.zeros(n_barcodes, dtype=np.int32)
+ self.ra = ReadArray(data, genes, positions)
+
+ import pickle
+
+ with open("pre-correction-ra.pickle", "wb") as fout:
+ pickle.dump(self.ra, fout)
+
+    def tearDown(self):
+        try:
+            os.remove("pre-correction-ra.pickle")
+        except FileNotFoundError:
+            pass
+
+ @mock.patch("seqc.rmt_correction._correct_errors_by_cell_group", return_value=0)
+ def test_correct_errors_by_chunks(self, mock_correct):
+ cell_group = [1, 2, 3]
+ x = rmt_correction._correct_errors_by_cell_group_chunks(
+ self.ra, cell_group, 0.02, 0.05
+ )
+ mock_correct.assert_called()
+        self.assertEqual(len(cell_group), mock_correct.call_count)
+        self.assertEqual([0, 0, 0], x)
+
+
+if __name__ == "__main__":
+ nose2.main()
diff --git a/src/seqc/version.py b/src/seqc/version.py
index fe404ae..01ef120 100644
--- a/src/seqc/version.py
+++ b/src/seqc/version.py
@@ -1 +1 @@
-__version__ = "0.2.5"
+__version__ = "0.2.6"