vxtractor_fastq.py

#!/usr/bin/env python3

import os
import sys
import glob
import shutil
import argparse
import subprocess
import multiprocessing
import unittest
import logging
import pytest
import re

from collections import namedtuple
from tqdm import tqdm
from Bio.Data import IUPACData
from Bio.SeqUtils import nt_search
from Bio.SearchIO.HmmerIO import Hmmer3TabParser
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
import pyfastx
from pyfastxcli import fastx_format_check

# Module metadata strings.
__description__ = "A tool for extracting 16S rRNA gene variable regions from FASTQ files guided by alignment positions."
__author__ = "Connor Morgan-Lang, Koonkie Cloud Services"


# Default primer sequences keyed by primer name; the sequences include IUPAC
# ambiguity codes (e.g. Y, M, N, V, W) in addition to A/C/G/T.
# NOTE(review): presumably "515f" is the forward and "806r" the reverse primer - confirm.
_DEFAULT_PRIMERS = {"515f": "GTGYCAGCMGCCGCGGTAA",
                    "806r": "GGACTACNVGGGTWTCTAAT"}
# Lightweight record with 'start' and 'stop' fields describing a coordinate range.
_COORD_RANGE = namedtuple("coord_range", ["start", "stop"])
# Matches strings consisting only of the upper-case, unambiguous nucleotides A, C, G and T.
nuc_alphabet = re.compile(r"^[ACGT]+$")


class FastqExtractorTester(unittest.TestCase):
    """Tests for the extraction pipeline, run against the files under ``test_data/``.

    ``setUp`` creates one directory per sample inside ``test_data/`` and copies
    FASTQ fixtures into it; ``tearDown`` removes everything ``setUp`` created.
    """

    def setUp(self) -> None:
        """Create the output/temporary directories and the per-sample FASTQ directories."""
        as_one = AmpliconSample(sample_name="SRR4011269", fwd="CCTACGGGAGGCAGCAG", rev="CCGTCAATTCMTTTRAGT")
        as_two = AmpliconSample(sample_name="mock_sample_two", fwd="", rev="")
        as_three = AmpliconSample(sample_name="mock_sample_three", fwd="", rev="")
        self.mock_pe_amplicon_samples = [as_one, as_two, as_three]
        self.se_sample = AmpliconSample(sample_name="ERR3339670", fwd="GTGCCAGCMGCCGCGGTAA", rev="GGACTACHVGGGTWTCTAAT")
        self.fail_as = AmpliconSample(sample_name="ERR2105436", fwd="GTGCCAGCMGCCGCGGTAA", rev="GGACTACHVGGGTWTCTAAT")
        self.pyro_as = AmpliconSample(sample_name="8A", fwd="ACTCCTACGGGAGGCAGCAG", rev="CCGTCAATTCMTTTGAGTTT")
        self.test_data_dir = "test_data" + os.sep
        self.output_dir = "./vxtractor_fastq_out"
        self.tmp_dir = "./vxtractor_fastq_tmp"
        self.trim_out = self.test_data_dir + "primer_test_trimmed.fq"
        self.hmms_dir = os.path.join(self.test_data_dir, "HMMs", "bacteria")

        for dir_path in [self.output_dir, self.tmp_dir]:
            if not os.path.isdir(dir_path):
                os.mkdir(dir_path)

        # Make the directories
        for _sample in self.mock_pe_amplicon_samples:  # type: AmpliconSample
            if not os.path.isdir(self.test_data_dir + _sample.name):
                os.mkdir(self.test_data_dir + _sample.name)
            shutil.copyfile(self.test_data_dir + "Chen2016_1_1.fastq",
                            os.path.join(self.test_data_dir, _sample.name, "mock_R1.fq"))
            shutil.copyfile(self.test_data_dir + "Chen2016_1_2.fastq",
                            os.path.join(self.test_data_dir, _sample.name, "mock_R2.fq"))
        for _sample in [self.se_sample, self.fail_as, self.pyro_as]:
            if not os.path.isdir(os.path.join(self.test_data_dir, _sample.name)):
                os.mkdir(os.path.join(self.test_data_dir, _sample.name))
            shutil.copyfile(os.path.join(self.test_data_dir, _sample.name + ".fastq"),
                            os.path.join(self.test_data_dir, _sample.name, _sample.name + ".fastq"))

        return

    def tearDown(self) -> None:
        """Remove every directory and file created by ``setUp`` or by the tests."""
        for dir_path in [self.output_dir, self.tmp_dir]:
            if os.path.isdir(dir_path):
                shutil.rmtree(dir_path)
        if os.path.isfile(self.trim_out):
            os.remove(self.trim_out)

        # Clean up the directory of every sample setUp populated, including the
        # single-file samples (fail_as, pyro_as) that were previously left behind
        single_file_samples = [self.se_sample, self.fail_as, self.pyro_as]
        for _sample in self.mock_pe_amplicon_samples + single_file_samples:  # type: AmpliconSample
            sample_dir = os.path.join(self.test_data_dir, _sample.name)
            if os.path.isdir(sample_dir):
                shutil.rmtree(sample_dir)
        return

    def test_validate_arguments(self):
        cli_args = ["--primer_map", self.test_data_dir + "example_primer_map.txt",
                    "--fastq_path", self.test_data_dir,
                    "--extraction_guide", "infernal",
                    "--cmfile", self.test_data_dir + "SSU_rRNA_bacteria.cm",
                    "--output_dir", self.tmp_dir,
                    "--threads", str(2)]
        args = get_options(cli_args)
        validate_arguments(args)
        self.assertTrue(os.path.isfile(os.path.join(self.tmp_dir, "515_806_primers.fasta")))

        # Test without providing cmfile for Infernal extraction
        args.cmfile = None
        with pytest.raises(AssertionError):
            validate_arguments(args)

        # Test bad tool name
        args.extraction_guide = "vxtract"
        with pytest.raises(ValueError):
            validate_arguments(args)
        return

    def test_rc_primers(self):
        mock_sample = self.mock_pe_amplicon_samples[0]
        rc_fwd, rc_rev = mock_sample.rc_primers()
        self.assertEqual(len(mock_sample.fwd_primer), len(rc_fwd))
        self.assertEqual(len(mock_sample.rev_primer), len(rc_rev))
        self.assertEqual(("CTGCTGCCTCCCGTAGG", "ACTYAAAKGAATTGACGG"), (rc_fwd, rc_rev))
        return

    def test_validate_executables(self):
        retcode = validate_executables()
        self.assertEqual(0, retcode)
        retcode = validate_executables(extractor="vxtractor")
        self.assertEqual(0, retcode)
        return

    def test_validate_hmm_dir(self):
        with pytest.raises(AssertionError):
            validate_hmm_dir(self.test_data_dir + "HMMs")
        validate_hmm_dir(self.hmms_dir)
        return

    def test_read_name_match(self):
        self.assertTrue(read_name_match("R0235093:66:000000000-A4C50:1:1101:17880:1893/1",
                                        "R0235093:66:000000000-A4C50:1:1101:17880:1893/2"))
        self.assertTrue(read_name_match("SRR12345_1 read_1 N033:1234/1", "SRR12345_1 read_1 N033:1234/2"))
        self.assertFalse(read_name_match("SRR12345_1", "SRR12345_11"))
        return

    def test_align_primers(self):
        chen_as = AmpliconSample(sample_name="SRR4011269",
                                 fwd="TGGGGAATATTGCACAA", rev="ACGGCAAGCCAGA")
        primer_counts = chen_as.align_primers(self.test_data_dir + "primer_test.fq")
        self.assertEqual(2, primer_counts["reverse_rc"])
        self.assertEqual(2, primer_counts["forward"])
        self.assertTrue(primer_counts["reverse"] == primer_counts["forward_rc"] == 1)
        return

    def test_fastx_summarize(self):
        # Test a FASTQ
        fx = Fastx(fastx_path=self.test_data_dir + "primer_test.fq")
        fx.summarize()
        self.assertEqual(4, fx.num_seqs())
        return

    def test_check_pairing(self):
        mock_sample = self.mock_pe_amplicon_samples[0]  # type: AmpliconSample
        if not mock_sample.raw_fwd and not mock_sample.raw_rev:
            mock_sample.fetch_raw_fastq(self.test_data_dir)
        mock_sample.check_pairing(self.tmp_dir)
        self.assertFalse(mock_sample.repaired)
        return

    def test_filter_reads(self):
        mock_sample = self.mock_pe_amplicon_samples[0]
        if not mock_sample.raw_fwd and not mock_sample.raw_rev:
            mock_sample.fetch_raw_fastq(self.test_data_dir)
        # The forward and reverse outputs must be distinct files, otherwise the
        # line-count comparison at the end of this test is trivially true
        mock_sample.filtered_fwd = os.path.join(self.tmp_dir, mock_sample.name + "_filtered_R1.fastq")
        mock_sample.filtered_rev = os.path.join(self.tmp_dir, mock_sample.name + "_filtered_R2.fastq")
        with pytest.raises(AssertionError):
            fq1, fq2 = mock_sample.collect_fastq_by_stage("repaired")
            filter_reads(fwd_in=fq1, filtered_fwd=mock_sample.filtered_fwd, min_seq_length=100,
                         rev_in=fq2, filtered_rev=mock_sample.filtered_rev)
        fq1, fq2 = mock_sample.collect_fastq_by_stage("raw")
        filtered_dict = filter_reads(fwd_in=fq1, filtered_fwd=mock_sample.filtered_fwd, min_seq_length=200,
                                     rev_in=fq2, filtered_rev=mock_sample.filtered_rev)
        read_pairs, fastqs = filtered_dict.popitem()
        pairs_pass, pairs_fail = read_pairs
        self.assertEqual(351, pairs_fail)
        self.assertTrue(os.path.isfile(mock_sample.filtered_fwd))
        self.assertTrue(os.path.isfile(mock_sample.filtered_rev))

        # Ensure the number of lines in both Fastq files are identical
        with open(mock_sample.filtered_fwd) as fh:
            n_fwd = len(fh.readlines())
        with open(mock_sample.filtered_rev) as fh:
            n_rev = len(fh.readlines())
        self.assertEqual(n_fwd, n_rev)
        return

    def test_fq2fa(self):
        tmp_fa = self.tmp_dir + os.sep + "test_TarA.1.fa"
        fq2fa(fastq=self.test_data_dir + "test_TarA.1.fq", fa=tmp_fa)
        fx = Fastx(tmp_fa)
        fx.summarize()
        self.assertEqual(12, fx.num_seqs())
        return

    def test_cmsearch_wrapper(self):
        output_file = self.tmp_dir + os.sep + "test_SSU_cmsearch.tbl"
        cmsearch_cmd = cmsearch_command_generator(exec_path=which("cmsearch"),
                                                  fasta_file=self.test_data_dir + "test_SSU_rRNA.fa",
                                                  output_tbl=output_file,
                                                  cov_model=self.test_data_dir + "SSU_rRNA_bacteria.cm",
                                                  threads=2,
                                                  hmm_only=True)
        launch_system_command(cmsearch_cmd)
        self.assertTrue(os.path.isfile(output_file))
        with open(output_file) as tbl:
            self.assertEqual(2511, len(tbl.readlines()))
        return

    def test_get_model_primer_coords(self):
        primer_fa = self.test_data_dir + "515_806_primers.fasta"
        primers_range = get_model_primer_coords(cmsearch=which("cmsearch"),
                                                primers_fa=primer_fa,
                                                cov_model=self.test_data_dir + "SSU_rRNA_bacteria.cm")
        self.assertEqual(523, primers_range.start)
        self.assertEqual(776, primers_range.stop)
        return

    def test_fetch_raw_fastq(self):
        mock_sample = self.mock_pe_amplicon_samples[0]
        mock_sample.fetch_raw_fastq(self.test_data_dir)
        self.assertEqual(self.test_data_dir + "SRR4011269/mock_R1.fq", mock_sample.raw_fwd)
        self.assertEqual(self.test_data_dir + "SRR4011269/mock_R2.fq", mock_sample.raw_rev)
        with pytest.raises(AssertionError):
            AmpliconSample("SRR3581236", fwd="T", rev="A").fetch_raw_fastq(self.test_data_dir)
        return

    def test_read_vxtracted(self):
        mock_sample = self.mock_pe_amplicon_samples[0]
        mock_sample.var_pos_tbl = self.test_data_dir + "SRR3581236_vxtractor.csv"
        mock_sample.read_vxtractor_coords(start_name="V3rightlong", end_name="V5leftlong")

        mock_sample.var_pos_tbl = self.test_data_dir + "test_V4.csv"
        variable_positions = mock_sample.read_vxtractor_coords(start_name="V3rightlong", end_name="V5leftlong")
        self.assertEqual(100, len(variable_positions))
        self.assertEqual((440, 693), variable_positions["AF418950.1.1453"])
        self.assertEqual((501, 753), variable_positions["AY005047.1.1486"])
        return

    def test_trim_fq(self):

        trim_fq(fq_in=self.test_data_dir + "primer_test.fq",
                fq_out=self.trim_out,
                trim_positions={"Seq1_fwd_rev_pos": (1, 10),
                                "Seq2_fwd-rc_rev-rc_pos": (11, 20),
                                "Seq3_fwd_pos": (1, 10),
                                "Seq4_rev-rc_pos": (1, 10)})
        for name, seq, qual in Fastx(self.trim_out).fx_handler:
            if name == "Seq1_fwd_rev_pos":
                self.assertEqual("GGGGAATAT", seq)
        return

    def test_read_fastq_seq_names(self):
        seq_names = read_fastq_seq_names(os.path.join("test_data", "Chen2016_1_1.fastq"), 50)
        self.assertEqual(50, len(seq_names))

        seq_names = read_fastq_seq_names(os.path.join("test_data", "Chen2016_1_1.fastq"))
        self.assertEqual(2500, len(seq_names))
        return

    def test_repair_wrapper(self):
        sam = self.mock_pe_amplicon_samples[0]
        sam.fetch_raw_fastq(self.test_data_dir)
        sam.repaired_fwd = os.path.join(self.tmp_dir, sam.name + "_repaired_R1.fastq")
        sam.repaired_rev = os.path.join(self.tmp_dir, sam.name + "_repaired_R2.fastq")
        repair_wrapper(fwd_reads=sam.raw_fwd,
                       rev_reads=sam.raw_rev,
                       repaired_fwd=sam.repaired_fwd,
                       repaired_rev=sam.repaired_rev)
        self.assertTrue(os.path.isfile(sam.repaired_fwd))
        self.assertTrue(os.path.isfile(sam.repaired_rev))
        self.assertEqual(2500, len(read_fastq_seq_names(sam.repaired_fwd)))
        return

    def test_vxtractor_command_generator(self):
        try:
            vxtractor_exe_path = which("vxtractor.pl")
        except SystemError:
            # Report the test as skipped rather than silently passing when the
            # optional V-Xtractor executable cannot be located
            self.skipTest("vxtractor.pl executable was not found")
        coords_table = os.path.join(self.tmp_dir, "aln_coords.csv")
        vx_cmd = vxtractor_command_generator(exec_path=vxtractor_exe_path,
                                             fa_in=self.test_data_dir + "test_SSU_rRNA.fa",
                                             fa_extracted=os.path.join(self.tmp_dir, "test_SSU_rRNA_extracted.fa"),
                                             var_pos_tbl=coords_table,
                                             hmm_dir=self.hmms_dir)
        launch_system_command(vx_cmd)
        self.assertTrue(os.path.isfile(coords_table))
        return

    def test_parse_hmmer_dom_table(self):
        qseq_coords = parse_hmmer_dom_table(self.test_data_dir + "empty_aln_coords.csv")
        self.assertEqual(0, len(qseq_coords))
        qseq_coords = parse_hmmer_dom_table(self.test_data_dir + "infernal_aln_coords.csv")
        self.assertEqual(6, len(qseq_coords))
        return

    def test_find_trim_coords_from_infernal_alignments(self):
        primers_range = _COORD_RANGE(start=355, stop=540)
        mock_sample = self.mock_pe_amplicon_samples[0]
        mock_sample.var_pos_tbl = self.test_data_dir + "infernal_aln_coords.csv"
        trim_coords = mock_sample.find_trim_coords_from_infernal_alignments(target_coords=primers_range)
        self.assertEqual(4, len(trim_coords))
        self.assertEqual((1, 188), trim_coords['SRR4011269.1015 1015 length=247'])
        self.assertEqual((54, 231), trim_coords['SRR4011269.3 3 length=247'])
        self.assertEqual((0, 253), trim_coords['HWI-M00178:39:000000000-A409G:1:1113:4128:17933 1:N:0:AAC>'])
        self.assertEqual((1, 253), trim_coords['HWI-M00178:39:000000000-A409G:1:2107:28336:12297 1:N:0:AAC>'])

        mock_sample.var_pos_tbl = self.test_data_dir + "v4_infernal_aln_coords.csv"
        primers_range = _COORD_RANGE(start=505, stop=796)
        trim_coords = mock_sample.find_trim_coords_from_infernal_alignments(target_coords=primers_range)
        self.assertEqual(10678, len(trim_coords))

        # Test an empty alignment table
        mock_sample.var_pos_tbl = self.test_data_dir + "empty_aln_coords.csv"
        trim_coords = mock_sample.find_trim_coords_from_infernal_alignments(target_coords=primers_range)
        self.assertEqual(0, len(trim_coords))
        return

    def test_main(self):
        # Test with Infernal
        cli_args = ["--primer_map", self.test_data_dir + "example_primer_map.txt",
                    "--fastq_path", self.test_data_dir,
                    "--extraction_guide", "infernal",
                    "--cmfile", self.test_data_dir + "SSU_rRNA_bacteria.cm",
                    "--hmmonly",
                    "--output_dir", self.output_dir,
                    "--threads", str(4),
                    "--min_read_length", str(150),
                    "--verbose", "--delete"]
        main(cli_args)
        pyro_sample_extracted = os.path.join(self.output_dir, "8A_extracted.fq")
        self.assertTrue(os.path.isfile(pyro_sample_extracted))
        pyro_fx = Fastx(pyro_sample_extracted)
        pyro_fx.get_seq_lengths()
        self.assertEqual(604, pyro_fx.num_seqs())

        # Test with V-Xtractor
        cli_args = ["--primer_map", self.test_data_dir + "example_primer_map.txt",
                    "--fastq_path", self.test_data_dir,
                    "--extraction_guide", "vxtractor",
                    "--vxtractor_hmms", os.path.join(self.test_data_dir, "HMMs", "bacteria"),
                    "--output_dir", self.output_dir,
                    "--min_read_length", str(151),
                    "--threads", str(2)]
        main(cli_args)
        return


class InfernalTblParser(Hmmer3TabParser):
    """Parser for the Infernal cmsearch tblout format.

    Every data row becomes one HSP wrapping a single HSPFragment; rows are then
    grouped into Hit and QueryResult objects by ``_parse_qresult``.
    """

    def hmm_as_hit(self):
        """Return True, meaning ``_parse_row`` keeps hit/query coordinates as parsed (no swap)."""
        return True

    def _parse_row(self):
        """Return a dictionary of parsed row values (PRIVATE).

        The current line (``self.line``) is split on spaces and indexed using the
        cmsearch ``--tblout`` column order: target name, accession, query name,
        accession, mdl, mdl from, mdl to, seq from, seq to, strand, trunc, pass,
        gc, bias, score, E-value, inc, description of target.

        :return: dict with the keys "qresult", "hit", "hsp" and "frag", each
         mapping attribute names to the values parsed from the row
        :raises ValueError: if the row has fewer than 18 columns
        """
        cols = [x for x in self.line.strip().split(" ") if x]
        if len(cols) < 18:
            raise ValueError("Less columns than expected, only %i" % len(cols))
        # if len(cols) > 18, the description contained spaces and was split apart;
        # combine the pieces back into one string in the 18th column
        if len(cols) > 18:
            cols[17] = " ".join(cols[17:])

        # assign parsed column data into qresult, hit, and hsp dicts
        qresult = {}
        qresult["id"] = cols[2]  # query name
        qresult["accession"] = cols[3]  # query accession
        hit = {}
        hit["id"] = cols[0]  # target name
        hit["accession"] = cols[1]  # target accession
        hit["description"] = cols[17]  # description of target
        hsp = {}
        hsp["evalue"] = float(cols[15])  # E-value
        hsp["bitscore"] = float(cols[14])  # score
        hsp["bias"] = float(cols[13])  # bias
        frag = {}
        # cols[9] is the strand of the target sequence the alignment lies on
        # NOTE(review): on the minus strand the start/end are assigned as
        # (seq to, seq from - 1); confirm downstream code expects this convention
        if cols[9] == '-':
            frag["hit_strand"] = frag["query_strand"] = -1
            frag["hit_start"] = int(cols[8])  # seq to
            frag["hit_end"] = int(cols[7]) - 1  # seq from
        else:
            frag["hit_strand"] = frag["query_strand"] = 1
            frag["hit_start"] = int(cols[7]) - 1  # seq from
            frag["hit_end"] = int(cols[8])  # seq to
        frag["query_start"] = int(cols[5]) - 1  # mdl from
        frag["query_end"] = int(cols[6])  # mdl to
        # Infernal results are always RNA
        frag["molecule_type"] = "rna"

        # hmm_as_hit is a method on this class, so the bound method itself is
        # always truthy; resolve the flag by calling it. A plain (non-callable)
        # attribute set by a subclass is honoured as-is.
        hmm_is_hit = self.hmm_as_hit
        if callable(hmm_is_hit):
            hmm_is_hit = hmm_is_hit()

        # switch hit<-->query coordinates if hmm is not hit
        if not hmm_is_hit:
            frag["hit_end"], frag["query_end"] = (frag["query_end"], frag["hit_end"])
            frag["hit_start"], frag["query_start"] = (
                frag["query_start"],
                frag["hit_start"],
            )

        return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag}

    def _parse_qresult(self):
        """Return QueryResult objects (PRIVATE).

        Generator that walks the table one line at a time. Each iteration parses
        the current line and builds the objects for the *previous* line, so a
        Hit is finalised once the hit ID (or query ID) changes and a QueryResult
        is yielded once the query ID changes or the data rows end.
        """
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []
        # IDs of hits already added to a QueryResult
        # NOTE(review): this set is never cleared when a new QueryResult starts,
        # so a target seen under an earlier query is skipped for later queries - confirm intended
        hit_ids = set()
        cur_qid = None
        cur_hid = None
        while True:
            # store previous line's parsed values, for every line after the 1st
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF; a line starting with '#'
            # is also treated as the end of the data rows
            if self.line and not self.line.startswith("#"):
                cur = self._parse_row()
                cur_qid = cur["qresult"]["id"]
                cur_hid = cur["hit"]["id"]
            else:
                file_state = state_EOF
                # mock ID values since the line is empty
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # start creating objects after the first line (i.e. prev is filled)
            if prev is not None:
                # each line is basically an HSP with one HSPFragment
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev["frag"].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev["hsp"].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit object when we've finished parsing all its hsps
                # i.e. when hit state is state_HIT_NEW
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    # only the first run of rows for a given hit ID is kept;
                    # the collected HSPs are discarded if the ID was seen before
                    if hit.id not in hit_ids:
                        for attr, value in prev["hit"].items():
                            setattr(hit, attr, value)
                        hit_list.append(hit)
                        hit_ids.add(hit.id)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev["qresult"].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()

    @staticmethod
    def hsp_name(hsp: HSP) -> str:
        """Return the hit ID of ``hsp``, followed by its description unless that is '-'."""
        if hsp.hit_description != '-':
            return hsp.hit_id + ' ' + hsp.hit_description
        else:
            return hsp.hit_id


class MyFormatter(logging.Formatter):
    """Logging formatter that selects the message layout from the record's level.

    DEBUG, INFO, WARNING and ERROR records use the class-level ``*_fmt``
    layouts; any other level falls back to the layout given at construction.
    """

    error_fmt = "%(levelname)s - %(module)s, line %(lineno)d:\n%(message)s"
    warning_fmt = "%(levelname)s:\n%(message)s"
    debug_fmt = "%(asctime)s\n%(message)s"
    info_fmt = "%(message)s"

    def __init__(self):
        super().__init__(fmt="%(levelname)s: %(message)s",
                         datefmt="%d/%m %H:%M:%S")

    def format(self, record):
        """Format ``record`` with the layout matching its level.

        :param record: the logging.LogRecord to format
        :return: the formatted message string
        """
        level_formats = {logging.DEBUG: MyFormatter.debug_fmt,
                         logging.INFO: MyFormatter.info_fmt,
                         logging.ERROR: MyFormatter.error_fmt,
                         logging.WARNING: MyFormatter.warning_fmt}

        # Save the original format configured when the formatter was instantiated
        format_orig = self._style._fmt

        # Replace the original format with one customized by logging level,
        # keeping the original for levels without a dedicated layout
        self._style._fmt = level_formats.get(record.levelno, format_orig)

        try:
            # Call the original formatter class to do the grunt work
            return logging.Formatter.format(self, record)
        finally:
            # Always restore the original format, even if formatting raised,
            # so one bad record cannot leave a per-level layout in place
            self._style._fmt = format_orig


def prep_logging(log_file_name=None, verbosity=False) -> None:
    """
    Configure the root logger once; does nothing if it already has a handler.

    With a log file, the file receives every message from DEBUG upwards and a
    console handler using MyFormatter is attached as well. Without a log file,
    only logging.basicConfig's default stream handler is installed.

    :param log_file_name: Path of the log file to write; missing parent directories are created
    :param verbosity: If true the console level is DEBUG, otherwise INFO
    :return: None
    """
    console_level = logging.DEBUG if verbosity else logging.INFO

    # Leave an already-configured root logger untouched
    root_logger = logging.getLogger()
    if root_logger.handlers:
        return

    # Console handler; messages are expected to carry their own line endings
    console = logging.StreamHandler()
    console.setLevel(console_level)
    console.terminator = ''
    console.setFormatter(MyFormatter())

    date_layout = "%d/%m %H:%M:%S"
    record_layout = "%(asctime)s %(levelname)s:\n%(message)s"

    if not log_file_name:
        logging.basicConfig(level=console_level,
                            datefmt=date_layout,
                            format=record_layout)
        return

    output_dir = os.path.dirname(log_file_name)
    try:
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)
    except (IOError, OSError):
        sys.stderr.write("ERROR: Unable to make directory '" + output_dir + "'.\n")
        sys.exit(3)

    logging.basicConfig(level=logging.DEBUG,
                        filename=log_file_name,
                        filemode='w',
                        datefmt=date_layout,
                        format=record_layout)
    root_logger.addHandler(console)
    root_logger.propagate = False
    return


def _maketrans(complement_mapping):
    """Build a translation table for (reverse) complementing sequences (PRIVATE).

    Arguments:
     - complement_mapping - a dictionary of single-character keys and values,
       such as ambiguous_dna_complement and ambiguous_rna_complement from
       Data.IUPACData.

    Returns a 256-byte table, as produced by bytes.maketrans, that maps every
    key to its value and the lower-case form of every key to the lower-case
    form of its value.
    For internal use only.
    """
    source_chars = "".join(complement_mapping).encode("ASCII")
    target_chars = "".join(complement_mapping.values()).encode("ASCII")
    # Append the lower-case alphabet so both cases are translated
    return bytes.maketrans(source_chars + source_chars.lower(),
                           target_chars + target_chars.lower())


# Translation table built once at import time from Biopython's ambiguous DNA
# complement mapping; used by reverse_complement below.
_dna_complement_table = _maketrans(IUPACData.ambiguous_dna_complement)


def reverse_complement(seq: str) -> str:
    """Return the reverse complement of ``seq`` using the module's DNA complement table.

    :param seq: nucleotide sequence
    :return: the sequence reversed, with each character replaced via _dna_complement_table
    """
    reversed_seq = seq[::-1]
    return reversed_seq.translate(_dna_complement_table)


class Fastx:
    """Wrapper around a FASTA or FASTQ file that collects sequence-length statistics.

    An empty file is accepted: the instance then has no sequences and an empty
    ``fx_handler``, so every statistic reports zero instead of failing.
    """

    def __init__(self, fastx_path, fx_format=None):
        """
        :param fastx_path: Path to the FASTA or FASTQ file
        :param fx_format: Either 'fasta' or 'fastq'; detected with fastx_format_check when not given
        :raises AssertionError: if the (given or detected) format is neither 'fasta' nor 'fastq'
        """
        self.file_name = fastx_path
        # Define every attribute up front so that instances created from an
        # empty file are still fully usable
        self.format = fx_format
        self.seq_lengths = []
        self.fx_handler = ()  # empty iterable; replaced by a pyfastx reader below

        # Ensure the file isn't empty
        self.st_size = os.stat(self.file_name).st_size
        if self.st_size == 0:
            logging.debug("{} is empty.\n".format(self.file_name))
            return

        # Set the file format to either FASTA or FASTQ
        if not self.format:
            self.format = fastx_format_check(self.file_name)

        # Create the pyfastx file handler
        if self.format == 'fasta':
            self.fx_handler = pyfastx.Fasta(self.file_name, build_index=False)
        elif self.format == 'fastq':
            self.fx_handler = pyfastx.Fastq(self.file_name, build_index=False)
        else:
            raise AssertionError("Unknown file format '{}'.".format(self.format))
        return

    def get_seq_lengths(self) -> None:
        """Read every record and store its length in ``self.seq_lengths``.

        The list is rebuilt on each call so repeated calls do not double-count.
        The length is taken from the last field of each record.
        """
        self.seq_lengths = [len(record[-1]) for record in self.fx_handler]
        return

    def max_seq_length(self) -> int:
        """Return the longest recorded sequence length, or 0 if none are recorded."""
        return max(self.seq_lengths, default=0)

    def mean_seq_length(self) -> float:
        """Return the mean recorded sequence length rounded to one decimal, or 0.0 if none are recorded."""
        n_seqs = self.num_seqs()
        if n_seqs == 0:
            return 0.0
        return round(sum(self.seq_lengths)/n_seqs, 1)

    def num_seqs(self) -> int:
        """Return the number of recorded sequence lengths."""
        return len(self.seq_lengths)

    def summarize(self, verbosity=0) -> None:
        """Log the file name, sequence count, mean length and maximum length, tab-separated.

        :param verbosity: If truthy the summary is logged at INFO level, otherwise at DEBUG
        :return: None
        """
        if self.st_size == 0:
            logging.debug("Unable to summarize {} as file is empty.\n".format(self.file_name))
            return

        if not self.seq_lengths:
            self.get_seq_lengths()

        summary_str = "\t".join(str(i) for i in [self.file_name, self.num_seqs(),
                                                 self.mean_seq_length(), self.max_seq_length()]) \
                      + "\n"
        if verbosity:
            logging.info(summary_str)
        else:
            logging.debug(summary_str)

        return


class AmpliconSample:
    """
    Holds the primers, FASTQ file paths and read/fragment tallies of a single amplicon sample.

    The path attributes start out empty and are filled in as the sample passes through each processing stage.
    """
    def __init__(self, sample_name: str, fwd: str, rev: str):
        """
        :param sample_name: Name of the sample; also the name of its sub-directory of FASTQ files
        :param fwd: Forward primer sequence
        :param rev: Reverse primer sequence
        """
        self.name = sample_name
        self.fwd_primer = fwd
        self.rev_primer = rev

        # Assumed paired-end until fetch_raw_fastq() finds a single file
        self.paired_end = True
        self.repaired = False

        # Path to FASTQ files
        self.raw_fwd = ""
        self.raw_rev = ""
        self.repaired_fwd = ""
        self.repaired_rev = ""
        self.filtered_fwd = ""
        self.filtered_rev = ""
        self.trim_path_fwd = ""
        self.trim_path_rev = ""
        self.merge_path = ""
        self.final_fq = ""

        # Path to the target-region coordinates table
        self.var_pos_tbl = ""

        self.pairs_pass_filter = 0
        self.pairs_fail_filter = 0
        self.n_frags_aligned = 0
        self.n_short_frags = 0
        self.n_good_extracts = 0
        return

    def fetch_raw_fastq(self, input_dir):
        """
        Locate the sample's raw FASTQ files in '<input_dir>/<sample name>/'.

        One file marks the sample as single-end; two files (in sorted order) become the forward and reverse reads.

        :param input_dir: Directory containing one sub-directory per sample
        :raises AssertionError: If no files, or more than two files, are found
        """
        sample_fq = sorted(glob.glob(os.path.join(input_dir, self.name + os.sep + "*")))
        if len(sample_fq) > 2:
            raise AssertionError("More than two FASTQ files found for sample '{}'.\n".format(self.name))
        elif len(sample_fq) == 1:
            self.raw_fwd = sample_fq.pop(0)
            self.paired_end = False
        elif len(sample_fq) == 0:
            raise AssertionError("Unable to find FASTQ files for sample '{}'.\n".format(self.name))
        else:
            self.raw_fwd = sample_fq.pop(0)
            self.raw_rev = sample_fq.pop(0)
        logging.debug("FASTQ files found for '{}': {} {}\n".format(self.name, self.raw_fwd, self.raw_rev))
        return

    def rc_primers(self) -> (str, str):
        """Return the reverse complements of the forward and reverse primers, in that order."""
        return reverse_complement(self.fwd_primer), reverse_complement(self.rev_primer)

    def collect_fastq_by_stage(self, stage: str) -> list:
        """
        Return the FASTQ path(s) recorded for a processing stage.

        :param stage: One of 'raw', 'repair', 'filter', 'trim', 'merge', 'final' or 'all'.
         'all' returns every non-empty path across all stages.
        :return: A list of file paths; reverse-read paths are only included for paired-end samples
        :raises AssertionError: If the stage name is not recognised
        """
        fastq_files = []
        if stage == "raw":
            fastq_files.append(self.raw_fwd)
            if self.paired_end:
                fastq_files.append(self.raw_rev)
        elif stage == "repair":
            if self.paired_end:
                fastq_files.append(self.repaired_fwd)
                fastq_files.append(self.repaired_rev)
            else:
                # Single-end reads are never repaired, so the raw file stands in
                fastq_files.append(self.raw_fwd)
        elif stage == "filter":
            fastq_files.append(self.filtered_fwd)
            if self.paired_end:
                fastq_files.append(self.filtered_rev)
        elif stage == "trim":
            fastq_files.append(self.trim_path_fwd)
            if self.paired_end:
                fastq_files.append(self.trim_path_rev)
        elif stage == "merge":
            if self.paired_end:
                fastq_files.append(self.merge_path)
            else:
                # No merging if single-end files
                fastq_files.append(self.trim_path_fwd)
        elif stage == "final":
            fastq_files.append(self.final_fq)
        elif stage == "all":
            fastq_files += [fq for fq in [self.raw_fwd, self.raw_rev,
                                          self.repaired_fwd, self.repaired_rev,
                                          self.filtered_fwd, self.filtered_rev,
                                          self.trim_path_fwd, self.trim_path_rev,
                                          self.merge_path, self.final_fq]
                            if fq]
        else:
            raise AssertionError("Unknown stage specified for fastq file collection.")
        return fastq_files

    def summarize_reads(self, stage: str, verbosity=0):
        """
        Log a header followed by one summary line per FASTQ file of the given stage.

        :param stage: Stage name understood by collect_fastq_by_stage()
        :param verbosity: Log at INFO level when truthy, otherwise at DEBUG level
        """
        if verbosity:
            logging.info("File name\tSequences\tMean length\tMax length\n")
        else:
            logging.debug("File name\tSequences\tMean length\tMax length\n")

        for fq in self.collect_fastq_by_stage(stage):
            fx = Fastx(fastx_path=fq)
            fx.summarize(verbosity)

        return

    def align_primers(self, fx_path: str) -> dict:
        """
        Count the reads in a FASTQ file that contain each primer or its reverse complement.

        :param fx_path: Path to the FASTQ file to search
        :return: Dictionary with keys 'forward', 'reverse', 'forward_rc' and 'reverse_rc' mapped to read counts
        :raises AssertionError: If the file is detected as FASTA
        """
        rc_fwd, rc_rev = self.rc_primers()
        primer_name_map = {"forward": self.fwd_primer,
                           "reverse": self.rev_primer,
                           "forward_rc": rc_fwd,
                           "reverse_rc": rc_rev}
        primer_counts = {"forward": 0, "reverse": 0, "forward_rc": 0, "reverse_rc": 0}

        fx = Fastx(fastx_path=fx_path)
        if fx.st_size == 0:
            logging.debug("{} is empty - no reads to find primer sequences in.\n".format(fx.file_name))
            return primer_counts

        if fx.format == "fasta":
            logging.error("Unable to align primers to FASTA file.\n")
            raise AssertionError

        for _, seq, _ in fx.fx_handler:
            for name in primer_name_map:
                # A result longer than one element is treated as "primer found on this read"
                if len(nt_search(seq, primer_name_map[name])) > 1:
                    primer_counts[name] += 1
        return primer_counts

    def primer_check(self, stage: str):
        """Enumerate the number of times a primer (or reverse complement) was found on the reads."""
        fastq_to_check = self.collect_fastq_by_stage(stage)

        for fq in fastq_to_check:
            primer_counts = self.align_primers(fq)
            logging.debug("Primers in '{}':\n"
                          "{}\t{}\t\t{}\t{}\n"
                          "{}\t{}\t\t{}\t{}\n".format(fq,
                                                      "Forward", primer_counts["forward"],
                                                      "Reverse", primer_counts["reverse"],
                                                      "Fwd_RC", primer_counts["forward_rc"],
                                                      "Rev_RC", primer_counts["reverse_rc"]))
        return

    def check_pairing(self, temporary_dir: str, rm_suffix=False, n=100) -> None:
        """
        Reads n number of read pairs from the forward and reverse fastq files and ensures that the sequence names match.
        If the sequence names do not match (i.e. first forward read name is not identical to the first reverse read)
        the sample is flagged for repair: 'repaired' is set to True and 'repaired_fwd' and 'repaired_rev' are set to
        paths within temporary_dir. This method does not create those files; the caller is expected to do so.
        If the names do match, 'repaired_fwd' and 'repaired_rev' simply point at the raw FASTQ files.

        :param n: Number of read pairs to sample.
        :param rm_suffix: Boolean indicating whether the string 'length=[0-9]+' should be removed from the read names
        :param temporary_dir: Path to write the repaired FASTQ files (if necessary)
        :return: None
        """
        logging.debug("Validating paired-read ordering for '{}'... ".format(self.name))
        rm_suffix_re = re.compile(r" length=\d+$")
        fwd_seq_names = [seq_name.split()[0] for seq_name in
                         read_fastq_seq_names(self.raw_fwd, num_reads=n)]
        rev_seq_names = [seq_name.split()[0] for seq_name in
                         read_fastq_seq_names(self.raw_rev, num_reads=n)]
        if rm_suffix:
            # NOTE(review): names were already cut at the first whitespace above, so this pattern
            # (which starts with a space) no longer has anything to match - confirm intent.
            fwd_seq_names = [rm_suffix_re.sub('', seq_name) for seq_name in fwd_seq_names]
            rev_seq_names = [rm_suffix_re.sub('', seq_name) for seq_name in rev_seq_names]

        if compare_read_names(fwd_seq_names, rev_seq_names) is False:
            logging.debug("Repairing FASTQ files {}, {}.\n".format(self.raw_fwd, self.raw_rev))
            self.repaired = True
            self.repaired_fwd = os.path.join(temporary_dir, self.name + "_repaired_R1.fastq")
            self.repaired_rev = os.path.join(temporary_dir, self.name + "_repaired_R2.fastq")
        else:
            self.repaired_fwd = self.raw_fwd
            self.repaired_rev = self.raw_rev
        logging.debug("done.\n")
        return

    def read_vxtractor_coords(self, start_name: str, end_name: str) -> dict:
        """
        Read extraction coordinates from the comma-separated table at 'var_pos_tbl'.

        The first two lines of the table are skipped and the third is taken as the column header.
        For every following record, the start is the number after the '-' in the 'start_name' column and the end
        is the number before the '-' in the 'end_name' column. Records where either column is 'notfound', or where
        the coordinates cannot be parsed, are skipped. Blank lines are ignored.
        'n_frags_aligned' is incremented for every non-blank record and 'n_good_extracts' for every record kept.

        :param start_name: Header name of the column holding the left boundary
        :param end_name: Header name of the column holding the right boundary
        :return: Dictionary mapping the first field of each kept record to a (start, end) tuple
        :raises KeyError: If either column name is absent from the header
        """
        variable_positions = {}
        with open(self.var_pos_tbl, 'r') as tbl:
            # Skip the two leading lines that precede the header
            tbl.readline()
            tbl.readline()
            fields = tbl.readline().strip().split(',')
            # Find the field positions for the start and end data in the CSV
            pos_dict = dict(zip(fields, range(len(fields))))
            start_field_pos = pos_dict[start_name]
            end_field_pos = pos_dict[end_name]
            last_field_pos = max(start_field_pos, end_field_pos)

            # Read the alignment positions
            for line in tbl:
                line = line.strip()
                if not line:
                    continue
                self.n_frags_aligned += 1
                fields = re.sub(r"['\"]", '', line).split(',')
                if len(fields) <= last_field_pos:
                    logging.debug("Skipping truncated record '{}' in {}.\n".format(line, self.var_pos_tbl))
                    continue

                start_field, end_field = fields[start_field_pos], fields[end_field_pos]
                if "notfound" in (start_field, end_field):
                    continue
                try:
                    start = int(start_field.split('-')[1])
                    end = int(end_field.split('-')[0])
                except (IndexError, ValueError):
                    # Never fall through with coordinates left over from a previous record
                    logging.debug("Skipping record '{}' in {}: unable to parse coordinates '{}' and '{}'.\n"
                                  "".format(fields[0], self.var_pos_tbl, start_field, end_field))
                    continue

                variable_positions[fields[0]] = (start, end)
                self.n_good_extracts += 1

        return variable_positions

    def find_trim_coords_from_infernal_alignments(self, target_coords: namedtuple) -> dict:
        """
        Convert the alignments in 'var_pos_tbl' into per-sequence coordinates spanning the target region.

        Alignments whose query range does not cover [target_coords.start, target_coords.stop] are counted in
        'n_short_frags' and skipped. 'n_frags_aligned' and 'n_good_extracts' are updated as well.

        :param target_coords: Object with 'start' and 'stop' attributes giving the target region
        :return: Dictionary mapping sequence names to (start, end) tuples
        """
        trim_coords = {}
        for qseq_name, hsp in parse_hmmer_dom_table(self.var_pos_tbl).items():  # type: (str, HSP)
            self.n_frags_aligned += 1
            # TODO: Handle alignments to the negative strand
            # Ensure the extracted sequences would be full-length
            if hsp.query_start > target_coords.start or hsp.query_end < target_coords.stop:
                self.n_short_frags += 1
                continue

            # Add the coordinates to the trim_coords dictionary
            trim_coords[qseq_name] = (hsp.hit_start + (target_coords.start - hsp.query_start),
                                      hsp.hit_end - (hsp.query_end - target_coords.stop))
            self.n_good_extracts += 1

        if self.n_frags_aligned > 0:
            logging.debug("{} fragments ({}%) failed to cover entire target region.\n"
                          "".format(self.n_short_frags,
                                    round(100*self.n_short_frags/self.n_frags_aligned, 2)))

        return trim_coords


def validate_arguments(args) -> None:
    """
    Check that the parsed command-line arguments are consistent with the chosen extraction mode.

    In Infernal mode without guide primers, the default primers are written to
    '<output_dir>/515_806_primers.fasta' and args.guide_primers is pointed at that file.
    The process exits with status 5 if more Infernal threads than total threads were requested.

    :param args: Namespace of parsed arguments
    :raises AssertionError: If the model file/directory required by the extraction mode was not provided
    :raises IOError: If the CM file does not exist
    :raises ValueError: If the extraction mode is not recognised
    """
    mode = args.extraction_guide
    if mode not in ("infernal", "vxtractor"):
        raise ValueError("Unrecognized extraction mode '{}' specified with '--extraction_guide'.".format(mode))

    if mode == "vxtractor":
        if not args.hmm_path:
            raise AssertionError("Profile HMMs directory must be provided with '--vxtractor_hmms' for V-Xtractor.")
        validate_hmm_dir(args.hmm_path)
    else:
        if not args.cmfile:
            raise AssertionError("Reference CM file must be provided with '--cmfile' for Infernal extraction mode.")
        if not os.path.isfile(args.cmfile):
            raise IOError("CM file '{}' does not exist.".format(args.cmfile))
        if not args.guide_primers:
            # Fall back on the default primer pair
            args.guide_primers = os.path.join(args.output_dir, "515_806_primers.fasta")
            write_fasta_from_dict(_DEFAULT_PRIMERS, fasta_path=args.guide_primers)

    if args.infernal_threads > args.threads:
        message = "Number of infernal-process threads specified ({}) is more than the number of threads ({}).\n"
        logging.error(message.format(args.infernal_threads, args.threads))
        sys.exit(5)

    return


def get_options(sys_args):
    """
    Build the command-line parser and parse the given arguments.

    :param sys_args: List of command-line argument strings (without the program name)
    :return: Namespace of parsed arguments
    """
    parser = argparse.ArgumentParser(description="A tool for primer-trimming, merging and extracting 16S rRNA gene "
                                                 "variable regions from FASTQ files.",
                                     add_help=False)

    req_args = parser.add_argument_group("Required arguments")
    opt_args = parser.add_argument_group("Optional arguments")
    mis_args = parser.add_argument_group("Miscellaneous arguments")
    # The covariance model and the V-Xtractor HMMs cannot be supplied together
    profile_args = opt_args.add_mutually_exclusive_group()

    req_args.add_argument("-p", "--primer_map", dest="primers", required=True,
                          help="Path to a CSV file listing the forward and reverse primers used for each sample.")
    req_args.add_argument("-f", "--fastq_path", dest="fastq_dir", required=True,
                          help="Path to the directory containing SRA run directories with FASTQ files.")

    opt_args.add_argument('-o', '--output_dir', default='./out', required=False,
                          help="Path to a directory to write the merged, trimmed FASTQ files [ DEFAULT = './out' ]")
    opt_args.add_argument("--hmmonly", default=False, action="store_true",
                          help="Infernal only: alignment refinement with secondary structure is skipped.")
    opt_args.add_argument("-g", "--guide_primers", default=None, required=False,
                          help="Infernal only: a FASTA file containing primer sequences to guide sequence extraction. "
                               "[ DEFAULT = 515f,806r ]")

    opt_args.add_argument("-e", "--extraction_guide", choices=["infernal", "vxtractor"], default="infernal",
                          help="Select whether to use Infernal's cmsearch or V-Xtractor guide sequence extraction.")
    # validate_arguments() rejects values larger than --threads, so the help text states that direction
    opt_args.add_argument("--infernal_threads", type=int, default=2,
                          help="Number of threads for each infernal process to use. "
                               "Must be less than or equal to --threads. [ DEFAULT = 2 ]")
    profile_args.add_argument("-c", "--cmfile", required=False, default=None,
                              help="Path to the covariance model file for cmsearch.")
    profile_args.add_argument("-x", "--vxtractor_hmms", dest="hmm_path", required=False, default=None,
                              help="Path to a directory containing profile HMMs for V-Xtractor.")

    opt_args.add_argument("-l", "--min_read_length", default=100, type=int,
                          help="The minimum sequence length for an input read. [ DEFAULT = 100 bp ]")

    mis_args.add_argument("-n", "--threads", default=4, type=int,
                          help="The number of threads available for PEAR and cutadapt.")
    mis_args.add_argument('-v', '--verbose', action='store_true', default=False,
                          help='Prints a more verbose runtime log.')
    mis_args.add_argument("-d", "--delete", action="store_true", default=False,
                          help="Remove all intermediate files.")
    mis_args.add_argument("-h", "--help",
                          action="help", help="Show this help message and exit.")

    args = parser.parse_args(sys_args)

    return args


def read_fastq_seq_names(fastq_file: str, num_reads=-1) -> list:
    """
    Collect read names from the start of a FASTQ file.

    :param fastq_file: Path to the FASTQ file
    :param num_reads: Maximum number of names to return; zero or a negative value returns every name
    :return: List of read names in file order (empty for an empty file)
    """
    fx = Fastx(fastx_path=fastq_file, fx_format="fastq")
    names = []
    if fx.st_size == 0:
        return names

    for n_read, (name, _, _) in enumerate(fx.fx_handler, start=1):
        names.append(name)
        # Stop once the requested number of reads has been seen
        if 0 < num_reads <= n_read:
            break
    return names


def read_name_match(fwd_name: str, rev_name: str) -> bool:
    """
    Decide whether two read names belong to the same mate pair.

    Only the first whitespace-delimited token of each name is compared, after a trailing '/1' or '/2' is removed.

    :param fwd_name: Name of the forward read
    :param rev_name: Name of the reverse read
    :return: True if the normalized names are identical
    """
    mate_suffix = r'/[1-2]$'
    fwd_id = re.sub(mate_suffix, '', fwd_name.split()[0])
    rev_id = re.sub(mate_suffix, '', rev_name.split()[0])
    return fwd_id == rev_id


def compare_read_names(fwd_read_names: list, rev_read_names: list) -> bool:
    """
    Check that two equally long lists of read names match position by position.

    :param fwd_read_names: Names of the forward reads
    :param rev_read_names: Names of the reverse reads
    :return: True if every pair of names matches according to read_name_match(), False at the first mismatch
    :raises AssertionError: If the lists differ in length
    """
    if len(fwd_read_names) != len(rev_read_names):
        logging.error("Unable to compare read name lists of different size!\n")
        raise AssertionError

    return all(read_name_match(fwd, rev) for fwd, rev in zip(fwd_read_names, rev_read_names))


def write_fasta_from_dict(fasta_records: dict, fasta_path: str) -> None:
    """
    Write sequences to a FASTA file, one header line and one sequence line per record.

    :param fasta_records: Dictionary mapping sequence names to sequences
    :param fasta_path: Path of the FASTA file to (over)write
    :return: None
    """
    # The context manager closes the file even if a write fails
    with open(fasta_path, 'w') as fasta_h:
        for name, seq in fasta_records.items():
            fasta_h.write(">{}\n{}\n".format(name, seq))
    return


def prep_for_analysis(input_dir, output_dir, temporary_dir) -> None:
    """
    Ensure the working directories exist before any processing starts.

    The output directory is created if missing (including any missing parent directories).
    The temporary directory is always recreated empty: an existing one is deleted with all of its contents.

    :param input_dir: Directory containing the FASTQ files; must already exist
    :param output_dir: Directory for output files
    :param temporary_dir: Directory for intermediate files
    :raises IOError: If input_dir is not a directory
    """
    if not os.path.isdir(input_dir):
        raise IOError("Unable to find directory containing FASTQ files '{}'".format(input_dir))

    # makedirs rather than mkdir so nested paths such as 'results/run1/out' work
    os.makedirs(output_dir, exist_ok=True)

    if os.path.isdir(temporary_dir):
        shutil.rmtree(temporary_dir)
    os.makedirs(temporary_dir)

    return


def validate_executables(extractor="infernal") -> int:
    """
    Locate each required executable, run it with a version/help flag and log the version found in its output.

    cutadapt and bbmerge.sh are always checked; cmsearch is added for the 'infernal' extractor, while
    hmmsearch and vxtractor.pl are added for any other extractor value.

    :param extractor: Name of the extraction mode
    :return: The number of executables whose output contained no recognisable version string
    """
    exe_vers = {"cutadapt": "--version",
                "bbmerge.sh": "version"}
    if extractor == "infernal":
        exe_vers["cmsearch"] = "-h"
    else:
        exe_vers["hmmsearch"] = "-h"
        exe_vers["vxtractor.pl"] = "-h"

    version_re = re.compile(r"(\d+\.\d+(\.\d+)?)")
    n_unknown = 0
    for exe_name, flag in exe_vers.items():
        stdout, _ = launch_system_command([which(exe_name), flag], graceful=True)

        # The first output line containing something version-like wins
        searches = (version_re.search(line) for line in stdout.split("\n"))
        first_hit = next((found for found in searches if found), None)

        # A strict check for hmmsearch version 3.0 is intentionally not enforced here
        if first_hit:
            logging.info("Using {} version {}.\n".format(exe_name, first_hit.group(1)))
        else:
            logging.info("Using an unknown version of {}.\n".format(exe_name))
            n_unknown += 1
    return n_unknown


def validate_hmm_dir(hmms_path: str) -> None:
    """
    Confirm that a directory holds the profile HMMs flanking the V3 and V5 regions.

    Eight files are required: V3 and V5, each with 'leftlong', 'leftshort', 'rightlong' and 'rightshort'
    variants, named '<region><variant>.HMM'.

    :param hmms_path: Path to the directory of profile HMMs
    :raises AssertionError: If the directory has no '*HMM' files or any required file is missing
    """
    hmm_paths = glob.glob(hmms_path + os.sep + "*HMM")
    if not hmm_paths:
        raise AssertionError("No HMM files found in '{}'.\n".format(hmms_path))

    present = {os.path.basename(path) for path in hmm_paths}
    required = [region + variant + ".HMM"
                for region in ("V3", "V5")
                for variant in ("leftlong", "leftshort", "rightlong", "rightshort")]
    for hmm_file_name in required:
        if hmm_file_name in present:
            continue
        logging.error("Unable to find profile HMM '{}' in {}.\n".format(hmm_file_name, hmms_path))
        raise AssertionError("HMMs are missing from {}.".format(hmms_path))
    return


def read_sample_primer_table(primer_table: str) -> list:
    """
    Build an AmpliconSample for every record of the sample-primer table.

    Each record is expected to hold exactly three comma-separated fields:
    sample name, forward primer, reverse primer. Blank lines are ignored.

    :param primer_table: Path to the comma-separated sample-primer table
    :return: List of AmpliconSample instances in file order
    :raises ValueError: If a non-blank line does not hold exactly three fields
    """
    logging.debug("Reading the sample-primer table... ")
    amplicon_samples = []
    with open(primer_table) as sample_primers:
        for line in sample_primers:
            line = line.strip()
            if not line:
                # A trailing or stray empty line should not abort the run
                continue
            sample_name, fwd, rev = line.split(',')
            amplicon_samples.append(AmpliconSample(sample_name, fwd, rev))
    logging.debug("done.\n")
    return amplicon_samples


def is_exe(fpath):
    """Return True if fpath is an existing regular file that the current user may execute."""
    if not os.path.isfile(fpath):
        return False
    return os.access(fpath, os.X_OK)


def which(program: str):
    """
    Find the path to an executable.

    A program given with a directory component is checked as-is; a bare name is looked up in each
    directory of the PATH environment variable, in order.

    :param program: Name of, or path to, the executable
    :return: Path to the first matching executable
    :raises SystemError: If no executable is found
    """
    directory, _ = os.path.split(program)
    if directory:
        candidates = [program]
    else:
        candidates = (os.path.join(path_element.strip('"'), program)
                      for path_element in os.environ["PATH"].split(os.pathsep))

    for candidate in candidates:
        if is_exe(candidate):
            return candidate
    raise SystemError("Unable to find an executable for '{}'.".format(program))


def launch_system_command(cmd_list: list, graceful=False):
    """
    Run a command with subprocess.Popen() and capture its output.

    stderr is redirected into stdout, so both streams are returned as one string.
    Unless 'graceful' is set, a non-zero return code prints the command and its output
    and exits the program with status 19.

    :param cmd_list: A list of strings forming a complete command call
    :param graceful: Return even if return code was not zero.
    :return: A tuple of the combined stdout/stderr text and the returncode of the executable
    """
    cmd_str = ' '.join(cmd_list)
    logging.debug("Launching the following command:\n{}\n".format(cmd_str))

    proc = subprocess.Popen(cmd_list,
                            shell=False,
                            preexec_fn=os.setsid,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    raw_output, _ = proc.communicate()
    stdout = raw_output.decode("utf-8")
    retcode = proc.returncode

    if retcode == 0 or graceful:
        return stdout, retcode

    # The command failed and the caller did not ask for leniency
    print(cmd_list[0] + " did not complete successfully! Command used:\n" +
          cmd_str + "\nOutput:\n" + stdout)
    sys.exit(19)


def clear_intermediate_fastqs(amplicon_samples: list, delete: bool, stage: str) -> None:
    """
    Delete the FASTQ files of one processing stage for every sample, if deletion was requested.

    :param amplicon_samples: A list of AmpliconSample instances
    :param delete: Nothing is removed unless this is truthy
    :param stage: Stage name passed to each sample's collect_fastq_by_stage()
    :return: None
    """
    if delete:
        for sample in amplicon_samples:  # type: AmpliconSample
            stage_files = sample.collect_fastq_by_stage(stage)
            for fastq_path in stage_files:
                os.remove(fastq_path)
    return


def run_apply_async_multiprocessing(func, arguments_list: list, num_processes: int, pbar_desc: str) -> list:
    """
    Run a function over several argument sets in a process pool while showing a progress bar.

    :param func: The function to call in the worker processes
    :param arguments_list: A list in which each element holds the positional arguments of one call
    :param num_processes: Number of worker processes to start
    :param pbar_desc: Description shown beside the progress bar
    :return: The return values of the calls, in the order of arguments_list
    """
    # The bar must exist before any job is submitted: the completion callback can fire
    # as soon as a job finishes, which may be before submission of the remaining jobs is done.
    pbar = tqdm(total=len(arguments_list), desc=pbar_desc, ncols=100)

    def update(*_):
        # Sole place where progress is advanced, so each finished job is counted exactly once
        pbar.update()

    pool = multiprocessing.Pool(processes=num_processes)
    try:
        jobs = [pool.apply_async(func=func, args=tuple(args), callback=update)
                for args in arguments_list]
        pool.close()
        # get() re-raises any exception raised in the worker
        result_list = [job.get() for job in jobs]
        pool.join()
    finally:
        # Harmless after a clean join; stops the workers if a job failed
        pool.terminate()
        pbar.close()

    return result_list


def filter_reads(fwd_in: str, filtered_fwd: str, min_seq_length: int,
                 rev_in=None, filtered_rev=None, paired=True) -> dict:
    """
    Screens fastq files for any of the following defects and removes mate-pairs if found:
    1. Short sequences
    2. Ambiguity characters (anything the module-level 'nuc_alphabet' pattern does not match)

    Reads are streamed from the input files rather than loaded into memory. An empty input file is treated
    as containing no reads.

    :param fwd_in: Path to the forward (or single-end) FASTQ file
    :param filtered_fwd: Path to write the forward reads that pass
    :param min_seq_length: The minimum sequence length for a read to pass
    :param rev_in: Path to the reverse FASTQ file (paired-end only)
    :param filtered_rev: Path to write the reverse reads that pass (paired-end only)
    :param paired: Whether the reads are paired-end
    :return: A single-item dictionary: {(number passed, number failed): (filtered_fwd, filtered_rev)}
    :raises AssertionError: If mate names do not match or the reverse file runs out of reads first
    """
    logging.debug("Filtering reads from FASTQ files {} and {}.\n".format(fwd_in, rev_in))
    pairs_fail_filter = 0
    pairs_pass_filter = 0

    def passes(seq: str) -> bool:
        return len(seq) >= min_seq_length and nuc_alphabet.match(seq) is not None

    # A Fastx built from an empty file has no handler to iterate, so substitute an empty sequence
    fwd_py_fq = Fastx(fastx_path=fwd_in)
    fwd_records = fwd_py_fq.fx_handler if fwd_py_fq.st_size else ()
    rev_py_fq = None
    rev_records = iter(())
    if paired:
        rev_py_fq = Fastx(fastx_path=rev_in)
        rev_records = iter(rev_py_fq.fx_handler if rev_py_fq.st_size else ())

    fwd_out = open(filtered_fwd, 'w')
    rev_out = None
    try:
        if paired:
            rev_out = open(filtered_rev, 'w')

        for f_name, f_seq, f_qual in fwd_records:
            r_name, r_seq, r_qual = None, None, None
            if paired:
                mate = next(rev_records, None)
                if mate is None:
                    logging.error("{} contains fewer reads than {}.\n".format(rev_in, fwd_in))
                    raise AssertionError
                r_name, r_seq, r_qual = mate
                # Ensure sequence names match between the mate-pairs
                if not read_name_match(f_name, r_name):
                    logging.error("Reads in {} and {} are out of order - names do not match:\n"
                                  "{}, {}\n".format(fwd_in, rev_in, f_name, r_name))
                    raise AssertionError
                if not passes(r_seq):
                    pairs_fail_filter += 1
                    continue
            if not passes(f_seq):
                pairs_fail_filter += 1
                continue

            # Write the sequence to the new FASTQ file if the read or mates passes filters
            fwd_out.write("@{}\n{}\n+\n{}\n".format(f_name, f_seq, f_qual))
            if rev_out:
                rev_out.write("@{}\n{}\n+\n{}\n".format(r_name, r_seq, r_qual))
            pairs_pass_filter += 1
    finally:
        # Close the outputs even when a mismatch aborts the filtering
        fwd_out.close()
        if rev_out:
            rev_out.close()

    return {(pairs_pass_filter, pairs_fail_filter): (filtered_fwd, filtered_rev)}


def match_fastqs_to_sample_names(fastq_files: list, amplicon_samples: list) -> list:
    """
    Find the names of the samples that own the given FASTQ files.

    An entry may be a (forward, reverse) tuple, which matches the first sample owning both files,
    or a single path string, which matches every sample owning that file.

    :param fastq_files: List of FASTQ paths and/or (forward, reverse) path tuples
    :param amplicon_samples: A list of AmpliconSample instances
    :return: List of matching sample names
    """
    sample_names = []
    for fq_file in fastq_files:
        is_pair = isinstance(fq_file, tuple)
        is_single = isinstance(fq_file, str)
        for sample in amplicon_samples:  # type: AmpliconSample
            if is_pair:
                fwd, rev = fq_file
                sample_files = sample.collect_fastq_by_stage("all")
                if fwd in sample_files and rev in sample_files:
                    sample_names.append(sample.name)
                    break
            if is_single and fq_file in sample.collect_fastq_by_stage("all"):
                sample_names.append(sample.name)
    return sample_names


def remove_samples(amplicon_samples: list, **kwargs) -> None:
    """
    Pops all AmpliconSamples from amplicon_samples that are provided through a keyword.
    The list is modified in place.

    :param amplicon_samples: A list of AmpliconSample instances
    :param kwargs: At least one of:
     'sample_names' - a list of sample names to remove;
     'failed_fastqs' - a list of FASTQ paths (or path tuples) whose owning samples are removed
    :return: None
    :raises AssertionError: If neither 'sample_names' nor 'failed_fastqs' is provided
    """
    # Either keyword on its own is sufficient; only the absence of both is an error
    if "sample_names" not in kwargs and "failed_fastqs" not in kwargs:
        logging.error("Unable to remove AmpliconSample instances by their '{}'.\n".format(kwargs.keys()))
        raise AssertionError

    # Find the sample names to pop
    names_list = []
    if "sample_names" in kwargs:
        names_list += kwargs["sample_names"]
    if "failed_fastqs" in kwargs:
        names_list += match_fastqs_to_sample_names(amplicon_samples=amplicon_samples,
                                                   fastq_files=kwargs["failed_fastqs"])

    # Remove all samples in names_list
    i = 0
    while i < len(amplicon_samples):
        sample = amplicon_samples[i]  # type: AmpliconSample
        if sample.name in names_list:
            amplicon_samples.pop(i)
            # Each listed name removes a single sample
            names_list.remove(sample.name)
        else:
            i += 1
    return


def validate_sample_inputs(amplicon_samples: list, samples_dir: str, temporary_dir: str, min_seq_length=0, procs=2):
    """
    Locate, pair-check, repair (if needed) and filter the FASTQ files of every sample.

    Samples for which no reads pass the filter are removed from amplicon_samples in place.

    :param amplicon_samples: A list of AmpliconSample instances
    :param samples_dir: Directory containing one sub-directory of FASTQ files per sample
    :param temporary_dir: Directory to write the repaired and filtered FASTQ files
    :param min_seq_length: The minimum sequence length for a read to pass the filter
    :param procs: Number of worker processes used for repairing and filtering
    :return: None
    """
    repair_tasks = []
    pbar = tqdm(ncols=100, total=len(amplicon_samples), desc="Check proper pairs")
    for sample in amplicon_samples:  # type: AmpliconSample
        pbar.set_description(sample.name)

        sample.fetch_raw_fastq(samples_dir)
        sample.summarize_reads("raw")
        if sample.paired_end:
            sample.check_pairing(temporary_dir)
            if sample.repaired:
                repair_tasks.append([sample.raw_fwd, sample.raw_rev, sample.repaired_fwd, sample.repaired_rev])
        pbar.update()
    pbar.close()

    if not repair_tasks:
        logging.debug("All FASTQ files are properly paired.\n")
    else:
        logging.debug("Repairing read pairing across FASTQ files.\n")
        run_apply_async_multiprocessing(func=repair_wrapper,
                                        arguments_list=repair_tasks,
                                        num_processes=procs,
                                        pbar_desc="FASTQ repair")

    # Queue one filtering task per sample, using the files of the 'repair' stage as input
    filter_tasks = []
    for sample in amplicon_samples:
        stage_files = sample.collect_fastq_by_stage("repair")
        fwd_input = stage_files[0]
        rev_input = stage_files[1] if sample.paired_end else None

        sample.filtered_fwd = os.path.join(temporary_dir, sample.name + "_filtered_R1.fastq")
        sample.filtered_rev = os.path.join(temporary_dir, sample.name + "_filtered_R2.fastq")
        filter_tasks.append([fwd_input, sample.filtered_fwd, min_seq_length,
                             rev_input, sample.filtered_rev, sample.paired_end])

    results_list = run_apply_async_multiprocessing(func=filter_reads,
                                                   arguments_list=filter_tasks,
                                                   num_processes=procs,
                                                   pbar_desc="Filter reads")

    # Drop every sample that was left without a single passing read (pair)
    failures = []
    for filtered_dict in results_list:
        (pairs_pass, _), fastqs = filtered_dict.popitem()
        if pairs_pass == 0:
            failures.append(fastqs)
    remove_samples(amplicon_samples, failed_fastqs=failures)

    return


def cutadapt_wrapper(exec_path: str, amplicon_samples: list, temporary_dir: str, threads: int) -> None:
    """
    Run cutadapt on the filtered FASTQ files of each sample and log the primers remaining afterwards.

    The output paths are stored in each sample's 'trim_path_fwd' and 'trim_path_rev' attributes.

    :param exec_path: Path to the cutadapt executable
    :param amplicon_samples: A list of AmpliconSample instances
    :param temporary_dir: Directory to write the trimmed FASTQ files
    :param threads: Value passed to cutadapt's '-j' option
    :return: None
    """
    pbar = tqdm(ncols=100, total=len(amplicon_samples))

    for sample in amplicon_samples:  # type: AmpliconSample
        pbar.set_description(sample.name)
        sample.trim_path_fwd = os.path.join(temporary_dir, sample.name + "_cutadapt_R1.fastq")
        sample.trim_path_rev = os.path.join(temporary_dir, sample.name + "_cutadapt_R2.fastq")
        rc_fwd, rc_rev = sample.rc_primers()
        input_fq_files = sample.collect_fastq_by_stage("filter")

        # Arguments that depend on whether the sample is paired-end
        if sample.paired_end:
            layout_args = ["-n", "4",
                           "-o", sample.trim_path_fwd,
                           "-p", sample.trim_path_rev,
                           "-G", sample.rev_primer, "-A", rc_fwd,
                           input_fq_files[0], input_fq_files[1]]
        else:
            layout_args = ["-n", "2",
                           "-o", sample.trim_path_fwd, input_fq_files[0]]

        common_args = [exec_path,
                       "-g", sample.fwd_primer, "-a", rc_rev,
                       "--quiet", "-j", str(threads),
                       "--trim-n", "--minimum-length", "50"]
        launch_system_command(common_args + layout_args)
        sample.primer_check("trim")
        pbar.update()

    pbar.close()
    return


def repair_wrapper(fwd_reads, rev_reads, repaired_fwd, repaired_rev, ram="2g") -> None:
    """
    Wrapper for BBTools' repair.sh script.
    The command is always run with 'fint=f', 'repair=t' and 'ain=t'.
    NOTE(review): 'ain' is understood to match mates by identical read names rather than requiring
    /1 and /2 suffixes - confirm against the BBTools documentation.

    :param fwd_reads: Path to the forward FASTQ file to repair
    :param rev_reads: Path to the reverse FASTQ file to repair
    :param repaired_fwd: Path to write the repaired forward reads
    :param repaired_rev: Path to write the repaired reverse reads
    :param ram: Value appended to the '-Xmx' option
    :return: None
    """
    io_args = ["in=" + fwd_reads,
               "in2=" + rev_reads,
               "out=" + repaired_fwd,
               "out2=" + repaired_rev]
    repair_cmd = [which("repair.sh"), "fint=f", "repair=t"] + io_args + ["ain=t", "-Xmx" + ram]
    launch_system_command(repair_cmd)
    return


def bbmerge_wrapper(exec_path: str, amplicon_samples: list, temporary_dir: str, threads: int, delete=False) -> None:
    """
    Merge the trimmed mate-pairs of every paired-end sample with bbmerge.

    For each paired-end sample the executable is run twice: once to write adapter sequences to
    '<temporary_dir>/adapters.fa' and once to merge the reads using that adapters file.
    Single-end samples are skipped. The merged file path is stored in the sample's 'merge_path'.

    :param exec_path: Path to the bbmerge executable
    :param amplicon_samples: A list of AmpliconSample instances
    :param temporary_dir: Directory to write the merged FASTQ files and the adapters file
    :param threads: Value passed to the 't=' option
    :param delete: Remove the trimmed FASTQ files once they have been merged
    :return: None
    """
    pbar = tqdm(ncols=100, total=len(amplicon_samples))

    for sample in amplicon_samples:  # type: AmpliconSample
        pbar.set_description(sample.name)
        if sample.paired_end:
            sample.merge_path = os.path.join(temporary_dir, sample.name + "_merged.fastq")
            adapters_fa = os.path.join(temporary_dir, "adapters.fa")
            threads_arg = "t=" + str(threads)
            fwd_arg = "in1=" + sample.trim_path_fwd
            rev_arg = "in2=" + sample.trim_path_rev

            # Find the adapter sequences
            launch_system_command([exec_path, threads_arg, fwd_arg, rev_arg,
                                   "outa=" + adapters_fa])

            # Merge the FASTQ files
            stdout, _ = launch_system_command([exec_path, threads_arg, "iupacton=t", fwd_arg, rev_arg,
                                               "out=" + sample.merge_path,
                                               "adapters=" + adapters_fa])
            logging.debug(stdout)

            if delete:
                for trimmed_fq in (sample.trim_path_fwd, sample.trim_path_rev):
                    os.remove(trimmed_fq)
        pbar.update()

    pbar.close()
    return


def fq2fa(fastq: str, fa: str) -> None:
    """
    Convert a FASTQ file to FASTA by writing the name and sequence of every read.

    An empty FASTQ file yields an empty FASTA file.

    :param fastq: Path to the input FASTQ file
    :param fa: Path of the FASTA file to (over)write
    :return: None
    """
    # The context manager closes the output even if reading the FASTQ fails part-way
    with open(fa, 'w') as fa_handler:
        fq = Fastx(fastx_path=fastq, fx_format="fastq")
        if fq.st_size:
            for name, seq, _ in fq.fx_handler:
                fa_handler.write(">{}\n{}\n".format(name, seq))

    return


def vxtractor_command_generator(exec_path: str, hmm_dir: str, var_pos_tbl: str, fa_extracted: str, fa_in: str,
                                region: str = "V3.-.V5") -> list:
    """
    Assemble the command-line call for V-Xtractor.

    :param exec_path: Path to the V-Xtractor executable
    :param hmm_dir: Value for the '-h' option (directory of profile HMMs)
    :param var_pos_tbl: Value for the '-c' option (path of the coordinates table)
    :param fa_extracted: Value for the '-o' option (path of the FASTA of extracted sequences)
    :param fa_in: Path to the input FASTA file
    :param region: Value for the '-r' option; defaults to the previously hard-coded 'V3.-.V5'
    :return: The command as a list of strings
    """
    vx_cmd = [exec_path,
              "-r", region,
              "-h", hmm_dir,
              "-c", var_pos_tbl,
              "-o", fa_extracted,
              fa_in]
    return vx_cmd


def cmsearch_command_generator(exec_path: str, fasta_file: str, cov_model: str, output_tbl: str,
                               threads=2, hmm_only=False) -> list:
    """
    Assemble the argument list for a cmsearch run that writes a tabular output.

    :param exec_path: Path to the cmsearch executable
    :param fasta_file: FASTA file of target sequences, given as the last argument
    :param cov_model: Covariance model file, given before the FASTA file
    :param output_tbl: Value passed to the '--tblout' option
    :param threads: Value passed to the '--cpu' option
    :param hmm_only: When truthy '--hmmonly' is added, otherwise '--rfam'
    :return: The command as a list of strings
    """
    filter_flag = "--hmmonly" if hmm_only else "--rfam"

    return [exec_path,
            "--cpu", str(threads),
            "--noali",
            "--tformat", "fasta",
            "--tblout", output_tbl,
            filter_flag,
            cov_model, fasta_file]


def get_model_coordinates_for_fragments(amplicon_samples: list, tmp_dir: str, extractor="infernal",
                                        cmfile=None, hmms_path=None, hmmonly=False, threads=2, proc_threads=2) -> None:
    """
    Convert each sample's FASTQ to FASTA and run the chosen extractor on it in a process pool.

    For every sample, var_pos_tbl is set to '<tmp_dir>/<sample.name>_aln_coords.csv' and
    the extractor is instructed to write its coordinates table there.

    :param amplicon_samples: List of AmpliconSample instances
    :param tmp_dir: Directory receiving the FASTA files and the coordinate tables
    :param extractor: "infernal" runs cmsearch; any other value runs vxtractor.pl
    :param cmfile: Covariance model file handed to cmsearch
    :param hmms_path: HMM directory handed to vxtractor.pl
    :param hmmonly: Forwarded to cmsearch_command_generator as hmm_only
    :param threads: Total number of threads available
    :param proc_threads: Number of threads given to each cmsearch process
    :return: None
    """
    task_list = list()
    use_infernal = extractor == "infernal"

    # The pool must hold at least one worker: multiprocessing.Pool(0) raises a ValueError,
    # which used to happen when threads < proc_threads / 2 or when there were no samples.
    if use_infernal:
        procs = max(1, round(threads / proc_threads))
        exec_path = which("cmsearch")
    else:
        procs = max(1, int(threads))
        exec_path = which("vxtractor.pl")

    logging.info("Creating fasta files from sample FASTQs for alignment... ")
    for sample in amplicon_samples:  # type: AmpliconSample
        sample.var_pos_tbl = os.path.join(tmp_dir, sample.name + "_aln_coords.csv")
        input_fq = sample.collect_fastq_by_stage("merge").pop()
        tmp_fa = os.path.join(tmp_dir, sample.name + ".fa")

        # Convert fastq file to FASTA  # Either the merged or trim_fwd fastq
        logging.debug("Creating fasta file from {}... ".format(input_fq))
        fq2fa(fastq=input_fq, fa=tmp_fa)
        logging.debug("done.\n")

        if use_infernal:
            extract_cmd = cmsearch_command_generator(exec_path,
                                                     fasta_file=tmp_fa,
                                                     cov_model=cmfile,
                                                     output_tbl=sample.var_pos_tbl,
                                                     hmm_only=hmmonly,
                                                     threads=proc_threads)
        else:
            tmp_extracted_fa = os.path.join(tmp_dir, sample.name + "_vxtractor.fa")
            extract_cmd = vxtractor_command_generator(exec_path,
                                                      hmm_dir=hmms_path,
                                                      var_pos_tbl=sample.var_pos_tbl,
                                                      fa_extracted=tmp_extracted_fa,
                                                      fa_in=tmp_fa)

        task_list.append(extract_cmd)
    logging.info("done.\n")

    logging.info("Identifying regions in merged reads with {}\n".format(extractor))

    pool = multiprocessing.Pool(procs)
    pbar = tqdm(ncols=100, total=len(task_list), desc=extractor)

    def update(*_):
        pbar.update()

    def report_failure(error):
        # Without an error callback an exception raised in a worker is silently discarded
        logging.error("A {} command failed: {}\n".format(extractor, error))
        pbar.update()

    for cmd in task_list:
        pool.apply_async(launch_system_command, args=(cmd,), callback=update, error_callback=report_failure)

    pool.close()
    pool.join()

    pbar.close()

    return


def parse_hmmer_dom_table(tbl_path: str) -> dict:
    """
    Read a tabular alignment file with InfernalTblParser and index its HSPs by name.

    :param tbl_path: Path to the table to parse
    :return: Dictionary mapping InfernalTblParser.hsp_name(hsp) to the HSP; empty when
     the parser yields no query results
    """
    hsps_by_name = {}
    with open(tbl_path) as tbl_handle:
        for query_result in InfernalTblParser(tbl_handle):  # type: QueryResult
            # NOTE(review): the dictionary is rebuilt for each query result, so only the
            # HSPs of the last query result are returned - confirm this is intended.
            hsps_by_name = dict((InfernalTblParser.hsp_name(hsp), hsp) for hsp in query_result.hsps)

    return hsps_by_name


def get_model_primer_coords(cmsearch: str, cov_model: str, primers_fa: str, tmp_dir="./", delete=False) -> namedtuple:
    """
    Align a pair of primers to a covariance model and return the interval between them.

    cmsearch is run on the primers FASTA and its table is written to
    '<tmp_dir>/primer_aln.txt'. The two resulting HSPs are ordered by query_start; the
    interval runs from the first HSP's query_end to the second HSP's query_start.

    :param cmsearch: Path to the cmsearch executable
    :param cov_model: Path to the covariance model file
    :param primers_fa: Path to the FASTA file holding the primer sequences
    :param tmp_dir: Directory where the alignment table is written
    :param delete: When True, the alignment table is removed after it has been parsed
    :return: A coord_range namedtuple with 'start' and 'stop' attributes
    :raises ValueError: If the alignment table does not contain exactly two primer alignments
    """
    # Temporary file paths:
    primer_coords_tbl = os.path.join(tmp_dir, "primer_aln.txt")

    # Run Infernal's cmsearch to align the primers to the covariance model
    infernal_cmd = [cmsearch, "-g",  # Need the 'glocal' (-g) alignment otherwise short alignments may not be returned
                    "--cpu", str(1),
                    "--noali",
                    "--tformat", "fasta",
                    "--tblout", primer_coords_tbl,
                    cov_model, primers_fa]
    launch_system_command(infernal_cmd)

    # Read the alignment coordinates
    primer_hsps = parse_hmmer_dom_table(primer_coords_tbl)

    # Clean up intermediate files
    if os.path.isfile(primer_coords_tbl) and delete:
        os.remove(primer_coords_tbl)

    # A bare two-way unpack would fail with a cryptic message when a primer did not align
    if len(primer_hsps) != 2:
        raise ValueError("Expected exactly two primer alignments between '{}' and '{}' but found {}."
                         "".format(primers_fa, cov_model, len(primer_hsps)))

    start_hsp, stop_hsp = sorted(primer_hsps.values(), key=lambda x: x.query_start)
    primers_range = _COORD_RANGE(start=start_hsp.query_end, stop=stop_hsp.query_start)

    logging.debug("Target region spans nucleotides {} to {} on '{}'.\n"
                  "".format(primers_range.start, primers_range.stop, cov_model))

    return primers_range


def trim_samples_to_variable_region(amplicon_samples: list, output_dir: str, extractor: str, primer_range=None) -> None:
    """
    Trim each sample's reads to the coordinates found by the extractor.

    Trimmed reads are written to '<output_dir>/<sample.name>_extracted.fq', which is also
    stored in the sample's final_fq attribute. Paired-end samples are trimmed from their
    merged FASTQ, the others from their forward trimmed FASTQ. Samples without any
    coordinates are skipped with a warning.

    :param amplicon_samples: List of AmpliconSample instances
    :param output_dir: Directory receiving the extracted FASTQ files
    :param extractor: Either "infernal" or "vxtractor"; any other value stops the
     function with a warning and without trimming
    :param primer_range: Passed to find_trim_coords_from_infernal_alignments for "infernal"
    :return: None
    """
    # The context manager closes the progress bar on every exit path, including the
    # early return for an unsupported extractor.
    with tqdm(ncols=100, total=len(amplicon_samples)) as pbar:
        for sample in amplicon_samples:  # type: AmpliconSample
            pbar.set_description(sample.name)
            if extractor == "infernal":
                v_positions = sample.find_trim_coords_from_infernal_alignments(primer_range)
            elif extractor == "vxtractor":
                v_positions = sample.read_vxtractor_coords(start_name="V3rightlong", end_name="V5leftlong")
            else:
                logging.warning("Unsupported extractor '{}' - no reads were trimmed.\n".format(extractor))
                return

            if sample.n_frags_aligned > 0:
                logging.debug("{}% ({}/{}) reads identified with start and stop coordinates in '{}'.\n"
                              "".format(round(100 * sample.n_good_extracts / sample.n_frags_aligned, 2),
                                        sample.n_good_extracts, sample.n_frags_aligned, sample.var_pos_tbl))

            if not v_positions:
                logging.warning("No variable regions detected in {} reads.\n".format(sample.name))
                pbar.update()
                continue

            # Stream the fastq file, and write the extracted positions
            sample.final_fq = os.path.join(output_dir, sample.name + "_extracted.fq")
            source_fq = sample.merge_path if sample.paired_end else sample.trim_path_fwd
            trim_fq(source_fq, v_positions, sample.final_fq)
            sample.summarize_reads("final")
            pbar.update()

    return


def trim_fq(fq_in: str, trim_positions: dict, fq_out: str) -> None:
    """
    Write the reads of a FASTQ file trimmed to per-read start and end positions.

    Reads whose name is missing from trim_positions, and reads whose trimmed sequence or
    quality string is empty, are left out of the output; both are counted and reported
    in a debug message. Nothing is written when the input file is empty.

    :param fq_in: Path to the FASTQ file to trim
    :param trim_positions: Dictionary mapping a read name to its (start, end) slice positions
    :param fq_out: Path to the trimmed FASTQ file to write
    :return: None
    """
    fx = Fastx(fastx_path=fq_in)
    if fx.st_size == 0:
        logging.warning("Unable to trim FASTQ '{}' to target region - file is empty.\n".format(fx.file_name))
        return

    no_coords = 0
    zero_len = 0

    # The context manager closes the output even if reading or slicing raises
    with open(fq_out, 'w') as fq_out_handler:
        for name, seq, qual in fx.fx_handler:
            try:
                start, end = trim_positions[name]
            except KeyError:
                no_coords += 1
                continue

            trimmed_seq = seq[start:end]
            trimmed_qual = qual[start:end]
            if not trimmed_seq or not trimmed_qual:
                # Previously incremented by zero, so these reads were never counted
                zero_len += 1
                continue

            fq_out_handler.write("@{}\n{}\n+\n{}\n".format(name, trimmed_seq, trimmed_qual))

    # Summarize:
    logging.debug("Trimming {}:\n"
                  "\t{} reads lacking coordinates.\n"
                  "\t{} reads trimmed to zero bp.\n".format(fq_in, no_coords, zero_len))

    return


def main(cli_args):
    """
    Run the whole extraction workflow for the given command-line arguments.

    Steps, in order: parse and validate the options, locate the primer interval on the
    covariance model (infernal guide only), read the sample-to-primer table, validate
    the FASTQ inputs, trim with cutadapt, merge paired-end reads, find per-read
    coordinates with the chosen extractor and trim the reads to the target region.

    :param cli_args: List of command-line arguments, without the program name
    :return: None
    """
    args = get_options(cli_args)
    tmp_dir = "".join([args.output_dir, os.sep, "tmp", os.sep])
    prep_for_analysis(args.fastq_dir, args.output_dir, tmp_dir)
    prep_logging(log_file_name=os.path.join(args.output_dir, "log.txt"), verbosity=args.verbose)
    validate_arguments(args)
    validate_executables(extractor=args.extraction_guide)

    # Only the infernal guide needs the interval between the primers on the model
    primers_range = (get_model_primer_coords(which("cmsearch"),
                                             cov_model=args.cmfile,
                                             primers_fa=args.guide_primers,
                                             tmp_dir=tmp_dir)
                     if args.extraction_guide == "infernal" else None)

    # Read the file mapping sample to primers
    samples = read_sample_primer_table(args.primers)

    logging.info("Validating FASTQ inputs for samples\n")
    validate_sample_inputs(samples, args.fastq_dir, tmp_dir, args.min_read_length, args.threads)

    # TODO: support paired-end interleaved FASTQ files
    logging.info("Trimming adapters and primers from reads with cutadapt\n")
    cutadapt_wrapper(which("cutadapt"), samples, tmp_dir, args.threads)
    clear_intermediate_fastqs(samples, args.delete, "filter")

    logging.info("Merging paired-end reads with BBMerge\n")
    bbmerge_wrapper(which("bbmerge.sh"), samples, tmp_dir, args.threads)

    get_model_coordinates_for_fragments(samples, tmp_dir,
                                        extractor=args.extraction_guide,
                                        hmms_path=args.hmm_path,
                                        cmfile=args.cmfile,
                                        hmmonly=args.hmmonly,
                                        threads=args.threads,
                                        proc_threads=args.infernal_threads)

    logging.info("Trimming merged reads to just include V4 interval\n")
    trim_samples_to_variable_region(samples, args.output_dir, args.extraction_guide, primers_range)

    if args.delete:
        logging.info("Removing temporary files... ")
        shutil.rmtree(tmp_dir)
        logging.info("done.\n")

    logging.info("Extraction complete.\n")

    return


# Script entry point: hand every argument after the program name to main()
if __name__ == "__main__":
    command_line_arguments = sys.argv[1:]
    main(command_line_arguments)