diff --git a/pyani/anim.py b/pyani/anim.py index 576545ad..ec37c6f2 100644 --- a/pyani/anim.py +++ b/pyani/anim.py @@ -370,7 +370,7 @@ def parse_delta(filename: Path) -> Tuple[int, int, int]: for interval in ref_tree: saln_length += interval.end - interval.begin + 1 - return (saln_length, qaln_length, sim_errors) + return (qaln_length, saln_length, sim_errors) # Parse all the .delta files in the passed directory @@ -434,15 +434,15 @@ def process_deltadir( deltafile, ) continue - tot_length, tot_sim_error = parse_delta(deltafile) - if tot_length == 0 and logger is not None: + query_tot_length, subject_tot_length, tot_sim_error = parse_delta(deltafile) + if subject_tot_length == 0 and logger is not None: if logger: logger.warning( "Total alignment length reported in %s is zero!", deltafile ) sys.exit("Zero length alignment!") - query_cover = float(tot_length) / org_lengths[qname] - sbjct_cover = float(tot_length) / org_lengths[sname] + query_cover = float(query_tot_length) / org_lengths[qname] + sbjct_cover = float(subject_tot_length) / org_lengths[sname] # Calculate percentage ID of aligned length. This may fail if # total length is zero. @@ -450,15 +450,17 @@ def process_deltadir( # Common causes are that a NUCmer run failed, or that a very # distant sequence was included in the analysis. try: - perc_id = 1 - float(tot_sim_error) / tot_length + subject_perc_id = 1 - float(tot_sim_error) / subject_tot_length + query_perc_id = 1 - float(tot_sim_error) / query_tot_length except ZeroDivisionError: - perc_id = 0 # set arbitrary value of zero identity + subject_perc_id = 0 # set arbitrary value of zero identity + query_perc_id = 0 results.zero_error = True # Populate dataframes: when assigning data from symmetrical MUMmer # output, both upper and lower triangles will be populated - results.add_tot_length(qname, sname, tot_length) + results.add_tot_length(qname, sname, query_tot_length, subject_tot_length) results.add_sim_errors(qname, sname, tot_sim_error) - results.add_pid(qname, sname, perc_id) + results.add_pid(qname, sname, query_perc_id, subject_perc_id) results.add_coverage(qname, sname, query_cover, sbjct_cover) return results diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py index ac388860..6e8484d6 100644 --- a/pyani/pyani_orm.py +++ b/pyani/pyani_orm.py @@ -279,9 +279,11 @@ class Comparison(Base): comparison_id = Column(Integer, primary_key=True) query_id = Column(Integer, ForeignKey("genomes.genome_id"), nullable=False) subject_id = Column(Integer, ForeignKey("genomes.genome_id"), nullable=False) - aln_length = Column(Integer) # in fastANI this is matchedfrags * fragLength + query_aln_length = Column(Integer) # in fastANI this is matchedfrags * fragLength + subject_aln_length = Column(Integer) sim_errs = Column(Integer) # in fastANI this is allfrags - matchedfrags - identity = Column(Float) + query_identity = Column(Float) + subject_identity = Column(Float) cov_query = Column(Float) # in fastANI this is matchedfrags/allfrags cov_subject = Column(Float) # in fastANI this is Null program = Column(String) @@ -307,7 +309,7 @@ def __str__(self) -> str: "Query: {}, Subject: {}, %%ID={}, ({} {}), FragSize: {}, MaxMatch: {}, KmerSize: {}, MinMatch: {}".format( self.query_id, self.subject_id, - self.identity, + self.query_identity, self.program, self.version, self.fragsize, @@ -349,6 +351,7 @@ def get_comparison_dict(session: Any) -> Dict[Tuple, Any]: Returns Comparison objects, keyed by (_.query_id, _.subject_id, _.program, _.version, _.fragsize, _.maxmatch) tuple """ + return { ( _.query_id, @@ -615,6 +618,8 @@ def add_run_genomes( def update_comparison_matrices(session, run) -> None: + logger = logging.getLogger(__name__) + """Update the Run table with summary matrices for the analysis. :param session: active pyanidb session via ORM @@ -638,19 +643,21 @@ def update_comparison_matrices(session, run) -> None: df_alnlength.loc[genome.genome_id, genome.genome_id] = genome.length # Loop over all comparisons for the run and fill in result matrices + + logger.debug("Existing comparisons\n%s", run.comparisons.all()) for cmp in run.comparisons.all(): qid, sid = cmp.query_id, cmp.subject_id - df_identity.loc[qid, sid] = cmp.identity + df_identity.loc[qid, sid] = cmp.query_identity df_coverage.loc[qid, sid] = cmp.cov_query - df_alnlength.loc[qid, sid] = cmp.aln_length + df_alnlength.loc[qid, sid] = cmp.query_aln_length df_simerrors.loc[qid, sid] = cmp.sim_errs - df_hadamard.loc[qid, sid] = cmp.identity * cmp.cov_query - if cmp.program in ["nucmer"]: - df_hadamard.loc[sid, qid] = cmp.identity * cmp.cov_subject + df_hadamard.loc[qid, sid] = cmp.query_identity * cmp.cov_query + if (qid, sid) == (sid, qid): + df_hadamard.loc[sid, qid] = cmp.query_identity * cmp.cov_subject df_simerrors.loc[sid, qid] = cmp.sim_errs - df_alnlength.loc[sid, qid] = cmp.aln_length - df_coverage.loc[sid, qid] = cmp.cov_subject - df_identity.loc[sid, qid] = cmp.identity + df_alnlength.loc[sid, qid] = cmp.query_aln_length + df_coverage.loc[sid, qid] = cmp.cov_query + df_identity.loc[sid, qid] = cmp.query_identity # Add matrices to the database run.df_identity = df_identity.to_json() diff --git a/pyani/pyani_tools.py b/pyani/pyani_tools.py index a9b93ae8..13ae1b60 100644 --- a/pyani/pyani_tools.py +++ b/pyani/pyani_tools.py @@ -106,7 +106,11 @@ def __init__(self, labels: List[str], mode: str) -> None: self.mode = mode def add_tot_length( - self, qname: str, sname: str, value: float, sym: bool = True + self, + qname: str, + sname: str, + query_alignment_length: float, + subject_alignment_length: Optional[float] = None, ) -> None: """Add a total length value to self.alignment_lengths. @@ -115,9 +119,9 @@ def add_tot_length( :param value: :param sym: """ - self.alignment_lengths.loc[qname, sname] = value - if sym: - self.alignment_lengths.loc[sname, qname] = value + self.alignment_lengths.loc[qname, sname] = query_alignment_length + if subject_alignment_length: + self.alignment_lengths.loc[sname, qname] = subject_alignment_length def add_sim_errors( self, qname: str, sname: str, value: float, sym: bool = True @@ -133,17 +137,23 @@ def add_sim_errors( if sym: self.similarity_errors.loc[sname, qname] = value - def add_pid(self, qname: str, sname: str, value: float, sym: bool = True) -> None: + def add_pid( + self, + qname: str, + sname: str, + qidentity: float, + sidentity: Optional[float] = None, + ) -> None: """Add a percentage identity value to self.percentage_identity. :param qname: :param sname: - :param value: - :param sym: + :param qidentity: + :param sidentity: """ - self.percentage_identity.loc[qname, sname] = value - if sym: - self.percentage_identity.loc[sname, qname] = value + self.percentage_identity.loc[qname, sname] = qidentity + if sidentity: + self.percentage_identity.loc[sname, qname] = sidentity def add_coverage( self, qname: str, sname: str, qcover: float, scover: Optional[float] = None diff --git a/pyani/scripts/subcommands/subcmd_anim.py b/pyani/scripts/subcommands/subcmd_anim.py index 2ce1c389..dd582d08 100644 --- a/pyani/scripts/subcommands/subcmd_anim.py +++ b/pyani/scripts/subcommands/subcmd_anim.py @@ -442,7 +442,7 @@ def update_comparison_results( # Add individual results to Comparison table for job in tqdm(joblist, disable=args.disable_tqdm): logger.debug("\t%s vs %s", job.subject.description, job.query.description) - saln_length, qaln_length, sim_errs = anim.parse_delta(job.outfile) + qaln_length, saln_length, sim_errs = anim.parse_delta(job.outfile) qcov = qaln_length / job.query.length scov = saln_length / job.subject.length logger.debug( @@ -460,16 +460,20 @@ def update_comparison_results( job.subject.description, ) try: - pid = 1 - sim_errs / qaln_length + query_pid = 1 - sim_errs / qaln_length + subject_pid = 1 - sim_errs / saln_length except ZeroDivisionError: # aln_length was zero (no alignment) - pid = 0 + query_pid = 0 + subject_pid = 0 run.comparisons.append( Comparison( query=job.query, subject=job.subject, - aln_length=qaln_length, + query_aln_length=qaln_length, + subject_aln_length=saln_length, sim_errs=sim_errs, - identity=pid, + query_identity=query_pid, + subject_identity=subject_pid, cov_query=qcov, cov_subject=scov, program="nucmer", diff --git a/pyani/scripts/subcommands/subcmd_report.py b/pyani/scripts/subcommands/subcmd_report.py index f62aecbe..ea4d9898 100644 --- a/pyani/scripts/subcommands/subcmd_report.py +++ b/pyani/scripts/subcommands/subcmd_report.py @@ -232,10 +232,12 @@ def subcmd_report(args: Namespace) -> int: genome_query.description, Comparison.subject_id, genome_subject.description, - Comparison.identity, + Comparison.query_identity, + Comparison.subject_identity, Comparison.cov_query, Comparison.cov_subject, - Comparison.aln_length, + Comparison.query_aln_length, + Comparison.subject_aln_length, Comparison.sim_errs, Comparison.program, Comparison.version, @@ -255,10 +257,12 @@ def subcmd_report(args: Namespace) -> int: "Query description", "Subject ID", "Subject description", - "% identity", + "% query identity", + "% subject identity", "% query coverage", "% subject coverage", - "alignment length", + "query alignment length", + "subject alignment length", "similarity errors", "program", "version",