Skip to content

Commit

Permalink
Update matrices and run tables for forward and reverse comparisons
Browse files Browse the repository at this point in the history
  • Loading branch information
kiepczi committed Feb 29, 2024
1 parent 56f0438 commit 693d0c2
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 39 deletions.
20 changes: 11 additions & 9 deletions pyani/anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def parse_delta(filename: Path) -> Tuple[int, int, int]:
for interval in ref_tree:
saln_length += interval.end - interval.begin + 1

return (saln_length, qaln_length, sim_errors)
return (qaln_length, saln_length, sim_errors)


# Parse all the .delta files in the passed directory
Expand Down Expand Up @@ -434,31 +434,33 @@ def process_deltadir(
deltafile,
)
continue
tot_length, tot_sim_error = parse_delta(deltafile)
if tot_length == 0 and logger is not None:
query_tot_length, subject_tot_length, tot_sim_error = parse_delta(deltafile)
if subject_tot_length == 0 and logger is not None:
if logger:
logger.warning(
"Total alignment length reported in %s is zero!", deltafile
)
sys.exit("Zero length alignment!")
query_cover = float(tot_length) / org_lengths[qname]
sbjct_cover = float(tot_length) / org_lengths[sname]
query_cover = float(query_tot_length) / org_lengths[qname]
sbjct_cover = float(subject_tot_length) / org_lengths[sname]

# Calculate percentage ID of aligned length. This may fail if
# total length is zero.
# The ZeroDivisionError that would arise should be handled
# Common causes are that a NUCmer run failed, or that a very
# distant sequence was included in the analysis.
try:
perc_id = 1 - float(tot_sim_error) / tot_length
subject_perc_id = 1 - float(tot_sim_error) / subject_tot_length
query_perc_id = 1 - float(tot_sim_error) / query_tot_length
except ZeroDivisionError:
perc_id = 0 # set arbitrary value of zero identity
subject_perc_id = 0 # set arbitrary value of zero identity
query_perc_id = 0
results.zero_error = True

# Populate dataframes: when assigning data from symmetrical MUMmer
# output, both upper and lower triangles will be populated
results.add_tot_length(qname, sname, tot_length)
results.add_tot_length(qname, sname, query_tot_length, subject_tot_length)
results.add_sim_errors(qname, sname, tot_sim_error)
results.add_pid(qname, sname, perc_id)
results.add_pid(qname, sname, query_perc_id, subject_perc_id)
results.add_coverage(qname, sname, query_cover, sbjct_cover)
return results
29 changes: 18 additions & 11 deletions pyani/pyani_orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,11 @@ class Comparison(Base):
comparison_id = Column(Integer, primary_key=True)
query_id = Column(Integer, ForeignKey("genomes.genome_id"), nullable=False)
subject_id = Column(Integer, ForeignKey("genomes.genome_id"), nullable=False)
aln_length = Column(Integer) # in fastANI this is matchedfrags * fragLength
query_aln_length = Column(Integer) # in fastANI this is matchedfrags * fragLength
subject_aln_length = Column(Integer)
sim_errs = Column(Integer) # in fastANI this is allfrags - matchedfrags
identity = Column(Float)
query_identity = Column(Float)
subject_identity = Column(Float)
cov_query = Column(Float) # in fastANI this is matchedfrags/allfrags
cov_subject = Column(Float) # in fastANI this is Null
program = Column(String)
Expand All @@ -307,7 +309,7 @@ def __str__(self) -> str:
"Query: {}, Subject: {}, %%ID={}, ({} {}), FragSize: {}, MaxMatch: {}, KmerSize: {}, MinMatch: {}".format(
self.query_id,
self.subject_id,
self.identity,
self.query_identity,
self.program,
self.version,
self.fragsize,
Expand Down Expand Up @@ -349,6 +351,7 @@ def get_comparison_dict(session: Any) -> Dict[Tuple, Any]:
Returns Comparison objects, keyed by (_.query_id, _.subject_id,
_.program, _.version, _.fragsize, _.maxmatch) tuple
"""

return {
(
_.query_id,
Expand Down Expand Up @@ -615,6 +618,8 @@ def add_run_genomes(


def update_comparison_matrices(session, run) -> None:
logger = logging.getLogger(__name__)

"""Update the Run table with summary matrices for the analysis.
:param session: active pyanidb session via ORM
Expand All @@ -638,19 +643,21 @@ def update_comparison_matrices(session, run) -> None:
df_alnlength.loc[genome.genome_id, genome.genome_id] = genome.length

# Loop over all comparisons for the run and fill in result matrices

logger.debug("Existing comparisons\n%s", run.comparisons.all())
for cmp in run.comparisons.all():
qid, sid = cmp.query_id, cmp.subject_id
df_identity.loc[qid, sid] = cmp.identity
df_identity.loc[qid, sid] = cmp.query_identity
df_coverage.loc[qid, sid] = cmp.cov_query
df_alnlength.loc[qid, sid] = cmp.aln_length
df_alnlength.loc[qid, sid] = cmp.query_aln_length
df_simerrors.loc[qid, sid] = cmp.sim_errs
df_hadamard.loc[qid, sid] = cmp.identity * cmp.cov_query
if cmp.program in ["nucmer"]:
df_hadamard.loc[sid, qid] = cmp.identity * cmp.cov_subject
df_hadamard.loc[qid, sid] = cmp.query_identity * cmp.cov_query
if (qid, sid) == (sid, qid):
df_hadamard.loc[sid, qid] = cmp.query_identity * cmp.cov_subject
df_simerrors.loc[sid, qid] = cmp.sim_errs
df_alnlength.loc[sid, qid] = cmp.aln_length
df_coverage.loc[sid, qid] = cmp.cov_subject
df_identity.loc[sid, qid] = cmp.identity
df_alnlength.loc[sid, qid] = cmp.query_aln_length
df_coverage.loc[sid, qid] = cmp.cov_query
df_identity.loc[sid, qid] = cmp.query_identity

# Add matrices to the database
run.df_identity = df_identity.to_json()
Expand Down
30 changes: 20 additions & 10 deletions pyani/pyani_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,11 @@ def __init__(self, labels: List[str], mode: str) -> None:
self.mode = mode

def add_tot_length(
self, qname: str, sname: str, value: float, sym: bool = True
self,
qname: str,
sname: str,
query_alignment_length: float,
subject_alignment_length: Optional[float] = None,
) -> None:
"""Add a total length value to self.alignment_lengths.
Expand All @@ -115,9 +119,9 @@ def add_tot_length(
:param value:
:param sym:
"""
self.alignment_lengths.loc[qname, sname] = value
if sym:
self.alignment_lengths.loc[sname, qname] = value
self.alignment_lengths.loc[qname, sname] = query_alignment_length
if subject_alignment_length:
self.alignment_lengths.loc[sname, qname] = subject_alignment_length

def add_sim_errors(
self, qname: str, sname: str, value: float, sym: bool = True
Expand All @@ -133,17 +137,23 @@ def add_sim_errors(
if sym:
self.similarity_errors.loc[sname, qname] = value

def add_pid(self, qname: str, sname: str, value: float, sym: bool = True) -> None:
def add_pid(
self,
qname: str,
sname: str,
qidentity: float,
sidentity: Optional[float] = None,
) -> None:
"""Add a percentage identity value to self.percentage_identity.
:param qname:
:param sname:
:param value:
:param sym:
:param qidentity:
:param sidentity:
"""
self.percentage_identity.loc[qname, sname] = value
if sym:
self.percentage_identity.loc[sname, qname] = value
self.percentage_identity.loc[qname, sname] = qidentity
if sidentity:
self.percentage_identity.loc[sname, qname] = sidentity

def add_coverage(
self, qname: str, sname: str, qcover: float, scover: Optional[float] = None
Expand Down
14 changes: 9 additions & 5 deletions pyani/scripts/subcommands/subcmd_anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def update_comparison_results(
# Add individual results to Comparison table
for job in tqdm(joblist, disable=args.disable_tqdm):
logger.debug("\t%s vs %s", job.subject.description, job.query.description)
saln_length, qaln_length, sim_errs = anim.parse_delta(job.outfile)
qaln_length, saln_length, sim_errs = anim.parse_delta(job.outfile)
qcov = qaln_length / job.query.length
scov = saln_length / job.subject.length
logger.debug(
Expand All @@ -460,16 +460,20 @@ def update_comparison_results(
job.subject.description,
)
try:
pid = 1 - sim_errs / qaln_length
query_pid = 1 - sim_errs / qaln_length
subject_pid = 1 - sim_errs / saln_length
except ZeroDivisionError: # aln_length was zero (no alignment)
pid = 0
query_pid = 0
subject_pid = 0
run.comparisons.append(
Comparison(
query=job.query,
subject=job.subject,
aln_length=qaln_length,
query_aln_length=qaln_length,
subject_aln_length=saln_length,
sim_errs=sim_errs,
identity=pid,
query_identity=query_pid,
subject_identity=subject_pid,
cov_query=qcov,
cov_subject=scov,
program="nucmer",
Expand Down
12 changes: 8 additions & 4 deletions pyani/scripts/subcommands/subcmd_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,12 @@ def subcmd_report(args: Namespace) -> int:
genome_query.description,
Comparison.subject_id,
genome_subject.description,
Comparison.identity,
Comparison.query_identity,
Comparison.subject_identity,
Comparison.cov_query,
Comparison.cov_subject,
Comparison.aln_length,
Comparison.query_aln_length,
Comparison.subject_aln_length,
Comparison.sim_errs,
Comparison.program,
Comparison.version,
Expand All @@ -255,10 +257,12 @@ def subcmd_report(args: Namespace) -> int:
"Query description",
"Subject ID",
"Subject description",
"% identity",
"% query identity",
"% subject identity",
"% query coverage",
"% subject coverage",
"alignment length",
"query alignment length",
"subject alignment length",
"similarity errors",
"program",
"version",
Expand Down

0 comments on commit 693d0c2

Please sign in to comment.