Skip to content

Commit

Permalink
Improve CSV output
Browse files Browse the repository at this point in the history
This adds transcriptions to the CSV output and relabels the
scores to more clearly indicate graphic vs. phonetic similarity
of matches.

Closes #362
  • Loading branch information
thatbudakguy committed Jun 17, 2024
1 parent 240f4b0 commit c27854a
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 14 deletions.
2 changes: 1 addition & 1 deletion dphon/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def run() -> None:
for match in results:
writer.write(match.as_dict())
elif args["--output-format"] == "csv":
fieldnames = Match("", "", "", "").as_dict().keys()
fieldnames = results[0].as_dict().keys()
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
for match in results:
Expand Down
7 changes: 0 additions & 7 deletions dphon/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,3 @@ def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]"
cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]"
return (cul, cur, cvl, cvr)

def transcription(self, match: Match) -> Tuple[str, str]:
    """Build the display transcription for both sides of a match.

    The syllables found on the ``_.syllables`` extension of ``match.utxt``
    and ``match.vtxt`` are each joined with single spaces and prefixed
    with an asterisk.

    Returns:
        A ``(u, v)`` pair of transcription strings.
    """
    u_syllables = match.utxt._.syllables
    v_syllables = match.vtxt._.syllables
    return ("*" + " ".join(u_syllables), "*" + " ".join(v_syllables))
53 changes: 47 additions & 6 deletions dphon/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""The Match class for encoding text reuse relationships."""

import math
from typing import Dict, List, NamedTuple
from typing import Dict, List, NamedTuple, Tuple

import Levenshtein as Lev
from rich.padding import Padding
Expand Down Expand Up @@ -32,7 +32,7 @@ def __rich_console__(
"""Format the match for display in console."""
# get colorized match text and transcription
su, sv = console.highlighter.format_match(self) # type: ignore
pu, pv = console.highlighter.transcription(self) # type: ignore
pu, pv = self.transcription

# add left-padding to align with match numbers, and bottom-padding
# so that there's a space between matches in output
Expand All @@ -49,27 +49,68 @@ def __rich_console__(
pv,
)

@property
def u_transcription(self) -> str:
    """Transcription of the ``u`` side of the match.

    The syllables on ``self.utxt._.syllables`` are joined with single
    spaces and the result is prefixed with an asterisk.
    """
    syllables = self.utxt._.syllables
    return "*" + " ".join(syllables)

@property
def v_transcription(self) -> str:
    """Transcription of the ``v`` side of the match.

    The syllables on ``self.vtxt._.syllables`` are joined with single
    spaces and the result is prefixed with an asterisk.
    """
    syllables = self.vtxt._.syllables
    return "*" + " ".join(syllables)

@property
def weighted_score(self) -> float:
    """Ratio of phonemic similarity to graphic similarity.

    Returns:
        ``self.phonetic_similarity() / self.graphic_similarity()``, or
        ``math.inf`` when that division raises ``ZeroDivisionError``
        (i.e. the graphic similarity is zero).
    """
    try:
        # Single return: the earlier inline form
        # (``self.weight / Lev.seqratio(self.au, self.av)``) computed the
        # same ratio and left this statement unreachable.
        return self.phonetic_similarity() / self.graphic_similarity()
    except ZeroDivisionError:
        return math.inf

@property
def transcription(self) -> Tuple[str, str]:
    """Phonemic transcription of both sides of the match.

    Returns:
        The pair ``(self.u_transcription, self.v_transcription)``.
    """
    u_side = self.u_transcription
    v_side = self.v_transcription
    return (u_side, v_side)

def graphic_similarity(self) -> float:
    """Return the Levenshtein ratio of the two aligned sequences.

    The value is ``Lev.seqratio`` applied to ``self.au`` and ``self.av``.
    """
    aligned_u, aligned_v = self.au, self.av
    return Lev.seqratio(aligned_u, aligned_v)

def phonetic_similarity(self) -> float:
    """Return the similarity score of the sequences' phonetic content.

    This is the value stored in ``self.weight``.
    """
    score = self.weight
    return score

def context(self, chars: int) -> Tuple[str, str, str, str]:
    """Return up to `chars` units of context on each side of the match.

    For each of ``self.utxt`` and ``self.vtxt`` the surrounding document
    (``.doc``) is sliced immediately before ``.start`` and immediately
    after ``.end``; both windows are clamped to the document bounds.

    Returns:
        A 4-tuple, in this order:
        - left context of u
        - right context of u
        - left context of v
        - right context of v
    """
    windows = []
    for span in (self.utxt, self.vtxt):
        doc = span.doc
        begin, finish = span.start, span.end
        before = doc[max(begin - chars, 0) : begin]
        after = doc[finish : min(finish + chars, len(doc))]
        windows.extend((before, after))
    u_left, u_right, v_left, v_right = windows
    return (u_left, u_right, v_left, v_right)

def as_dict(self) -> Dict[str, object]:
    """Dict form for structured output formats.

    Keys are emitted in a fixed order (ids, texts, aligned texts,
    transcriptions, offsets, similarity scores) so the mapping can be used
    directly as a row, with its keys as the column header.

    Values are not all strings: the ``*_start`` / ``*_end`` offsets and
    the two similarity scores are passed through as the values the
    underlying attributes and methods return.

    Returns:
        A new dict describing this match.
    """
    return {
        "u_id": self.u,
        "v_id": self.v,
        "u_text": self.utxt.text,
        "v_text": self.vtxt.text,
        "u_text_aligned": "".join(self.au),
        "v_text_aligned": "".join(self.av),
        "u_transcription": self.u_transcription,
        "v_transcription": self.v_transcription,
        "u_start": self.utxt.start,
        "u_end": self.utxt.end,
        "v_start": self.vtxt.start,
        "v_end": self.vtxt.end,
        # The scores are labelled by what they measure; these two entries
        # take the place of the former "score" / "weighted_score" fields.
        "phonetic_similarity": self.phonetic_similarity(),
        "graphic_similarity": self.graphic_similarity(),
    }

0 comments on commit c27854a

Please sign in to comment.