Apply black code style (#121)
thatbudakguy committed Sep 1, 2023
1 parent 3c5cb0e commit 59c2f25
Showing 12 changed files with 140 additions and 118 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -7,6 +7,7 @@
![pyversions](https://img.shields.io/pypi/pyversions/dphon.svg?style=flat)
[![zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.4641277.svg)](https://zenodo.org/record/4641277)
[![spaCy](https://img.shields.io/static/v1?label=made%20with%20%E2%9D%A4%20and&message=spaCy&color=09a3d5)](https://spacy.io)
+[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

## installation

2 changes: 1 addition & 1 deletion dphon/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.0-dev'
+__version__ = "2.1.0-dev"
17 changes: 8 additions & 9 deletions dphon/align.py
@@ -42,7 +42,7 @@ class SmithWatermanAligner(Aligner):
def __init__(self, scorer: Scorer_T = None, gap_char: str = "-") -> None:
self.scorer = scorer
self.gap_char = gap_char
logging.info(f"using {self.__class__} with gap_char=\"{gap_char}\"")
logging.info(f'using {self.__class__} with gap_char="{gap_char}"')

def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
"""Get the two sequences to compare."""
@@ -56,16 +56,17 @@ def __call__(self, match: Match) -> Match:
and sequence texts calculated for the alignment."""

# compute the alignment and keep non-aligned regions
-        (lu, cu, _ru), (lv, cv, _rv), score = sw_align(*self._get_seqs(match),
-                                                       self.scorer)
+        (lu, cu, _ru), (lv, cv, _rv), score = sw_align(
+            *self._get_seqs(match), self.scorer
+        )

# use lengths of non-aligned regions to move the sequence boundaries
# [...] ["A", "B", "C"] [...]
# ----> <----
u, v = match.utxt.doc, match.vtxt.doc
us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv)
-        utxt = u[us:us + len(cu)]
-        vtxt = v[vs:vs + len(cv)]
+        utxt = u[us : us + len(cu)]
+        vtxt = v[vs : vs + len(cv)]

# use the gaps in the alignment to construct a new sequence of token
# texts, inserting gap_char wherever the aligner created a gap
@@ -113,8 +114,6 @@ def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
# combine the phonemes for each token into a single string; if there's
# no phonetic content, use the token text in place of the phonemes
return (
["".join([p or "" for p in t._.phonemes])
or t.text for t in match.utxt],
["".join([p or "" for p in t._.phonemes])
or t.text for t in match.vtxt],
["".join([p or "" for p in t._.phonemes]) or t.text for t in match.utxt],
["".join([p or "" for p in t._.phonemes]) or t.text for t in match.vtxt],
)
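Two details in dphon/align.py are worth a gloss. The new `u[us : us + len(cu)]` spacing is black's PEP 8 treatment of slices whose bounds are compound expressions. And the comment about "non-aligned regions" describes how the aligned span is recovered: the left non-aligned segment of each sequence shifts the span start, and the aligned core sets its length. A toy sketch of that arithmetic, with invented values rather than dphon's real `sw_align` output:

```python
# Toy sketch of the boundary arithmetic (invented values, not dphon's API).
# sw_align yields (left, core, right) segments per sequence plus a score.
lu, cu = ["x"], ["A", "B", "C"]        # u: 1 leading token unaligned, 3 aligned
lv, cv = ["y", "z"], ["A", "B", "C"]   # v: 2 leading tokens unaligned, 3 aligned

start_u = start_v = 0                  # original span starts
us, vs = start_u + len(lu), start_v + len(lv)

print(us, us + len(cu))  # 1 4 -> aligned span of u covers tokens [1:4]
print(vs, vs + len(cv))  # 2 5 -> aligned span of v covers tokens [2:5]
```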
37 changes: 21 additions & 16 deletions dphon/cli.py
@@ -110,18 +110,22 @@ def run() -> None:
args = docopt(__doc__, version=__version__)

# install global logging and exception handlers
-    logging.basicConfig(level=LOG_LEVELS[args["-v"]], format="%(message)s",
-                        datefmt="[%X]", handlers=[RichHandler(console=err_console)])
+    logging.basicConfig(
+        level=LOG_LEVELS[args["-v"]],
+        format="%(message)s",
+        datefmt="[%X]",
+        handlers=[RichHandler(console=err_console)],
+    )
logging.captureWarnings(True)
traceback.install()

# setup pipeline
nlp = setup(args)

# setup match highlighting
-    console.highlighter = MatchHighlighter(g2p=nlp.get_pipe("g2p"),
-                                           context=int(args["--context"]),
-                                           gap_char=" ")
+    console.highlighter = MatchHighlighter(
+        g2p=nlp.get_pipe("g2p"), context=int(args["--context"]), gap_char=" "
+    )

# process all texts
graph = process(nlp, args)
@@ -158,17 +162,15 @@ def run() -> None:
def setup(args: Dict) -> Language:
"""Set up the spaCy processing pipeline."""
# get sound table
-    v2_path = pkg_resources.resource_filename(
-        __package__, "data/sound_table_v2.json")
+    v2_path = pkg_resources.resource_filename(__package__, "data/sound_table_v2.json")
sound_table = get_sound_table_json(Path(v2_path))

# add Doc metadata
if not Doc.has_extension("id"):
Doc.set_extension("id", default="")

# setup spaCy model
-    nlp = spacy.blank(
-        "zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
+    nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
nlp.add_pipe("g2p", config={"sound_table": sound_table})
nlp.add_pipe("ngrams", config={"n": int(args["--ngram-order"])})
nlp.add_pipe("ngram_phonemes_index", name="index")
@@ -191,7 +193,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
for doc, context in nlp.pipe(load_texts(args["<path>"]), as_tuples=True):
doc._.id = context["id"]
graph.add_doc(context["id"], doc)
logging.debug(f"indexed doc \"{doc._.id}\"")
logging.debug(f'indexed doc "{doc._.id}"')
stop = time.perf_counter() - start
logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")

@@ -214,12 +216,14 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
with progress:
for _seed, locations in groups:
logging.debug(
f"evaluating seed group \"{locations[0].text}\", size={len(locations)}")
f'evaluating seed group "{locations[0].text}", size={len(locations)}'
)
progress.update(task, seed=locations[0].text)
for utxt, vtxt in combinations(locations, 2):
if utxt.doc._.id != vtxt.doc._.id: # skip same-doc matches
graph.add_match(
-                        Match(utxt.doc._.id, vtxt.doc._.id, utxt, vtxt, 1.0))
+                        Match(utxt.doc._.id, vtxt.doc._.id, utxt, vtxt, 1.0)
+                    )
progress.advance(task)
stop = time.perf_counter() - start
logging.info(f"seeded {graph.number_of_matches()} matches in {stop:.1f}s")
Expand All @@ -230,10 +234,11 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
graph.filter(has_variant)

# extend all matches
-    graph.extend(LevenshteinPhoneticExtender(
-        threshold=float(args["--threshold"]),
-        len_limit=int(args["--len-limit"])
-    ))
+    graph.extend(
+        LevenshteinPhoneticExtender(
+            threshold=float(args["--threshold"]), len_limit=int(args["--len-limit"])
+        )
+    )

# align all matches
graph.align(SmithWatermanPhoneticAligner(gap_char=" "))
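Most of the churn in dphon/cli.py above is black's line-length logic: a call that overflows the default 88-column limit is split after the opening parenthesis, with the arguments kept on one indented line when they fit there. A sketch of that behavior through black's API (assuming black is installed; the expected output is shown in comments):

```python
# Sketch: how black rewraps an overlong call (assumes `pip install black`).
import black

src = (
    "console.highlighter = MatchHighlighter("
    'g2p=nlp.get_pipe("g2p"), context=int(args["--context"]), gap_char=" ")\n'
)
# This should print the three-line form seen in the diff above:
# console.highlighter = MatchHighlighter(
#     g2p=nlp.get_pipe("g2p"), context=int(args["--context"]), gap_char=" "
# )
print(black.format_str(src, mode=black.Mode(line_length=88)), end="")
```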
22 changes: 10 additions & 12 deletions dphon/console.py
@@ -9,12 +9,9 @@


# Default color scheme for highlighting matches
-DEFAULT_THEME = Theme({
-    "context": "dim",
-    "variant": "blue",
-    "insertion": "green",
-    "mismatch": "red"
-})
+DEFAULT_THEME = Theme(
+    {"context": "dim", "variant": "blue", "insertion": "green", "mismatch": "red"}
+)

# Consoles for rendering output
console = Console(theme=DEFAULT_THEME, soft_wrap=True)
@@ -28,12 +25,13 @@ class MatchHighlighter(RegexHighlighter):
gap_char: str
g2p: GraphemesToPhonemes

-    def __init__(self, g2p: GraphemesToPhonemes, context: int = 0, gap_char: str = "-") -> None:
+    def __init__(
+        self, g2p: GraphemesToPhonemes, context: int = 0, gap_char: str = "-"
+    ) -> None:
"""Create a new highlighter with optional context for each match."""
# can't have negative context
if context < 0:
-            raise ValueError(
-                f"{self.__class__} context must be greater than 0")
+            raise ValueError(f"{self.__class__} context must be greater than 0")

# store parameters
self.context = context
@@ -44,7 +42,7 @@ def __init__(self, g2p: GraphemesToPhonemes, context: int = 0, gap_char: str = "-") -> None:
def format_match(self, match: Match) -> Tuple[str, str]:
"""Return match sequences as Rich format strings, with optional context.
-        Adds markup for highlighting insertions, mismatches, etc. If context is 
+        Adds markup for highlighting insertions, mismatches, etc. If context is
set, also adds highlighted context to either end of the match.
"""

@@ -83,7 +81,7 @@ def _mark(self, match: Match) -> Tuple[str, str]:
continue

# gap in v: insertion in u (if not punctuation)
-            if match.av[i] == self.gap_char and  match.au[i].isalnum():
+            if match.av[i] == self.gap_char and match.au[i].isalnum():
su.append(f"[insertion]{match.au[i]}[/insertion]")
sv.append(match.av[i])
u_ptr += 1
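The change in this hunk is whitespace-only, but the surrounding `_mark` logic deserves a gloss: a gap in the aligned v sequence sitting opposite an alphanumeric u token is rendered as an insertion in u, using Rich markup. A toy sketch with invented alignment data (not dphon's real pointer-tracking code):

```python
# Toy sketch of the insertion-marking idea (invented alignment data).
gap_char = "-"
au = ["天", "下", "之"]  # aligned u tokens
av = ["天", "-", "之"]   # aligned v tokens; "-" marks a gap
marked = []
for u_tok, v_tok in zip(au, av):
    if v_tok == gap_char and u_tok.isalnum():
        marked.append(f"[insertion]{u_tok}[/insertion]")  # Rich markup tag
    else:
        marked.append(u_tok)
print("".join(marked))  # 天[insertion]下[/insertion]之
```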
@@ -133,5 +131,5 @@ def transcription(self, match: Match) -> Tuple[str, str]:
"""Get the phonemic transcription for the match for display."""
return (
"*" + " ".join(match.utxt._.syllables),
"*" + " ".join(match.vtxt._.syllables)
"*" + " ".join(match.vtxt._.syllables),
)
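The lone added comma in `transcription` is black's trailing-comma behavior: once a bracketed construct has to span multiple lines, black puts a comma after its last element. A sketch with invented names (assuming black is installed; expected output in comments):

```python
# Sketch: black appends a trailing comma when it must split a construct
# (assumes `pip install black`; the function names are invented).
import black

src = "pair = (first_long_expression(alpha, beta), second_long_expression(gamma, delta))\n"
# Expected output:
# pair = (
#     first_long_expression(alpha, beta),
#     second_long_expression(gamma, delta),
# )
print(black.format_str(src, mode=black.Mode(line_length=50)), end="")
```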
23 changes: 11 additions & 12 deletions dphon/corpus.py
@@ -54,7 +54,7 @@ def __call__(self, paths: Iterable[str]) -> Iterable[DocInfo_T]:
Output is a single tuple of (contents, metadata) where "contents" is the
contents of the file as a string and "metadata" is an arbitrary dict.
-        One tuple per doc should be returned for consumption by spaCy's 
+        One tuple per doc should be returned for consumption by spaCy's
`nlp.pipe(as_tuples=True)`.
"""
raise NotImplementedError
@@ -78,7 +78,8 @@ def _check(self, paths: Iterable[str]) -> Dict[Path, Any]:
logging.debug(f"found {file.resolve()}, size={size}B")
else:
logging.warning(
f"path {file.resolve()} isn't a {self.filetype} file")
f"path {file.resolve()} isn't a {self.filetype} file"
)

# if no valid files were found, notify the user and exit. otherwise
# report the total number of files found
@@ -113,21 +114,19 @@ def __call__(self, paths: Iterable[str]) -> Iterable[DocInfo_T]:

# sort files by size, largest first, to speed up processing by spaCy
files = self._check(paths)
-        files_by_size = OrderedDict(sorted(files.items(),
-                                           key=lambda f: f[1]["size"],
-                                           reverse=True))
+        files_by_size = OrderedDict(
+            sorted(files.items(), key=lambda f: f[1]["size"], reverse=True)
+        )

# track progress
-        task = self.progress.add_task(
-            "indexing", filename="", total=len(files))
+        task = self.progress.add_task("indexing", filename="", total=len(files))

# open each file and yield contents with metadata as DocInfo_T
with self.progress:
for file, meta in files_by_size.items():
self.progress.update(task, filename=file.name)
with file.open(encoding="utf8") as contents:
-                    logging.debug(
-                        f"loaded doc \"{meta['id']}\" from {file.resolve()}")
+                    logging.debug(f"loaded doc \"{meta['id']}\" from {file.resolve()}")
yield contents.read().translate(OC_TEXT), {"id": meta["id"]}
self.progress.advance(task)

@@ -157,8 +156,7 @@ def __call__(self, paths: Iterable[str]) -> Iterable[DocInfo_T]:

# track progress
files = self._check(paths)
-        task = self.progress.add_task(
-            "indexing", filename="", total=len(files))
+        task = self.progress.add_task("indexing", filename="", total=len(files))

# open each file and yield each line, with all properties except "text"
# being passed as second element in tuple
Expand All @@ -169,6 +167,7 @@ def __call__(self, paths: Iterable[str]) -> Iterable[DocInfo_T]:
for doc in reader:
meta = {k: v for k, v in doc.items() if k != "text"}
logging.debug(
f"loaded doc \"{doc['id']}\" from {file.resolve()}")
f"loaded doc \"{doc['id']}\" from {file.resolve()}"
)
yield doc["text"].translate(OC_TEXT), meta
self.progress.advance(task)
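One reflowed call in dphon/corpus.py is worth a second look beyond formatting: `files_by_size` orders the corpus largest-first because, per the comment in the diff, that speeds up processing by spaCy. A self-contained sketch of the same ordering with dummy data:

```python
# Self-contained sketch of the size-descending ordering above (dummy data).
from collections import OrderedDict

files = {
    "small.txt": {"size": 10},
    "large.txt": {"size": 300},
    "medium.txt": {"size": 40},
}
files_by_size = OrderedDict(
    sorted(files.items(), key=lambda f: f[1]["size"], reverse=True)
)
print(list(files_by_size))  # ['large.txt', 'medium.txt', 'small.txt']
```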