Skip to content

Commit

Permalink
feat: Disallow the usage of similarity scores and edit distances at t…
Browse files Browse the repository at this point in the history
…he same time
  • Loading branch information
ChristianMichelsen committed Jun 29, 2022
1 parent 6edd6bb commit bf7f941
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 28 deletions.
4 changes: 2 additions & 2 deletions docs/source/command-line-interface.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ The `samples` refer to a single or multiple alignment-files (or a directory cont
- `--acc2tax`: Path to the (NCBI) `acc2tax.gz`. Mandatory for LCA.
- `--min-similarity-score`: Normalised edit distance (read to reference similarity) minimum. Number between 0-1. Default: 0.95.
- `--max-similarity-score`: Normalised edit distance (read to reference similarity) maximum. Number between 0-1. Default: 1.0.
- `--min-edit-dist`: Minimum edit distance (read to reference similarity). Number between 0-10. Default: 0.
- `--max-edit-dist`: Maximum edit distance (read to reference similarity). Number between 0-10. Default: 10.
- `--min-edit-dist`: Minimum edit distance (read to reference similarity). Non-negative integer. Note that edit distances cannot be set at the same time as similarity scores; choose one or the other.
- `--max-edit-dist`: Maximum edit distance (read to reference similarity). Non-negative integer. Note that edit distances cannot be set at the same time as similarity scores; choose one or the other.
- `--min-mapping-quality`: Minimum mapping quality. Default: 0.
- `--lca-rank`: The LCA rank used in ngsLCA. Can be either `family`, `genus`, `species` or `""` (everything). Default is `""`.

Expand Down
43 changes: 23 additions & 20 deletions src/metaDMG/cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#%%
from pathlib import Path
from typing import Optional
from typing import Optional, Union

import typer

Expand Down Expand Up @@ -52,52 +52,56 @@ def create_config(
...,
help="Single or multiple alignment-files (or a directory containing them).",
),
# LCA parameters
# LCA parameter
names: Optional[Path] = typer.Option(
None,
exists=True,
file_okay=True,
help="Path to the (NCBI) names-mdmg.dmp.",
),
# LCA parameter
nodes: Optional[Path] = typer.Option(
None,
exists=True,
file_okay=True,
help="Path to the (NCBI) nodes-mdmg.dmp.",
),
# LCA parameter
acc2tax: Optional[Path] = typer.Option(
None,
exists=True,
file_okay=True,
help="Path to the (NCBI) acc2tax.gz.",
),
min_similarity_score: float = typer.Option(
0.95,
min_similarity_score: Optional[float] = typer.Option(
None,
"--min-similarity-score",
"-s",
help="Normalised edit distance (read to reference similarity) minimum. Number between 0-1.",
callback=lambda x: cli_utils.is_in_range(x, 0, 1),
callback=lambda x: cli_utils.is_in_range_or_None(x, 0, 1),
),
max_similarity_score: float = typer.Option(
1.0,
max_similarity_score: Optional[float] = typer.Option(
None,
"--max-similarity-score",
"-S",
help="Normalised edit distance (read to reference similarity) maximum. Number between 0-1.",
callback=lambda x: cli_utils.is_in_range(x, 0, 1),
callback=lambda x: cli_utils.is_in_range_or_None(x, 0, 1),
),
min_edit_dist: int = typer.Option(
0,
min_edit_dist: Optional[int] = typer.Option(
None,
# 0,
"--min-edit-dist",
"-e",
help="Minimum edit distance (read to reference similarity). Number between 0-10.",
callback=lambda x: cli_utils.is_in_range(x, 0, 10),
help="Minimum edit distance (read to reference similarity). Positive integer.",
callback=lambda x: cli_utils.is_positive_int_or_None(x),
),
max_edit_dist: int = typer.Option(
10,
max_edit_dist: Optional[int] = typer.Option(
None,
# 10,
"--max-edit-dist",
"-E",
help="Maximum edit distance (read to reference similarity). Number between 0-10.",
callback=lambda x: cli_utils.is_in_range(x, 0, 10),
help="Maximum edit distance (read to reference similarity). Positive integer.",
callback=lambda x: cli_utils.is_positive_int_or_None(x),
),
min_mapping_quality: int = typer.Option(
0,
Expand Down Expand Up @@ -223,10 +227,9 @@ def create_config(
"names": names,
"nodes": nodes,
"acc2tax": acc2tax,
"min_similarity_score": min_similarity_score,
"max_similarity_score": max_similarity_score,
"min_edit_dist": min_edit_dist,
"max_edit_dist": max_edit_dist,
**cli_utils.set_min_max_similarity_score_edit_dist(
min_similarity_score, max_similarity_score, min_edit_dist, max_edit_dist
),
"min_mapping_quality": min_mapping_quality,
"lca_rank": lca_rank.value, # important to get string
"max_position": max_position,
Expand Down
92 changes: 90 additions & 2 deletions src/metaDMG/cli/cli_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Iterable
from typing import Iterable, Optional

import typer
from click import Context, Group
Expand Down Expand Up @@ -59,7 +59,9 @@ def version_callback(value: bool):
raise typer.Exit()


def is_in_range(x: float, val_min: float, val_max: float) -> float:
def is_in_range_or_None(
x: Optional[float], val_min: float, val_max: float
) -> Optional[float]:
"""Confirms that x is val_min <= x <= val_max
Parameters
Expand All @@ -81,13 +83,43 @@ def is_in_range(x: float, val_min: float, val_max: float) -> float:
If x is outside bounds
"""

if x is None:
return x

if x < val_min or val_max < x:
raise typer.BadParameter(
f"x has to be between {val_min} and {val_max}. Got: {x}"
)
return x


def is_positive_int_or_None(x: Optional[int]) -> Optional[int]:
    """Validate that ``x`` is either unset or a non-negative integer.

    Parameters
    ----------
    x
        Value to check. ``None`` means the option was not supplied.

    Returns
    -------
    The value unchanged (``None`` stays ``None``).

    Raises
    ------
    typer.BadParameter
        If ``x`` is given and is below zero.
    """

    # None passes straight through; only an actual negative number is rejected.
    if x is not None and x < 0:
        raise typer.BadParameter(f"x has to be positive. Got: {x}")

    return x


#%%
class RANKS(str, Enum):
"Ranks allowed in the LCA"
Expand All @@ -107,3 +139,59 @@ class DAMAGE_MODE(str, Enum):


#%%


def set_min_max_similarity_score_edit_dist(
    min_similarity_score: Optional[float],
    max_similarity_score: Optional[float],
    min_edit_dist: Optional[int],
    max_edit_dist: Optional[int],
) -> dict[str, float]:
    """Select the read-to-reference filter: similarity scores or edit distances.

    The two filter families are mutually exclusive. When edit distances are
    used, both bounds must be supplied. When neither family is supplied, the
    similarity-score defaults (0.95 and 1.0) are used.

    Parameters
    ----------
    min_similarity_score
        Lower similarity bound, or ``None`` if not supplied.
    max_similarity_score
        Upper similarity bound, or ``None`` if not supplied.
    min_edit_dist
        Lower edit-distance bound, or ``None`` if not supplied.
    max_edit_dist
        Upper edit-distance bound, or ``None`` if not supplied.

    Returns
    -------
    A dict holding either the ``min_edit_dist`` / ``max_edit_dist`` pair or
    the ``min_similarity_score`` / ``max_similarity_score`` pair.

    Raises
    ------
    typer.BadParameter
        If both families are supplied, if only one edit-distance bound is
        supplied, or if a minimum exceeds its maximum.
    """

    # Compare against None explicitly: 0 and 0.0 are valid user-supplied
    # values and must not be mistaken for "option not given".
    similarity_given = (
        min_similarity_score is not None or max_similarity_score is not None
    )
    edit_dist_given = min_edit_dist is not None or max_edit_dist is not None

    if similarity_given and edit_dist_given:
        raise typer.BadParameter(
            "You cannot use both similarity scores and edit distances at the same time."
        )

    # edit distances
    if edit_dist_given:

        if min_edit_dist is None or max_edit_dist is None:
            raise typer.BadParameter(
                "If using (absolute) edit distances, you have to set "
                "both `min_edit_dist` and `max_edit_dist`."
            )

        if min_edit_dist > max_edit_dist:
            raise typer.BadParameter(
                f"min-edit-dist ({min_edit_dist}) "
                f"has to be lower than max-edit-dist ({max_edit_dist})"
            )

        return {
            "min_edit_dist": min_edit_dist,
            "max_edit_dist": max_edit_dist,
        }

    # similarity scores (fall back to the defaults for any missing bound)
    if min_similarity_score is None:
        min_similarity_score = 0.95

    if max_similarity_score is None:
        max_similarity_score = 1.0

    if min_similarity_score > max_similarity_score:
        raise typer.BadParameter(
            f"min-similarity-score ({min_similarity_score}) "
            f"has to be lower than max-similarity-score ({max_similarity_score})"
        )

    return {
        "min_similarity_score": min_similarity_score,
        "max_similarity_score": max_similarity_score,
    }
16 changes: 12 additions & 4 deletions src/metaDMG/fit/serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,17 @@ def data_dir(config: Config, name, suffix="parquet"):
#%%


def _exists_in_config(s: str, key: str, config: Config) -> str:
    """Build the ``-<s> <value>`` command-line fragment for an optional config key.

    Parameters
    ----------
    s
        Flag name to emit (without the leading dash).
    key
        Key to look up in ``config``.
    config
        Configuration to read the value from.

    Returns
    -------
    ``"-<s> <config[key]>"`` when ``key`` is present in ``config``, otherwise
    an empty string.
    """
    if key not in config:
        return ""
    return f"-{s} {config[key]}"


def get_LCA_command(config: Config) -> str:
outnames = config["path_tmp"] / config["sample"]
lca_rank = f"-lca_rank {config['lca_rank']}" if config["lca_rank"] != "" else ""
simscorelow = _exists_in_config("simscorelow", "min_similarity_score", config)
simscorehigh = _exists_in_config("simscorehigh", "max_similarity_score", config)
editdistmin = _exists_in_config("editdistmin", "min_edit_dist", config)
editdistmax = _exists_in_config("editdistmax", "max_edit_dist", config)

command = (
f"{config['metaDMG_cpp']} lca "
Expand All @@ -68,10 +76,10 @@ def get_LCA_command(config: Config) -> str:
f"-names {config['names']} "
f"-nodes {config['nodes']} "
f"-acc2tax {config['acc2tax']} "
f"-simscorelow {config['min_similarity_score']} "
f"-simscorehigh {config['max_similarity_score']} "
f"-editdistmin {config['min_edit_dist']} "
f"-editdistmax {config['max_edit_dist']} "
f"{simscorelow} "
f"{simscorehigh} "
f"{editdistmin} "
f"{editdistmax} "
f"{lca_rank} "
f"-minmapq {config['min_mapping_quality']} "
f"-howmany {config['max_position']} "
Expand Down

0 comments on commit bf7f941

Please sign in to comment.