Skip to content

Commit

Permalink
Bugfix: Support collapsing ranges with duplicate units where the unit…
Browse files Browse the repository at this point in the history
…s are not identical but are synonyms.
  • Loading branch information
strangetom committed Oct 18, 2024
1 parent e5dc412 commit b2d3d3b
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 2 deletions.
13 changes: 13 additions & 0 deletions ingredient_parser/en/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,3 +427,16 @@
]
# Tokens which signal that the amount they accompany is singular
SINGULAR_TOKENS = ["each"]

# Groups of interchangeable unit spellings. Every group becomes one set in
# UNIT_SYNONYMS, and all members of a set represent the same unit.
UNIT_SYNONYMS = [
    set(spellings)
    for spellings in (
        ("cup", "c"),
        ("gram", "g", "gm"),
        ("kilogram", "kg"),
        ("litre", "liter", "l"),
        ("ounce", "oz"),
        ("pound", "lb"),
        ("quart", "qt"),
        ("tablespoon", "tbsp", "tbs", "tb"),
        ("teaspoon", "tsp"),
    )
]
46 changes: 45 additions & 1 deletion ingredient_parser/en/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .._common import download_nltk_resources, is_float, is_range
from ..dataclasses import IngredientAmount
from ._constants import UNITS
from ._constants import FLATTENED_UNITS_LIST, UNIT_SYNONYMS, UNITS
from ._regex import FRACTION_SPLIT_AND_PATTERN, STRING_RANGE_PATTERN

UREG = pint.UnitRegistry()
Expand Down Expand Up @@ -236,6 +236,50 @@ def convert_to_pint_unit(unit: str, imperial_units: bool = False) -> str | pint.
return unit


def is_unit_synonym(unit1: str, unit2: str) -> bool:
    """Check if given units are synonyms.

    Both units are converted to their singular form before comparison, so a
    plural spelling matches the singular spelling of the same unit. Units that
    are identical after this conversion are considered synonyms of each other,
    even when the unit has no entry in UNIT_SYNONYMS.

    Parameters
    ----------
    unit1 : str
        First unit to check.
    unit2 : str
        Second unit to check.

    Returns
    -------
    bool
        True if units are synonyms.

    Examples
    --------
    >>> is_unit_synonym("oz", "ounce")
    True

    >>> is_unit_synonym("cups", "c")
    True

    >>> is_unit_synonym("kg", "g")
    False
    """
    # If not in units list, then cannot be unit synonyms
    if unit1 not in FLATTENED_UNITS_LIST or unit2 not in FLATTENED_UNITS_LIST:
        return False

    # Make singular if plural; UNITS is keyed by the plural spelling
    unit1 = UNITS.get(unit1, unit1)
    unit2 = UNITS.get(unit2, unit2)

    # The same unit written twice is trivially a synonym of itself. Without
    # this check, valid units missing from UNIT_SYNONYMS would never match.
    if unit1 == unit2:
        return True

    return any(
        unit1 in synonyms and unit2 in synonyms for synonyms in UNIT_SYNONYMS
    )


def combine_quantities_split_by_and(text: str) -> str:
"""Combine fractional quantities split by 'and' into single value.
Expand Down
20 changes: 19 additions & 1 deletion ingredient_parser/en/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from ._utils import (
combine_quantities_split_by_and,
is_unit_synonym,
replace_string_range,
stem,
tokenize,
Expand Down Expand Up @@ -464,7 +465,7 @@ def _replace_dupe_units_ranges(self, sentence: str) -> str:

for full_match, quantity1, unit1, quantity2, unit2 in matches:
# We are only interested if both captured units refer to the same unit, as determined by is_unit_synonym
if unit1 != unit2:
if not is_unit_synonym(unit1, unit2):
continue

# If capture unit not in units list, abort
Expand Down Expand Up @@ -809,6 +810,23 @@ def _is_inside_parentheses(self, index: int) -> bool:

return False

def _is_example(self, index: int) -> bool:
    """Return True if the token is part of an example in the sentence.

    Examples are indicated using phrases like "such as", "for example".

    NOTE(review): the current body does not inspect ``index`` and always
    returns False - this looks like a placeholder; confirm the intended
    detection logic before relying on it.

    Parameters
    ----------
    index : int
        Index of token to check

    Returns
    -------
    bool
        True if index is part of an example, else False
    """
    return False

def _is_ambiguous_unit(self, token: str) -> bool:
"""Return True if token is in AMBIGUOUS_UNITS list.
Expand Down
15 changes: 15 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
UREG,
combine_quantities_split_by_and,
convert_to_pint_unit,
is_unit_synonym,
pluralise_units,
replace_string_range,
)
Expand Down Expand Up @@ -200,3 +201,17 @@ def test_second_starts_with_zero(self):
"""
input_sentence = "Type 1 or 00 flour"
assert replace_string_range(input_sentence) == "Type 1 or 00 flour"


class Test_is_unit_synonym:
    """Checks for the is_unit_synonym helper."""

    def test_singular(self):
        """An abbreviation and the full singular name are synonyms."""
        outcome = is_unit_synonym("oz", "ounce")
        assert outcome

    def test_plural_singular(self):
        """A plural spelling matches a singular abbreviation of that unit."""
        outcome = is_unit_synonym("cups", "c")
        assert outcome

    def test_plural(self):
        """Two plural spellings of the same unit are synonyms."""
        outcome = is_unit_synonym("lbs", "pounds")
        assert outcome

    def test_not_synonym(self):
        """Spellings of different units are not synonyms."""
        outcome = is_unit_synonym("kg", "gram")
        assert not outcome

0 comments on commit b2d3d3b

Please sign in to comment.