Grammar enhancements; type annotations; new ord.compressed
vthorsteinsson committed Nov 13, 2020
1 parent 5df5bc8 commit 098b7d6
Showing 5 changed files with 54 additions and 29 deletions.
26 changes: 22 additions & 4 deletions src/reynir/Greynir.grammar
@@ -305,7 +305,7 @@ Samtenging →
| 'auk_þess:ao' "sem:st" AðvörunAð?
| 'það_er_að_segja:ao' Tengimerki? "að:st"?
| "svo:st" "lengi:ao"? "sem:st"
| "eins:ao" "og:st"
| "rétt:ao"? "eins:ao" "og:st"
| "því:st" "að:st"?
| "þannig:ao" "að:st"
| "og:st" "loks:ao" "að:st"
@@ -5780,6 +5780,7 @@ EinnAl →
| AlMiðstig # [mun] hraðar en augað sér
| AlHvortSemUmErAðRæða # Hvort sem um er að ræða X eða Y
| AðNýju
+ | FjöldiSaman # Tugþúsundum saman

# Multi-word adverbial phrases that cannot be turned into fixed phrases

@@ -5842,6 +5843,19 @@ UndirEins →

$score(+4) UndirEins

+ FjöldiSaman →
+     Fjöldi_þgf "saman:ao"
+
+ Fjöldi_þgf →
+     'tugur:kk'_ft_þgf # tugum saman
+     | 'hundrað:hk'_ft_þgf # hundruðum saman
+     | 'þúsund:kvk'_ft_þgf # þúsundum saman
+     | 'tugþúsundir:kvk'_ft_þgf # tugþúsundum saman
+     | 'milljón:kvk'_ft_þgf # milljónum saman
+     | 'milljarður:kk'_ft_þgf # milljörðum saman
+
+ $score(+12) FjöldiSaman
+
Sinnum →
# tvisvar, þrisvar and fjórum sinnum are defined as adverb phrases in Phrases.conf
TalaOg? töl_ft_hk_þgf TalaTil? 'sinn:hk'_þgf_ft # tuttugu og fimm sinnum
@@ -6420,12 +6434,16 @@ Tíðni →
| 'í_sífellu:ao'
| "aldrei:ao"
| "alltaf:ao"
| "árlega:ao"
| "daglega:ao"
| "dagsdaglega:ao"
| "daglega:ao"
| "vikulega:ao"
| "mánaðarlega:ao"
| "oft:ao"
| "ársfjórðungslega:ao"
| "árlega:ao"
| "oft:ao" | "oftast:ao"
| "reglulega:ao"
| "gjarnan:ao"
| "ævinlega:ao"
| "sjaldan:ao"
| "stundum:ao"
| "títt:ao"
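The new FjöldiSaman production parses a dative-plural quantity noun followed by "saman" ("tugþúsundum saman", 'by the tens of thousands') as a single adverbial phrase. Below is a minimal sketch of exercising the rule from Python, assuming the reynir package is built from this commit; the example sentence and expected outcome are mine, not from the diff:

from reynir import Greynir

g = Greynir()
# "People protested by the thousands" - should be matched by
# Fjöldi_þgf ('þúsund:kvk'_ft_þgf) followed by "saman:ao"
sent = g.parse_single("Fólk mótmælti þúsundum saman.")
if sent.tree is not None:
    # With the new rule, "þúsundum saman" should surface as one
    # adverbial constituent in the flat parse tree
    print(sent.tree.flat)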
1 change: 1 addition & 0 deletions src/reynir/binparser.py
@@ -279,6 +279,7 @@ def matcher_ao(token, terminal, m):
elif v[0] == "x" and not m.stofn.endswith(v[1:]):
return False
fbits = BIN_Token.get_fbits(m.beyging)
+ # The fbits may contain MST and EST
return terminal.fbits_match(fbits)

@staticmethod
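For context on the new comment: fbits are feature bits derived from a BÍN inflection string ("beyging"), and MST/EST mark comparative and superlative degree ("hraðar", "hraðast"), so an adverb terminal can still match those forms. The actual bit layout lives in BIN_Token and is not shown in this diff; the following is a hypothetical sketch of the subset test that a check like terminal.fbits_match performs:

# Hypothetical bit positions - the real values are defined in BIN_Token
MST = 1 << 0  # comparative degree, e.g. "hraðar"
EST = 1 << 1  # superlative degree, e.g. "hraðast"

def fbits_match(terminal_fbits: int, token_fbits: int) -> bool:
    # Every feature bit required by the terminal must be set on the token
    return (terminal_fbits & token_fbits) == terminal_fbits

# A comparative-only terminal matches a comparative token meaning,
# but not a positive-degree one (no feature bits set)
assert fbits_match(MST, MST) and not fbits_match(MST, 0)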
50 changes: 27 additions & 23 deletions src/reynir/bintokenizer.py
@@ -41,6 +41,7 @@
Tuple,
List,
Dict,
+ Mapping,
Union,
Iterable,
Iterator,
@@ -105,6 +106,10 @@ def all_except_suffix(s):
FirstPhaseFunction = Callable[[], TokenIterator]
FollowingPhaseFunction = Callable[[TokenIterator], TokenIterator]
PhaseFunction = Union[FirstPhaseFunction, FollowingPhaseFunction]
+ StateTuple = Tuple[List[str], int]
+ StateList = List[StateTuple]
+ StateDict = Mapping[str, StateList]
+ DisambiguationTuple = Tuple[str, FrozenSet[str]]

# Person names that are not recognized at the start of sentences
NOT_NAME_AT_SENTENCE_START = {
@@ -1337,17 +1342,17 @@ class MatchingStream:
replace or modify these sequences.
"""

- def __init__(self, phrase_dictionary) -> None:
+ def __init__(self, phrase_dictionary: StateDict) -> None:
self._pdict = phrase_dictionary

def key(self, token: Tok) -> Any:
""" Generate a state key from the given token """
return token.txt.lower()

- def match_state(self, key: Any, state: Any) -> Any:
+ def match_state(self, key: Any, state: StateDict) -> StateList:
""" Returns an iterable of states that match the key,
or a falsy value if the key matches no states. """
- return state.get(key)
+ return state.get(key, [])

def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
""" Called when we have found a match for the entire
Expand All @@ -1364,7 +1369,7 @@ def process(self, token_stream: TokenIterator) -> TokenIterator:
# Token queue
tq: List[Tok] = []
# Phrases we're considering
- state: Dict[str, List[Tuple[List[str], int]]] = defaultdict(list)
+ state: StateDict = defaultdict(list)
pdict = self._pdict # The phrase dictionary

try:
@@ -1386,7 +1391,7 @@ def process(self, token_stream: TokenIterator) -> TokenIterator:
continue

# Look for matches in the current state and build a new state
- newstate = defaultdict(list)
+ newstate: StateDict = defaultdict(list)
key = self.key(token)

def add_to_state(slist, index):
@@ -1476,15 +1481,15 @@ class StaticPhraseStream(MatchingStream):
length of the longest phrase.
"""

- def __init__(self, token_ctor, auto_uppercase):
+ def __init__(self, token_ctor, auto_uppercase: bool) -> None:
super().__init__(StaticPhrases.DICT)
self._token_ctor = token_ctor
self._auto_uppercase = auto_uppercase

- def length(self, ix):
+ def length(self, ix: int) -> int:
return StaticPhrases.get_length(ix)

- def key(self, token):
+ def key(self, token: Tok) -> Tuple[str, str]:
""" We allow both the original token text and a lowercase
version of it to match """
wo = token.txt # Original word
@@ -1493,22 +1498,22 @@ def key(self, token):
wo = w
return wo, w

- def match_state(self, key, state):
+ def match_state(self, key: Tuple[str, str], state: StateDict) -> StateList:
""" First check for original (uppercase) word in the state, if any;
if that doesn't match, check the lower case """
- wm = None
+ wm = ""
wo, w = key
- if self._auto_uppercase and len(wo) == 1 and w is wo:
+ if self._auto_uppercase and len(wo) == 1 and w != wo:
# If we are auto-uppercasing, leave single-letter lowercase
# phrases alone, i.e. 'g' for 'gram' and 'm' for 'meter'
- pass
+ wm = wo
elif wo is not w and wo in state:
wm = wo # Original word
elif w in state:
wm = w # Lowercase version
- return state[wm]
+ return state.get(wm, [])

- def match(self, tq, ix):
+ def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
w = " ".join([t.txt for t in tq])
# Add the entire phrase as one 'word' to the token queue.
# Note that the StaticPhrases meaning list will be converted
@@ -1532,37 +1537,36 @@ class DisambiguationStream(MatchingStream):
meanings that have categories matching those allowed
in the [disambiguate_phrases] section in config/Phrases.conf """

- def __init__(self, token_ctor):
+ def __init__(self, token_ctor: Type["Bin_TOK"]) -> None:
super().__init__(AmbigPhrases.DICT)
self._token_ctor = token_ctor

- def key(self, token):
+ def key(self, token: Tok) -> DisambiguationTuple:
""" Generate a phrase key from the given token """
# Construct a set of all possible lemmas of this word form
if token.kind == TOK.WORD:
return token.txt.lower(), frozenset(m.stofn + "*" for m in token.val)
return token.txt.lower(), frozenset()

- def match_state(self, key, state):
+ def match_state(self, key: DisambiguationTuple, state: StateDict) -> StateList:
""" Called to see if the current token's key matches
the given state. Returns the value that should be
used to look up the key within the state, or None
if there is no match. """
# First, check for a direct text match
- states = []
- if key[0] in state:
-     states.extend(state[key[0]])
+ txt, stems = key
+ states = list(state.get(txt, []))
# Then, check whether the stems of the token match any
# asterisk-marked entry in the state
- for stem in key[1]:
+ for stem in stems:
if stem in state:
states.extend(state[stem])
return states

- def length(self, ix):
+ def length(self, ix: int) -> int:
return len(AmbigPhrases.get_cats(ix))

- def match(self, tq, ix):
+ def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
""" We have a phrase match: return the tokens in the token
queue, but with their meanings filtered down to only
the word categories specified in the phrase configuration """
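The new aliases document the matcher's data flow: a StateTuple pairs the words still expected with a phrase index, a StateList collects such pending candidates, and a StateDict maps the next expected word to the candidates waiting for it. Here is a self-contained sketch of that scan, written to mirror what MatchingStream.process does; the phrase table and all names below are invented for illustration:

from collections import defaultdict
from typing import Dict, List, Tuple

StateTuple = Tuple[List[str], int]  # (words still expected, phrase index)
StateList = List[StateTuple]

# Toy phrase table mapping the first word of each phrase to
# (remaining words, phrase index)
PDICT: Dict[str, StateList] = {
    "svo": [(["lengi", "sem"], 0)],  # "svo lengi sem"
    "undir": [(["eins"], 1)],        # "undir eins"
}

def scan(tokens: List[str]) -> List[Tuple[int, int]]:
    # Returns (phrase index, position of last word) for every match
    found: List[Tuple[int, int]] = []
    state: Dict[str, StateList] = defaultdict(list)
    for pos, tok in enumerate(tokens):
        w = tok.lower()
        newstate: Dict[str, StateList] = defaultdict(list)
        # Advance candidates that were waiting for this word
        for rest, ix in state.get(w, []):
            if rest:
                newstate[rest[0]].append((rest[1:], ix))
            else:
                found.append((ix, pos))  # full phrase matched here
        # Start a new candidate wherever this word begins a phrase
        for rest, ix in PDICT.get(w, []):
            if rest:
                newstate[rest[0]].append((rest[1:], ix))
            else:
                found.append((ix, pos))  # single-word phrase
        state = newstate
    return found

print(scan("hann kemur undir eins svo lengi sem ég bíð".split()))
# -> [(1, 3), (0, 6)]

Typing the parameter as a read-only Mapping while the implementation builds defaultdict(list) instances matches the annotations added in this commit: match_state only ever reads the state it is given.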
2 changes: 2 additions & 0 deletions src/reynir/config/Prefs.conf
@@ -267,6 +267,8 @@ lífi lífi < líf
lífinu lífi < líf
# lykta lykt < lyktir
læsi læsa < lesa læsi
+ löndunum landa landi < löndun land
+ landanna landa landi < land
mannanna manni < maður
manna manni < maður
manninn manni < maður
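As the surrounding entries suggest, each line in this section reads "word_form dispreferred_lemmas < preferred_lemmas": for an ambiguous word form, meanings whose lemma appears left of "<" are ranked below those on the right. The two new entries make "löndunum" and "landanna" prefer the lemmas "löndun"/"land" over the verb-derived "landa"/"landi". A hypothetical sketch of applying such a preference when scoring meanings; the parsed table and penalty values are invented:

# Hypothetical: the two new Prefs.conf entries, parsed
PREFS = {
    "löndunum": ({"landa", "landi"}, {"löndun", "land"}),
    "landanna": ({"landa", "landi"}, {"land"}),
}

def lemma_score(word_form: str, lemma: str) -> int:
    # Penalize dispreferred lemmas, boost preferred ones
    worse, better = PREFS.get(word_form.lower(), (set(), set()))
    if lemma in worse:
        return -4  # arbitrary illustrative penalty
    if lemma in better:
        return 4
    return 0

assert lemma_score("löndunum", "landa") < lemma_score("löndunum", "land")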
4 changes: 2 additions & 2 deletions src/reynir/resources/ord.compressed
Git LFS file not shown
