Grammar enhancements; type annotations; new ord.compressed
vthorsteinsson committed Nov 13, 2020
1 parent 5df5bc8 commit 098b7d6
Showing 5 changed files with 54 additions and 29 deletions.
26 changes: 22 additions & 4 deletions src/reynir/Greynir.grammar
@@ -305,7 +305,7 @@ Samtenging →
| 'auk_þess:ao' "sem:st" AðvörunAð?
| 'það_er_að_segja:ao' Tengimerki? "að:st"?
| "svo:st" "lengi:ao"? "sem:st"
| "eins:ao" "og:st"
| "rétt:ao"? "eins:ao" "og:st"
| "því:st" "að:st"?
| "þannig:ao" "að:st"
| "og:st" "loks:ao" "að:st"
@@ -5780,6 +5780,7 @@ EinnAl →
| AlMiðstig # [mun] hraðar en augað sér
| AlHvortSemUmErAðRæða # Hvort sem um er að ræða X eða Y
| AðNýju
+ | FjöldiSaman # Tugþúsundum saman

# Multi-word adverbial phrases that cannot be turned into fixed phrases

@@ -5842,6 +5843,19 @@ UndirEins →

$score(+4) UndirEins

+ FjöldiSaman →
+     Fjöldi_þgf "saman:ao"
+
+ Fjöldi_þgf →
+     'tugur:kk'_ft_þgf # tugum saman
+     | 'hundrað:hk'_ft_þgf # hundruðum saman
+     | 'þúsund:kvk'_ft_þgf # þúsundum saman
+     | 'tugþúsundir:kvk'_ft_þgf # tugþúsundum saman
+     | 'milljón:kvk'_ft_þgf # milljónum saman
+     | 'milljarður:kk'_ft_þgf # milljörðum saman
+
+ $score(+12) FjöldiSaman
+
Sinnum →
# tvisvar, þrisvar and fjórum sinnum are defined as adverb phrases in Phrases.conf
TalaOg? töl_ft_hk_þgf TalaTil? 'sinn:hk'_þgf_ft # tuttugu og fimm sinnum
@@ -6420,12 +6434,16 @@ Tíðni →
| 'í_sífellu:ao'
| "aldrei:ao"
| "alltaf:ao"
| "árlega:ao"
| "daglega:ao"
| "dagsdaglega:ao"
| "daglega:ao"
| "vikulega:ao"
| "mánaðarlega:ao"
| "oft:ao"
| "ársfjórðungslega:ao"
| "árlega:ao"
| "oft:ao" | "oftast:ao"
| "reglulega:ao"
| "gjarnan:ao"
| "ævinlega:ao"
| "sjaldan:ao"
| "stundum:ao"
| "títt:ao"
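The new FjöldiSaman production parses a dative-plural quantity noun followed by "saman" ("tugþúsundum saman", 'by the tens of thousands') as a single adverbial phrase. Below is a minimal sketch of exercising the rule from Python, assuming the reynir package is built from this commit; the example sentence and expected outcome are mine, not from the diff:

from reynir import Greynir

g = Greynir()
# "People protested by the thousands" - should be matched by
# Fjöldi_þgf ('þúsund:kvk'_ft_þgf) followed by "saman:ao"
sent = g.parse_single("Fólk mótmælti þúsundum saman.")
if sent.tree is not None:
    # With the new rule, "þúsundum saman" should surface as one
    # adverbial constituent in the flat parse tree
    print(sent.tree.flat)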
1 change: 1 addition & 0 deletions src/reynir/binparser.py
@@ -279,6 +279,7 @@ def matcher_ao(token, terminal, m):
elif v[0] == "x" and not m.stofn.endswith(v[1:]):
return False
fbits = BIN_Token.get_fbits(m.beyging)
+ # The fbits may contain MST and EST
return terminal.fbits_match(fbits)

@staticmethod
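For context on the new comment: fbits are feature bits derived from a BÍN inflection string ("beyging"), and MST/EST mark comparative and superlative degree ("hraðar", "hraðast"), so an adverb terminal can still match those forms. The actual bit layout lives in BIN_Token and is not shown in this diff; the following is a hypothetical sketch of the subset test that a check like terminal.fbits_match performs:

# Hypothetical bit positions - the real values are defined in BIN_Token
MST = 1 << 0  # comparative degree, e.g. "hraðar"
EST = 1 << 1  # superlative degree, e.g. "hraðast"

def fbits_match(terminal_fbits: int, token_fbits: int) -> bool:
    # Every feature bit required by the terminal must be set on the token
    return (terminal_fbits & token_fbits) == terminal_fbits

# A comparative-only terminal matches a comparative token meaning,
# but not a positive-degree one (no feature bits set)
assert fbits_match(MST, MST) and not fbits_match(MST, 0)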
50 changes: 27 additions & 23 deletions src/reynir/bintokenizer.py
@@ -41,6 +41,7 @@
Tuple,
List,
Dict,
+ Mapping,
Union,
Iterable,
Iterator,
@@ -105,6 +106,10 @@ def all_except_suffix(s):
FirstPhaseFunction = Callable[[], TokenIterator]
FollowingPhaseFunction = Callable[[TokenIterator], TokenIterator]
PhaseFunction = Union[FirstPhaseFunction, FollowingPhaseFunction]
+ StateTuple = Tuple[List[str], int]
+ StateList = List[StateTuple]
+ StateDict = Mapping[str, StateList]
+ DisambiguationTuple = Tuple[str, FrozenSet[str]]

# Person names that are not recognized at the start of sentences
NOT_NAME_AT_SENTENCE_START = {
@@ -1337,17 +1342,17 @@ class MatchingStream:
replace or modify these sequences.
"""

- def __init__(self, phrase_dictionary) -> None:
+ def __init__(self, phrase_dictionary: StateDict) -> None:
self._pdict = phrase_dictionary

def key(self, token: Tok) -> Any:
""" Generate a state key from the given token """
return token.txt.lower()

- def match_state(self, key: Any, state: Any) -> Any:
+ def match_state(self, key: Any, state: StateDict) -> StateList:
""" Returns an iterable of states that match the key,
or a falsy value if the key matches no states. """
- return state.get(key)
+ return state.get(key, [])

def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
""" Called when we have found a match for the entire
Expand All @@ -1364,7 +1369,7 @@ def process(self, token_stream: TokenIterator) -> TokenIterator:
# Token queue
tq: List[Tok] = []
# Phrases we're considering
- state: Dict[str, List[Tuple[List[str], int]]] = defaultdict(list)
+ state: StateDict = defaultdict(list)
pdict = self._pdict # The phrase dictionary

try:
@@ -1386,7 +1391,7 @@ def process(self, token_stream: TokenIterator) -> TokenIterator:
continue

# Look for matches in the current state and build a new state
- newstate = defaultdict(list)
+ newstate: StateDict = defaultdict(list)
key = self.key(token)

def add_to_state(slist, index):
@@ -1476,15 +1481,15 @@ class StaticPhraseStream(MatchingStream):
length of the longest phrase.
"""

- def __init__(self, token_ctor, auto_uppercase):
+ def __init__(self, token_ctor, auto_uppercase: bool) -> None:
super().__init__(StaticPhrases.DICT)
self._token_ctor = token_ctor
self._auto_uppercase = auto_uppercase

- def length(self, ix):
+ def length(self, ix: int) -> int:
return StaticPhrases.get_length(ix)

- def key(self, token):
+ def key(self, token: Tok) -> Tuple[str, str]:
""" We allow both the original token text and a lowercase
version of it to match """
wo = token.txt # Original word
@@ -1493,22 +1498,22 @@ def key(self, token):
wo = w
return wo, w

- def match_state(self, key, state):
+ def match_state(self, key: Tuple[str, str], state: StateDict) -> StateList:
""" First check for original (uppercase) word in the state, if any;
if that doesn't match, check the lower case """
- wm = None
+ wm = ""
wo, w = key
- if self._auto_uppercase and len(wo) == 1 and w is wo:
+ if self._auto_uppercase and len(wo) == 1 and w != wo:
# If we are auto-uppercasing, leave single-letter lowercase
# phrases alone, i.e. 'g' for 'gram' and 'm' for 'meter'
- pass
+ wm = wo
elif wo is not w and wo in state:
wm = wo # Original word
elif w in state:
wm = w # Lowercase version
- return state[wm]
+ return state.get(wm, [])

- def match(self, tq, ix):
+ def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
w = " ".join([t.txt for t in tq])
# Add the entire phrase as one 'word' to the token queue.
# Note that the StaticPhrases meaning list will be converted
@@ -1532,37 +1537,36 @@ class DisambiguationStream(MatchingStream):
meanings that have categories matching those allowed
in the [disambiguate_phrases] section in config/Phrases.conf """

- def __init__(self, token_ctor):
+ def __init__(self, token_ctor: Type["Bin_TOK"]) -> None:
super().__init__(AmbigPhrases.DICT)
self._token_ctor = token_ctor

- def key(self, token):
+ def key(self, token: Tok) -> DisambiguationTuple:
""" Generate a phrase key from the given token """
# Construct a set of all possible lemmas of this word form
if token.kind == TOK.WORD:
return token.txt.lower(), frozenset(m.stofn + "*" for m in token.val)
return token.txt.lower(), frozenset()

- def match_state(self, key, state):
+ def match_state(self, key: DisambiguationTuple, state: StateDict) -> StateList:
""" Called to see if the current token's key matches
the given state. Returns the value that should be
used to look up the key within the state, or None
if there is no match. """
# First, check for a direct text match
- states = []
- if key[0] in state:
-     states.extend(state[key[0]])
+ txt, stems = key
+ states = list(state.get(txt, []))
# Then, check whether the stems of the token match any
# asterisk-marked entry in the state
- for stem in key[1]:
+ for stem in stems:
if stem in state:
states.extend(state[stem])
return states

- def length(self, ix):
+ def length(self, ix: int) -> int:
return len(AmbigPhrases.get_cats(ix))

- def match(self, tq, ix):
+ def match(self, tq: List[Tok], ix: int) -> Iterable[Tok]:
""" We have a phrase match: return the tokens in the token
queue, but with their meanings filtered down to only
the word categories specified in the phrase configuration """
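The new aliases document the matcher's data flow: a StateTuple pairs the words still expected with a phrase index, a StateList collects such pending candidates, and a StateDict maps the next expected word to the candidates waiting for it. Here is a self-contained sketch of that scan, written to mirror what MatchingStream.process does; the phrase table and all names below are invented for illustration:

from collections import defaultdict
from typing import Dict, List, Tuple

StateTuple = Tuple[List[str], int]  # (words still expected, phrase index)
StateList = List[StateTuple]

# Toy phrase table mapping the first word of each phrase to
# (remaining words, phrase index)
PDICT: Dict[str, StateList] = {
    "svo": [(["lengi", "sem"], 0)],  # "svo lengi sem"
    "undir": [(["eins"], 1)],        # "undir eins"
}

def scan(tokens: List[str]) -> List[Tuple[int, int]]:
    # Returns (phrase index, position of last word) for every match
    found: List[Tuple[int, int]] = []
    state: Dict[str, StateList] = defaultdict(list)
    for pos, tok in enumerate(tokens):
        w = tok.lower()
        newstate: Dict[str, StateList] = defaultdict(list)
        # Advance candidates that were waiting for this word
        for rest, ix in state.get(w, []):
            if rest:
                newstate[rest[0]].append((rest[1:], ix))
            else:
                found.append((ix, pos))  # full phrase matched here
        # Start a new candidate wherever this word begins a phrase
        for rest, ix in PDICT.get(w, []):
            if rest:
                newstate[rest[0]].append((rest[1:], ix))
            else:
                found.append((ix, pos))  # single-word phrase
        state = newstate
    return found

print(scan("hann kemur undir eins svo lengi sem ég bíð".split()))
# -> [(1, 3), (0, 6)]

Typing the parameter as a read-only Mapping while the implementation builds defaultdict(list) instances matches the annotations added in this commit: match_state only ever reads the state it is given.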
2 changes: 2 additions & 0 deletions src/reynir/config/Prefs.conf
@@ -267,6 +267,8 @@ lífi lífi < líf
lífinu lífi < líf
# lykta lykt < lyktir
læsi læsa < lesa læsi
+ löndunum landa landi < löndun land
+ landanna landa landi < land
mannanna manni < maður
manna manni < maður
manninn manni < maður
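As the surrounding entries suggest, each line in this section reads "word_form dispreferred_lemmas < preferred_lemmas": for an ambiguous word form, meanings whose lemma appears left of "<" are ranked below those on the right. The two new entries make "löndunum" and "landanna" prefer the lemmas "löndun"/"land" over the verb-derived "landa"/"landi". A hypothetical sketch of applying such a preference when scoring meanings; the parsed table and penalty values are invented:

# Hypothetical: the two new Prefs.conf entries, parsed
PREFS = {
    "löndunum": ({"landa", "landi"}, {"löndun", "land"}),
    "landanna": ({"landa", "landi"}, {"land"}),
}

def lemma_score(word_form: str, lemma: str) -> int:
    # Penalize dispreferred lemmas, boost preferred ones
    worse, better = PREFS.get(word_form.lower(), (set(), set()))
    if lemma in worse:
        return -4  # arbitrary illustrative penalty
    if lemma in better:
        return 4
    return 0

assert lemma_score("löndunum", "landa") < lemma_score("löndunum", "land")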
4 changes: 2 additions & 2 deletions src/reynir/resources/ord.compressed
Git LFS file not shown
