Merge pull request #16 from linhd-postdata/develop
New release
alvp authored Nov 20, 2019
2 parents cf2b1a6 + c79744d commit 21730d2
Showing 16 changed files with 3,224 additions and 520 deletions.
10 changes: 6 additions & 4 deletions .travis.yml
@@ -4,10 +4,12 @@ matrix:
include:
- python: 3.7
env: TOXENV=py37
- python: 3.6
- python: 3.8
env: TOXENV=flake8
- python: 3.6
env: TOXENV=py36
- python: 3.8
env: TOXENV=py38
install:
- pip install -U tox-travis
script: tox
@@ -22,9 +24,9 @@ deploy:
on:
tags: true
repo: linhd-postdata/spacy-affixes
python: 3.7
condition: "$TOXENV = py37"
python: 3.8
condition: "$TOXENV = py38"
notifications:
email:
on_success: never
on_failure: always
on_failure: always
3 changes: 3 additions & 0 deletions Makefile
@@ -56,6 +56,9 @@ lint: ## check style with flake8
test: ## run tests quickly with the default Python
py.test --pdbcls=IPython.terminal.debugger:Pdb

test-snaps: ## update snapshots for tests
py.test --snapshot-update

test-all: ## run tests on every Python version with tox
tox

1 change: 0 additions & 1 deletion README.rst
@@ -90,6 +90,5 @@ Where :code:`lang` is the 2-character ISO 639-1 code for a supported language, a

Notes
-----
- There is not yet support for Universal Dependencies tags since a good mapping is missing.
- Some decisions might feel idiosyncratic since the purpose of this library at the beginning was to just split clitics in Spanish texts.

4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.3
current_version = 0.1.4
commit = True
tag = True

@@ -26,4 +26,4 @@ collect_ignore = ['setup.py']
[coverage:run]
omit =
*.tox*

src/spacy_affixes/__main__.py
2 changes: 1 addition & 1 deletion setup.py
@@ -38,7 +38,7 @@
license="Apache Software License 2.0",
long_description=readme + '\n\n' + history,
url='https://github.com/linhd-postdata/spacy-affixes',
version='0.1.3',
version='0.1.4',
include_package_data=True,
keywords='spacy_affixes',
name='spacy_affixes',
3 changes: 2 additions & 1 deletion src/spacy_affixes/__init__.py
@@ -4,6 +4,7 @@

__author__ = """LINHD POSTDATA Project"""
__email__ = '[email protected]'
__version__ = '0.1.3'
__version__ = '0.1.4'

from .main import AffixesMatcher # pragma: no cover

1 change: 0 additions & 1 deletion src/spacy_affixes/__main__.py
@@ -11,7 +11,6 @@
For example, the default behaviour is:
python -m spacy_affixes download es 4.1
"""
# pragma: no cover
if __name__ == "__main__":
argv_len = len(sys.argv)
if 2 <= argv_len <= 4 and sys.argv[1] == "download":
668 changes: 254 additions & 414 deletions src/spacy_affixes/eagles.py

Large diffs are not rendered by default.

54 changes: 42 additions & 12 deletions src/spacy_affixes/main.py
@@ -14,7 +14,8 @@

class AffixesMatcher(object):

def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
def __init__(self, nlp, rules=None, lexicon=None, split_on=None,
replace_lemmas=True):
"""
:param nlp: SpaCy NLP object with the language already loaded
:param rules: Dictionary of rules for affixes handling. Each dict
@@ -40,38 +41,45 @@ def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
["me", "lo"] as its `affix_text`
It defaults to Freeling if installed (environment
variable `FREELINGDIR` should be set) or downloaded using
`python -m spacy_affixes download_freeling_data`. Please,
check the Freeling site to see license compatibilities.
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
:param lexicon: Dictionary keyed by word with values for lemma,
EAGLE code, UD POS, and UD Tags. It defaults to
Freeling if installed (environment
variable `FREELINGDIR` should be set) or downloaded
using `python -m spacy_affixes download_freeling_data`.
using
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
:param split_on: Tuple of UD POS to split tokens on. Defaults to
verbs. A `*` means split whenever possible.
:param replace_lemmas: Boolean specifying whether the lemma should be
replaced with the output of the Freeling rules
"""
self.nlp = nlp
self.rules = load_affixes() if rules is None else rules
self.lexicon = load_lexicon() if lexicon is None else lexicon
self.split_on = ("VERB", ) if split_on is None else split_on
self.lemma_lookup = self.nlp.vocab.lookups.get_table("lemma_lookup")
self.replace_lemmas = replace_lemmas
if None in (self.lexicon, self.rules):
raise ValueError("""
Data for affixes rules or lexicon data is missing. Check
that Freeling is installed and its environment
variable `FREELINGDIR` is set), or that you have downloaded the
neccessary files using
`python -m spacy_affixes download_freeling_data`.
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
""")
if not Token.has_extension("has_affixes"):
Token.set_extension("has_affixes", default=False)
Token.set_extension("affixes_rule", default=None)
Token.set_extension("affixes_text", default=None)
Token.set_extension("affixes_kind", default=None)
Token.set_extension("affixes_lemma", default=None)
Token.set_extension("affixes_length", default=0)
Token.set_extension("affixes_rule", default=None)
Token.set_extension("affixes_text", default=None)
self.matcher = Matcher(nlp.vocab)
for rule_key, rules in self.rules.items():
for rule in rules:
@@ -82,27 +90,41 @@ def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
])

def apply_rules(self, retokenizer, token, rule):
if (rule["always_apply"]
or (token.is_oov or token not in self.lexicon)) is False:
return
strip_accent_exceptions = (
"automática",
)
for affix_add in rule["affix_add"]:
strip_accent = rule["strip_accent"]
token_sub = re.sub(rule["pattern"], '', token.text)
token_left = token_transform(
token_sub,
rule["kind"],
affix_add,
rule["strip_accent"]
False if token_sub in strip_accent_exceptions else strip_accent
)
morfo_lemma_opts = {
"affix_text": "".join(rule["affix_text"]),
"token_lower": token.lower_,
"token_left": token_left,
}
morfo = get_morfo(
token_left.lower(),
self.lexicon,
re.compile(rule["pos_re"], re.I)
re.compile(rule["pos_re"], re.I),
rule["assign_pos"],
rule["assign_lemma"],
**morfo_lemma_opts
)
if (token_left and morfo and not token._.has_affixes):
if token_left and morfo and not token._.has_affixes:
_, token_ud, token_tags, token_lemma = morfo
affixes_length = (
len(rule["affix_text"]) or int(affix_add != "")
)
if rule["kind"] == AFFIXES_SUFFIX:
heads = [(token, 1)] + (affixes_length * [(token, 0)])
token.lemma_ = self.nlp.Defaults.lemma_lookup.get(
token.lemma_ = self.lemma_lookup.get(
token_left.lower(),
token_lemma
)
@@ -123,6 +145,7 @@ def apply_rules(self, retokenizer, token, rule):
token._.affixes_text = token_left
token._.affixes_kind = rule["kind"]
token._.affixes_length = affixes_length
token._.affixes_lemma = token.lemma_
token._.has_affixes = True

def __call__(self, doc):
@@ -138,4 +161,11 @@ def __call__(self, doc):
if token._.affixes_rule:
for rule in self.rules[token._.affixes_rule]:
self.apply_rules(retokenizer, token, rule)
if not token._.has_affixes:
token._.affixes_rule = None
if self.replace_lemmas:
# Tokens are views of C structs
for index, _ in enumerate(doc):
if doc[index]._.has_affixes and doc[index]._.affixes_lemma:
doc[index].lemma_ = doc[index]._.affixes_lemma
return doc
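
To make the new replace_lemmas option concrete, here is a minimal usage sketch. It assumes the spaCy v2 add_pipe API; the model name and pipe placement are only examples, and the rules and lexicon fall back to the downloaded Freeling data as described in the docstring above.

# Hypothetical usage sketch: "es_core_news_md" and the pipe placement are
# examples only; AffixesMatcher and replace_lemmas come from the diff above.
import spacy
from spacy_affixes import AffixesMatcher

nlp = spacy.load("es_core_news_md")
affixes = AffixesMatcher(nlp, split_on=("VERB",), replace_lemmas=True)
nlp.add_pipe(affixes, name="affixes", first=True)

doc = nlp("Dámelo")  # clitic suffixes may be split into their own tokens
print([(t.text, t.lemma_, t._.has_affixes) for t in doc])

With replace_lemmas=False, the Freeling-derived lemma is still stored on the custom token._.affixes_lemma extension, but token.lemma_ is not rewritten after retokenization, as the __call__ changes above show.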
73 changes: 45 additions & 28 deletions src/spacy_affixes/utils.py
@@ -5,7 +5,7 @@
import unicodedata
from collections import defaultdict
from urllib.request import urlopen
from .eagles import EAGLES_TO_UD_DICT
from .eagles import eagles2ud

AFFIXES_SUFFIX = "suffix"
AFFIXES_PREFIX = "prefix"
@@ -105,9 +105,12 @@ def build_affixes(affixes_raw):
for affix in affixes:
if len(affix.strip()) > 0 and not affix.startswith("#"):
affix_split = re.split(r"\s+", affix)
key, add, pos_re, _, strip_accent, *_, tokens = affix_split
(key, add, pos_re, assign_pos, strip_accent,
*_, assign_lemma, always_apply, tokens) = affix_split
add = add if add != "*" else ""
assign_pos = assign_pos if assign_pos != "*" else ""
strip_accent = int(strip_accent) == 0
always_apply = int(always_apply) == 1
if tokens == "-":
text = [key]
else:
@@ -116,7 +119,10 @@
"pattern": affix_pos_re.format(key),
"kind": affix_kind,
"pos_re": fr"{pos_re}",
"assign_pos": assign_pos,
"strip_accent": strip_accent,
"assign_lemma": assign_lemma,
"always_apply": always_apply,
"affix_add": add.split("|"),
"affix_text": text,
}
@@ -136,28 +142,16 @@ def eagle2tag(eagle):
:param eagle: EAGLES tag to be converted
:return: Equivalent UD tag
"""
return EAGLES_TO_UD_DICT.get(eagle, "X")
tag = eagles2ud(eagle)
return tag if tag != '' else 'X__'


def eagle2pos(eagle):
mapper = {
"A": "ADJ",
"R": "ADV",
"D": "DET",
"N": "NOUN",
"V": "VERB",
"P": "PRON",
"C": "CONJ",
"I": "INTJ",
"S": "ADP",
"F": "PUNCT",
"Z": "NUM",
"W": "NUM", # Dates (W) are not standard EAGLE
}
return mapper.get(eagle[0], "X")
pos = eagles2ud(eagle).split('__')[0]
return pos
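
For readers following the tag changes: the new eagles2ud() helper is assumed to return a single "POS__Feats" string, which is why eagle2pos and eagle2tag (and the lexicon builders below) slice it on "__". A minimal illustration of that convention, with an invented feature string rather than real mapper output:

# Illustration of the "POS__Feats" convention assumed by the split("__")
# calls; the feature values are invented, not real eagles2ud() output.
tag = "VERB__Mood=Ind|Number=Sing|Person=3"
pos, feats = tag.split("__")
assert pos == "VERB"
assert feats == "Mood=Ind|Number=Sing|Person=3"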


def token_transform(string, kind, add, strip_accent):
def token_transform(string, add, strip_accent):
if add == AFFIXES_PREFIX:
prefix, suffix = add, ""
else:
@@ -198,7 +192,7 @@ def download_lexicon(lang="es", version="4.1"):
'lemma': lemma,
'eagle': eagle,
'ud': ud or eagle2pos(eagle),
'tags': eagle2tag(eagle),
'tags': eagle2tag(eagle).split("__")[1],
})
return lexicon

@@ -211,22 +205,45 @@ def build_lexicon(lexicon_raw):
'lemma': lemma,
'eagle': eagle,
'ud': eagle2pos(eagle),
'tags': eagle2tag(eagle),
'tags': eagle2tag(eagle).split("__")[1],
})
return lexicon


def get_morfo(string, lexicon, regex):
def get_assigned_lemma(rule, **opts):
ralf = {
"R": opts["token_left"],
"A": opts["affix_text"],
"L": opts["lemma"],
"F": opts["token_lower"],
}
return "".join([ralf.get(opt, opt) for opt in rule.split("+")])


def get_morfo(string, lexicon, regex, assign_pos, assign_lemma,
**assign_lemma_opts):
# Checks for string in the lexicon
# Returns EAGLE, UD, tags, lemma
if string in lexicon:
entry = lexicon[string]
for definition in entry:
if regex.match(definition["eagle"]):
return (
definition["eagle"],
definition["ud"],
definition["tags"],
definition["lemma"]
)
assign_lemma_opts.update({
"lemma": definition["lemma"]
})
lemma = get_assigned_lemma(assign_lemma, **assign_lemma_opts)
if assign_pos:
return (
assign_pos,
eagle2pos(assign_pos),
eagle2tag(assign_pos).split('__')[1],
lemma
)
else:
return (
definition["eagle"],
definition["ud"],
definition["tags"],
lemma
)
return None
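
A short sketch of how the new lemma-assignment rules compose: get_assigned_lemma splits the rule string on "+" and maps each R/A/L/F segment to the stripped token, the affix text, the lexicon lemma, or the full lowercased token, respectively; any other segment is kept literally. The rule strings and word below are invented for illustration, not taken from the Freeling data.

# Hypothetical example of get_assigned_lemma(); the rule strings and the token
# are made up, and the import path follows the package layout in this diff.
from spacy_affixes.utils import get_assigned_lemma

opts = {
    "token_left": "da",       # R: token text with the affix stripped
    "affix_text": "melo",     # A: joined text of the detached clitics
    "lemma": "dar",           # L: lemma found in the lexicon
    "token_lower": "dámelo",  # F: full lowercased token
}
assert get_assigned_lemma("L", **opts) == "dar"
assert get_assigned_lemma("R+A", **opts) == "damelo"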
Empty file added tests/__init__.py
Empty file.
Empty file added tests/fixtures/__init__.py
Empty file.
