Merge pull request #16 from linhd-postdata/develop
New release
alvp authored Nov 20, 2019
2 parents cf2b1a6 + c79744d commit 21730d2
Showing 16 changed files with 3,224 additions and 520 deletions.
10 changes: 6 additions & 4 deletions .travis.yml
@@ -4,10 +4,12 @@ matrix:
include:
- python: 3.7
env: TOXENV=py37
- python: 3.6
- python: 3.8
env: TOXENV=flake8
- python: 3.6
env: TOXENV=py36
- python: 3.8
env: TOXENV=py38
install:
- pip install -U tox-travis
script: tox
@@ -22,9 +24,9 @@ deploy:
on:
tags: true
repo: linhd-postdata/spacy-affixes
python: 3.7
condition: "$TOXENV = py37"
python: 3.8
condition: "$TOXENV = py38"
notifications:
email:
on_success: never
on_failure: always
on_failure: always
3 changes: 3 additions & 0 deletions Makefile
@@ -56,6 +56,9 @@ lint: ## check style with flake8
test: ## run tests quickly with the default Python
py.test --pdbcls=IPython.terminal.debugger:Pdb

test-snaps: ## update snapshots for tests
py.test --snapshot-update

test-all: ## run tests on every Python version with tox
tox

1 change: 0 additions & 1 deletion README.rst
@@ -90,6 +90,5 @@ Where :code:`lang` is the 2-character ISO 639-1 code for a supported language, a

Notes
-----
- There is not yet support for Universal Dependencies tags since a good mapping is missing.
- Some decisions might feel idiosyncratic since the purpose of this library at the beginning was to just split clitics in Spanish texts.

4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.3
current_version = 0.1.4
commit = True
tag = True

@@ -26,4 +26,4 @@ collect_ignore = ['setup.py']
[coverage:run]
omit =
*.tox*

src/spacy_affixes/__main__.py
2 changes: 1 addition & 1 deletion setup.py
@@ -38,7 +38,7 @@
license="Apache Software License 2.0",
long_description=readme + '\n\n' + history,
url='https://github.com/linhd-postdata/spacy-affixes',
version='0.1.3',
version='0.1.4',
include_package_data=True,
keywords='spacy_affixes',
name='spacy_affixes',
3 changes: 2 additions & 1 deletion src/spacy_affixes/__init__.py
@@ -4,6 +4,7 @@

__author__ = """LINHD POSTDATA Project"""
__email__ = '[email protected]'
__version__ = '0.1.3'
__version__ = '0.1.4'

from .main import AffixesMatcher # pragma: no cover

1 change: 0 additions & 1 deletion src/spacy_affixes/__main__.py
@@ -11,7 +11,6 @@
For example, the default behaviour is:
python -m spacy_affixes download es 4.1
"""
# pragma: no cover
if __name__ == "__main__":
argv_len = len(sys.argv)
if 2 <= argv_len <= 4 and sys.argv[1] == "download":
668 changes: 254 additions & 414 deletions src/spacy_affixes/eagles.py

Large diffs are not rendered by default.

54 changes: 42 additions & 12 deletions src/spacy_affixes/main.py
@@ -14,7 +14,8 @@

class AffixesMatcher(object):

def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
def __init__(self, nlp, rules=None, lexicon=None, split_on=None,
replace_lemmas=True):
"""
:param nlp: SpaCy NLP object with the language already loaded
:param rules: Dictionary of rules for affixes handling. Each dict
@@ -40,38 +41,45 @@ def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
["me", "lo"] as its `affix_text`
It defaults to Freeling if installed (environment
variable `FREELINGDIR` should be set) or downloaded using
`python -m spacy_affixes download_freeling_data`. Please,
check the Freeling site to see license compatibilities.
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
:param lexicon: Dictionary keyed by word with values for lemma,
EAGLE code, UD POS, and UD Tags. It defaults to
Freeling if installed (environment
variable `FREELINGDIR` should be set) or downloaded
using `python -m spacy_affixes download_freeling_data`.
using
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
:param split_on: Tuple of UD POS to split tokens on. Defaults to
verbs. A `*` means split whenever possible.
:param replace_lemmas: Boolean specifying whether the lemma should be
replaced with the output of the Freeling rules
"""
self.nlp = nlp
self.rules = load_affixes() if rules is None else rules
self.lexicon = load_lexicon() if lexicon is None else lexicon
self.split_on = ("VERB", ) if split_on is None else split_on
self.lemma_lookup = self.nlp.vocab.lookups.get_table("lemma_lookup")
self.replace_lemmas = replace_lemmas
if None in (self.lexicon, self.rules):
raise ValueError("""
Data for affixes rules or lexicon data is missing. Check
that Freeling is installed and its environment
variable `FREELINGDIR` is set), or that you have downloaded the
neccessary files using
`python -m spacy_affixes download_freeling_data`.
`python -m spacy_affixes download <lang> [version]`.
Please, check the Freeling site to see license
compatibilities.
""")
if not Token.has_extension("has_affixes"):
Token.set_extension("has_affixes", default=False)
Token.set_extension("affixes_rule", default=None)
Token.set_extension("affixes_text", default=None)
Token.set_extension("affixes_kind", default=None)
Token.set_extension("affixes_lemma", default=None)
Token.set_extension("affixes_length", default=0)
Token.set_extension("affixes_rule", default=None)
Token.set_extension("affixes_text", default=None)
self.matcher = Matcher(nlp.vocab)
for rule_key, rules in self.rules.items():
for rule in rules:
@@ -82,27 +90,41 @@ def __init__(self, nlp, rules=None, lexicon=None, split_on=None):
])

def apply_rules(self, retokenizer, token, rule):
if (rule["always_apply"]
or (token.is_oov or token not in self.lexicon)) is False:
return
strip_accent_exceptions = (
"automática",
)
for affix_add in rule["affix_add"]:
strip_accent = rule["strip_accent"]
token_sub = re.sub(rule["pattern"], '', token.text)
token_left = token_transform(
token_sub,
rule["kind"],
affix_add,
rule["strip_accent"]
False if token_sub in strip_accent_exceptions else strip_accent
)
morfo_lemma_opts = {
"affix_text": "".join(rule["affix_text"]),
"token_lower": token.lower_,
"token_left": token_left,
}
morfo = get_morfo(
token_left.lower(),
self.lexicon,
re.compile(rule["pos_re"], re.I)
re.compile(rule["pos_re"], re.I),
rule["assign_pos"],
rule["assign_lemma"],
**morfo_lemma_opts
)
if (token_left and morfo and not token._.has_affixes):
if token_left and morfo and not token._.has_affixes:
_, token_ud, token_tags, token_lemma = morfo
affixes_length = (
len(rule["affix_text"]) or int(affix_add != "")
)
if rule["kind"] == AFFIXES_SUFFIX:
heads = [(token, 1)] + (affixes_length * [(token, 0)])
token.lemma_ = self.nlp.Defaults.lemma_lookup.get(
token.lemma_ = self.lemma_lookup.get(
token_left.lower(),
token_lemma
)
@@ -123,6 +145,7 @@ def apply_rules(self, retokenizer, token, rule):
token._.affixes_text = token_left
token._.affixes_kind = rule["kind"]
token._.affixes_length = affixes_length
token._.affixes_lemma = token.lemma_
token._.has_affixes = True

def __call__(self, doc):
@@ -138,4 +161,11 @@ def __call__(self, doc):
if token._.affixes_rule:
for rule in self.rules[token._.affixes_rule]:
self.apply_rules(retokenizer, token, rule)
if not token._.has_affixes:
token._.affixes_rule = None
if self.replace_lemmas:
# Tokens are views of C structs
for index, _ in enumerate(doc):
if doc[index]._.has_affixes and doc[index]._.affixes_lemma:
doc[index].lemma_ = doc[index]._.affixes_lemma
return doc
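
To make the new replace_lemmas option concrete, here is a minimal usage sketch. It assumes the spaCy v2 add_pipe API; the model name and pipe placement are only examples, and the rules and lexicon fall back to the downloaded Freeling data as described in the docstring above.

# Hypothetical usage sketch: "es_core_news_md" and the pipe placement are
# examples only; AffixesMatcher and replace_lemmas come from the diff above.
import spacy
from spacy_affixes import AffixesMatcher

nlp = spacy.load("es_core_news_md")
affixes = AffixesMatcher(nlp, split_on=("VERB",), replace_lemmas=True)
nlp.add_pipe(affixes, name="affixes", first=True)

doc = nlp("Dámelo")  # clitic suffixes may be split into their own tokens
print([(t.text, t.lemma_, t._.has_affixes) for t in doc])

With replace_lemmas=False, the Freeling-derived lemma is still stored on the custom token._.affixes_lemma extension, but token.lemma_ is not rewritten after retokenization, as the __call__ changes above show.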
73 changes: 45 additions & 28 deletions src/spacy_affixes/utils.py
@@ -5,7 +5,7 @@
import unicodedata
from collections import defaultdict
from urllib.request import urlopen
from .eagles import EAGLES_TO_UD_DICT
from .eagles import eagles2ud

AFFIXES_SUFFIX = "suffix"
AFFIXES_PREFIX = "prefix"
@@ -105,9 +105,12 @@ def build_affixes(affixes_raw):
for affix in affixes:
if len(affix.strip()) > 0 and not affix.startswith("#"):
affix_split = re.split(r"\s+", affix)
key, add, pos_re, _, strip_accent, *_, tokens = affix_split
(key, add, pos_re, assign_pos, strip_accent,
*_, assign_lemma, always_apply, tokens) = affix_split
add = add if add != "*" else ""
assign_pos = assign_pos if assign_pos != "*" else ""
strip_accent = int(strip_accent) == 0
always_apply = int(always_apply) == 1
if tokens == "-":
text = [key]
else:
@@ -116,7 +119,10 @@
"pattern": affix_pos_re.format(key),
"kind": affix_kind,
"pos_re": fr"{pos_re}",
"assign_pos": assign_pos,
"strip_accent": strip_accent,
"assign_lemma": assign_lemma,
"always_apply": always_apply,
"affix_add": add.split("|"),
"affix_text": text,
}
@@ -136,28 +142,16 @@ def eagle2tag(eagle):
:param eagle: EAGLES tag to be converted
:return: Equivalent UD tag
"""
return EAGLES_TO_UD_DICT.get(eagle, "X")
tag = eagles2ud(eagle)
return tag if tag != '' else 'X__'


def eagle2pos(eagle):
mapper = {
"A": "ADJ",
"R": "ADV",
"D": "DET",
"N": "NOUN",
"V": "VERB",
"P": "PRON",
"C": "CONJ",
"I": "INTJ",
"S": "ADP",
"F": "PUNCT",
"Z": "NUM",
"W": "NUM", # Dates (W) are not standard EAGLE
}
return mapper.get(eagle[0], "X")
pos = eagles2ud(eagle).split('__')[0]
return pos
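
For readers following the tag changes: the new eagles2ud() helper is assumed to return a single "POS__Feats" string, which is why eagle2pos and eagle2tag (and the lexicon builders below) slice it on "__". A minimal illustration of that convention, with an invented feature string rather than real mapper output:

# Illustration of the "POS__Feats" convention assumed by the split("__")
# calls; the feature values are invented, not real eagles2ud() output.
tag = "VERB__Mood=Ind|Number=Sing|Person=3"
pos, feats = tag.split("__")
assert pos == "VERB"
assert feats == "Mood=Ind|Number=Sing|Person=3"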


def token_transform(string, kind, add, strip_accent):
def token_transform(string, add, strip_accent):
if add == AFFIXES_PREFIX:
prefix, suffix = add, ""
else:
@@ -198,7 +192,7 @@ def download_lexicon(lang="es", version="4.1"):
'lemma': lemma,
'eagle': eagle,
'ud': ud or eagle2pos(eagle),
'tags': eagle2tag(eagle),
'tags': eagle2tag(eagle).split("__")[1],
})
return lexicon

@@ -211,22 +205,45 @@ def build_lexicon(lexicon_raw):
'lemma': lemma,
'eagle': eagle,
'ud': eagle2pos(eagle),
'tags': eagle2tag(eagle),
'tags': eagle2tag(eagle).split("__")[1],
})
return lexicon


def get_morfo(string, lexicon, regex):
def get_assigned_lemma(rule, **opts):
ralf = {
"R": opts["token_left"],
"A": opts["affix_text"],
"L": opts["lemma"],
"F": opts["token_lower"],
}
return "".join([ralf.get(opt, opt) for opt in rule.split("+")])


def get_morfo(string, lexicon, regex, assign_pos, assign_lemma,
**assign_lemma_opts):
# Checks for string in the lexicon
# Returns EAGLE, UD, tags, lemma
if string in lexicon:
entry = lexicon[string]
for definition in entry:
if regex.match(definition["eagle"]):
return (
definition["eagle"],
definition["ud"],
definition["tags"],
definition["lemma"]
)
assign_lemma_opts.update({
"lemma": definition["lemma"]
})
lemma = get_assigned_lemma(assign_lemma, **assign_lemma_opts)
if assign_pos:
return (
assign_pos,
eagle2pos(assign_pos),
eagle2tag(assign_pos).split('__')[1],
lemma
)
else:
return (
definition["eagle"],
definition["ud"],
definition["tags"],
lemma
)
return None
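
A short sketch of how the new lemma-assignment rules compose: get_assigned_lemma splits the rule string on "+" and maps each R/A/L/F segment to the stripped token, the affix text, the lexicon lemma, or the full lowercased token, respectively; any other segment is kept literally. The rule strings and word below are invented for illustration, not taken from the Freeling data.

# Hypothetical example of get_assigned_lemma(); the rule strings and the token
# are made up, and the import path follows the package layout in this diff.
from spacy_affixes.utils import get_assigned_lemma

opts = {
    "token_left": "da",       # R: token text with the affix stripped
    "affix_text": "melo",     # A: joined text of the detached clitics
    "lemma": "dar",           # L: lemma found in the lexicon
    "token_lower": "dámelo",  # F: full lowercased token
}
assert get_assigned_lemma("L", **opts) == "dar"
assert get_assigned_lemma("R+A", **opts) == "damelo"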
Empty file added tests/__init__.py
Empty file.
Empty file added tests/fixtures/__init__.py
Empty file.
