Refactor lemmatizer and data table integration (#4353)

* Move test

* Allow default in Lookups.get_table

* Start with blank tables in Lookups.from_bytes

* Refactor lemmatizer to hold instance of Lookups

* Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk)
* Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency
* Remove old and unsupported Lemmatizer.load classmethod
* Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need

* Update tests and docs

* Fix more tests

* Fix lemmatizer

* Upgrade pytest to try and fix weird CI errors

* Try pytest 4.6.5
Ines Montani 2019-10-01 21:36:04 +02:00 committed by Matthew Honnibal
parent 3297a19545
commit cf65a80f36
27 changed files with 332 additions and 331 deletions
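In API terms, the refactor described above boils down to the following pattern. This is an illustrative sketch written against the v2.2 interfaces shown in the diffs below, not code taken from the commit itself:

from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# The lemmatizer now wraps a single Lookups instance instead of separate
# index/exceptions/rules/lookup arguments (passing those now raises E173).
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)
assert lemmatizer("ducks", "NOUN") == ["duck"]

# Tables are fetched inside the lemmatization methods on every call, so
# replacing or modifying the lookups (e.g. after loading a model from disk)
# is picked up automatically. get_table can now return a default instead of
# always raising KeyError for a missing table.
lookup_table = lookups.get_table("lemma_lookup", {})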

View File

@@ -13,7 +13,6 @@ install:
   - "pip install -e ."
 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "pip install pytest pytest-timeout"
   - "python -m pytest --tb=native spacy"
 branches:
   except:

View File

@@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
 cython>=0.25
-pytest>=4.0.0,<4.1.0
+pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0

View File

@@ -96,3 +96,7 @@ exclude =
     __pycache__,
     _tokenizer_exceptions_list.py,
     spacy/__init__.py
+
+[tool:pytest]
+markers =
+    slow

View File

@@ -487,6 +487,12 @@ class Errors(object):
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
             "callable or None, but got: {arg_type}")
+    E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
+            "Lemmatizer, initialize the class directly. See the docs for "
+            "details: https://spacy.io/api/lemmatizer")
+    E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
+            "Lookups containing the lemmatization tables. See the docs for "
+            "details: https://spacy.io/api/lemmatizer#init")


 @add_codes

View File

@@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class GreekDefaults(Language.Defaults):
@@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return GreekLemmatizer(lookups)


 class Greek(Language):

View File

@@ -1,10 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import NOUN, VERB, ADJ, PUNCT
+from ...lemmatizer import Lemmatizer


-class GreekLemmatizer(object):
+class GreekLemmatizer(Lemmatizer):
     """
     Greek language lemmatizer applies the default rule based lemmatization
     procedure with some modifications for better Greek language support.
@@ -15,64 +15,26 @@ class GreekLemmatizer(object):
     not applicable for Greek language.
     """

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
-
-    def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        else:
-            return list(set([string.lower()]))
-        lemmas = lemmatize(
-            string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
-        )
-        return lemmas
-
-    def lookup(self, string, orth=None):
-        key = orth if orth is not None else string
-        if key in self.lookup_table:
-            return self.lookup_table[key]
-        return string
-
-
-def lemmatize(string, index, exceptions, rules):
-    string = string.lower()
-    forms = []
-    if string in index:
-        forms.append(string)
-        return forms
-    forms.extend(exceptions.get(string, []))
-    oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[: len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
-    if not forms:
-        forms.extend(oov_forms)
-    if not forms:
-        forms.append(string)
-    return list(set(forms))
+    def lemmatize(self, string, index, exceptions, rules):
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms:
+            forms.append(string)
+        return list(set(forms))

View File

@@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class FrenchDefaults(Language.Defaults):
@@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return FrenchLemmatizer(lookups)


 class French(Language):

View File

@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

+from ...lemmatizer import Lemmatizer
 from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
 from ...symbols import SCONJ, CCONJ
 from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


-class FrenchLemmatizer(object):
+class FrenchLemmatizer(Lemmatizer):
     """
     French language lemmatizer applies the default rule based lemmatization
     procedure with some modifications for better French language support.
@@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
     the lookup table.
     """

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
-
     def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if "lemma_rules" not in self.lookups:
+            return [lookup_table.get(string, string)]
         if univ_pos in (NOUN, "NOUN", "noun"):
             univ_pos = "noun"
         elif univ_pos in (VERB, "VERB", "verb"):
@@ -56,12 +48,14 @@
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return list(set([string.lower()]))
-        lemmas = lemmatize(
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lemmas = self.lemmatize(
             string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
-            self.lookup_table,
+            index_table.get(univ_pos, {}),
+            exc_table.get(univ_pos, {}),
+            rules_table.get(univ_pos, []),
         )
         return lemmas
@@ -115,33 +109,34 @@
         return self(string, "punct", morphology)

     def lookup(self, string, orth=None):
-        if orth is not None and orth in self.lookup_table:
-            return self.lookup_table[orth][0]
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if orth is not None and orth in lookup_table:
+            return lookup_table[orth][0]
         return string

-
-def lemmatize(string, index, exceptions, rules, lookup):
-    string = string.lower()
-    forms = []
-    if string in index:
-        forms.append(string)
-        return forms
-    forms.extend(exceptions.get(string, []))
-    oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[: len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
-    if not forms:
-        forms.extend(oov_forms)
-    if not forms and string in lookup.keys():
-        forms.append(lookup[string][0])
-    if not forms:
-        forms.append(string)
-    return list(set(forms))
+    def lemmatize(self, string, index, exceptions, rules):
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(lookup_table[string][0])
+        if not forms:
+            forms.append(string)
+        return list(set(forms))

View File

@@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class DutchDefaults(Language.Defaults):
@@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return DutchLemmatizer(lookups)


 class Dutch(Language):

View File

@@ -1,10 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

+from ...lemmatizer import Lemmatizer
 from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV


-class DutchLemmatizer(object):
+class DutchLemmatizer(Lemmatizer):
     # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
     univ_pos_name_variants = {
         NOUN: "noun",
@@ -36,16 +37,6 @@
         "num": "num",
     }

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules or {}
-        self.lookup_table = lookup if lookup is not None else {}
-
     def __call__(self, string, univ_pos, morphology=None):
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
@@ -62,11 +53,13 @@
             # are not lemmatized. They are lowercased, however.
             return [string]
         # if string in self.lemma_index.get(univ_pos)
-        lemma_index = self.index.get(univ_pos, {})
+        index_table = self.lookups.get_table("lemma_index", {})
+        lemma_index = index_table.get(univ_pos, {})
         # string is already lemma
         if string in lemma_index:
             return [string]
-        exceptions = self.exc.get(univ_pos, {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        exceptions = exc_table.get(univ_pos, {})
         # string is irregular token contained in exceptions index.
         try:
             lemma = exceptions[string]
@@ -74,15 +67,14 @@
         except KeyError:
             pass
         # string corresponds to key in lookup table
-        lookup_table = self.lookup_table
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         looked_up_lemma = lookup_table.get(string)
         if looked_up_lemma and looked_up_lemma in lemma_index:
             return [looked_up_lemma]
-
-        forms, is_known = lemmatize(
-            string, lemma_index, exceptions, self.rules.get(univ_pos, [])
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        forms, is_known = self.lemmatize(
+            string, lemma_index, exceptions, rules_table.get(univ_pos, [])
         )
         # Back-off through remaining return value candidates.
         if forms:
             if is_known:
@@ -104,46 +96,25 @@
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
     def lookup(self, string, orth=None):
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         if orth is not None:
-            return self.lookup_table.get(orth, string)
+            return lookup_table.get(orth, string)
         else:
-            return self.lookup_table.get(string, string)
-
-    def noun(self, string, morphology=None):
-        return self(string, "noun", morphology)
-
-    def verb(self, string, morphology=None):
-        return self(string, "verb", morphology)
-
-    def adj(self, string, morphology=None):
-        return self(string, "adj", morphology)
-
-    def det(self, string, morphology=None):
-        return self(string, "det", morphology)
-
-    def pron(self, string, morphology=None):
-        return self(string, "pron", morphology)
-
-    def adp(self, string, morphology=None):
-        return self(string, "adp", morphology)
-
-    def punct(self, string, morphology=None):
-        return self(string, "punct", morphology)
-
-
-# Reimplemented to focus more on application of suffix rules and to return
-# as early as possible.
-def lemmatize(string, index, exceptions, rules):
-    # returns (forms, is_known: bool)
-    oov_forms = []
-    for old, new in rules:
-        if string.endswith(old):
-            form = string[: len(string) - len(old)] + new
-            if not form:
-                pass
-            elif form in index:
-                return [form], True  # True = Is known (is lemma)
-            else:
-                oov_forms.append(form)
-    return list(set(oov_forms)), False
+            return lookup_table.get(string, string)
+
+    # Reimplemented to focus more on application of suffix rules and to return
+    # as early as possible.
+    def lemmatize(self, string, index, exceptions, rules):
+        # returns (forms, is_known: bool)
+        oov_forms = []
+        for old, new in rules:
+            if string.endswith(old):
+                form = string[: len(string) - len(old)] + new
+                if not form:
+                    pass
+                elif form in index:
+                    return [form], True  # True = Is known (is lemma)
+                else:
+                    oov_forms.append(form)
+        return list(set(oov_forms)), False
View File

@@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...util import update_exc, add_lookups
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM


@@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
     tag_map = TAG_MAP

     @classmethod
-    def create_lemmatizer(cls, nlp=None, **kwargs):
-        return RussianLemmatizer()
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        if lookups is None:
+            lookups = Lookups()
+        return RussianLemmatizer(lookups)


 class Russian(Language):

View File

@@ -9,8 +9,8 @@ from ...compat import unicode_
 class RussianLemmatizer(Lemmatizer):
     _morph = None

-    def __init__(self):
-        super(RussianLemmatizer, self).__init__()
+    def __init__(self, lookups=None):
+        super(RussianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
@@ -102,19 +102,6 @@
             return symbols_to_str[univ_pos]
         return None

-    def is_base_form(self, univ_pos, morphology=None):
-        # TODO
-        raise NotImplementedError
-
-    def det(self, string, morphology=None):
-        return self(string, "det", morphology)
-
-    def num(self, string, morphology=None):
-        return self(string, "num", morphology)
-
-    def pron(self, string, morphology=None):
-        return self(string, "pron", morphology)
-
     def lookup(self, string, orth=None):
         analyses = self._morph.parse(string)
         if len(analyses) == 1:

View File

@@ -41,8 +41,7 @@ class BaseDefaults(object):
     def create_lemmatizer(cls, nlp=None, lookups=None):
         if lookups is None:
             lookups = cls.create_lookups(nlp=nlp)
-        rules, index, exc, lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(index, exc, rules, lookup)
+        return Lemmatizer(lookups=lookups)

     @classmethod
     def create_lookups(cls, nlp=None):

View File

@@ -1,8 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

 from collections import OrderedDict

 from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
+from .errors import Errors
+from .lookups import Lookups


 class Lemmatizer(object):
@@ -14,18 +17,32 @@ class Lemmatizer(object):
     """

     @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
+    def load(cls, *args, **kwargs):
+        raise NotImplementedError(Errors.E172)

-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
+    def __init__(self, lookups, *args, **kwargs):
+        """Initialize a Lemmatizer.
+
+        lookups (Lookups): The lookups object containing the (optional) tables
+            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
+        RETURNS (Lemmatizer): The newly constructed object.
+        """
+        if args or kwargs or not isinstance(lookups, Lookups):
+            raise ValueError(Errors.E173)
+        self.lookups = lookups

     def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
+        """Lemmatize a string.
+
+        string (unicode): The string to lemmatize, e.g. the token text.
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
+        RETURNS (list): The available lemmas for the string.
+        """
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if "lemma_rules" not in self.lookups:
+            return [lookup_table.get(string, string)]
         if univ_pos in (NOUN, "NOUN", "noun"):
             univ_pos = "noun"
         elif univ_pos in (VERB, "VERB", "verb"):
@@ -41,11 +58,14 @@ class Lemmatizer(object):
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return [string.lower()]
-        lemmas = lemmatize(
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lemmas = self.lemmatize(
             string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
+            index_table.get(univ_pos, {}),
+            exc_table.get(univ_pos, {}),
+            rules_table.get(univ_pos, []),
         )
         return lemmas
@@ -53,6 +73,10 @@ class Lemmatizer(object):
         """
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
+
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
         """
         if morphology is None:
             morphology = {}
@@ -90,6 +114,18 @@ class Lemmatizer(object):
     def adj(self, string, morphology=None):
         return self(string, "adj", morphology)

+    def det(self, string, morphology=None):
+        return self(string, "det", morphology)
+
+    def pron(self, string, morphology=None):
+        return self(string, "pron", morphology)
+
+    def adp(self, string, morphology=None):
+        return self(string, "adp", morphology)
+
+    def num(self, string, morphology=None):
+        return self(string, "num", morphology)
+
     def punct(self, string, morphology=None):
         return self(string, "punct", morphology)
@@ -103,37 +139,37 @@ class Lemmatizer(object):
         RETURNS (unicode): The lemma if the string was found, otherwise the
             original string.
         """
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         key = orth if orth is not None else string
-        if key in self.lookup_table:
-            return self.lookup_table[key]
+        if key in lookup_table:
+            return lookup_table[key]
         return string

-
-def lemmatize(string, index, exceptions, rules):
-    orig = string
-    string = string.lower()
-    forms = []
-    oov_forms = []
-    for old, new in rules:
-        if string.endswith(old):
-            form = string[: len(string) - len(old)] + new
-            if not form:
-                pass
-            elif form in index or not form.isalpha():
-                forms.append(form)
-            else:
-                oov_forms.append(form)
-    # Remove duplicates but preserve the ordering of applied "rules"
-    forms = list(OrderedDict.fromkeys(forms))
-    # Put exceptions at the front of the list, so they get priority.
-    # This is a dodgy heuristic -- but it's the best we can do until we get
-    # frequencies on this. We can at least prune out problematic exceptions,
-    # if they shadow more frequent analyses.
-    for form in exceptions.get(string, []):
-        if form not in forms:
-            forms.insert(0, form)
-    if not forms:
-        forms.extend(oov_forms)
-    if not forms:
-        forms.append(orig)
-    return forms
+    def lemmatize(self, string, index, exceptions, rules):
+        orig = string
+        string = string.lower()
+        forms = []
+        oov_forms = []
+        for old, new in rules:
+            if string.endswith(old):
+                form = string[: len(string) - len(old)] + new
+                if not form:
+                    pass
+                elif form in index or not form.isalpha():
+                    forms.append(form)
+                else:
+                    oov_forms.append(form)
+        # Remove duplicates but preserve the ordering of applied "rules"
+        forms = list(OrderedDict.fromkeys(forms))
+        # Put exceptions at the front of the list, so they get priority.
+        # This is a dodgy heuristic -- but it's the best we can do until we get
+        # frequencies on this. We can at least prune out problematic exceptions,
+        # if they shadow more frequent analyses.
+        for form in exceptions.get(string, []):
+            if form not in forms:
+                forms.insert(0, form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms:
+            forms.append(orig)
+        return forms

View File

@@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path
 from .strings import get_string_id


+UNSET = object()
+
+
 class Lookups(object):
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,
@@ -60,16 +63,20 @@
         self._tables[name] = table
         return table

-    def get_table(self, name):
-        """Get a table. Raises an error if the table doesn't exist.
+    def get_table(self, name, default=UNSET):
+        """Get a table. Raises an error if the table doesn't exist and no
+        default value is provided.

         name (unicode): Name of the table.
+        default: Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.

         DOCS: https://spacy.io/api/lookups#get_table
         """
         if name not in self._tables:
-            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+            if default == UNSET:
+                raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+            return default
         return self._tables[name]

     def remove_table(self, name):
@@ -111,6 +118,7 @@
         DOCS: https://spacy.io/api/lookups#from_bytes
         """
+        self._tables = OrderedDict()
         for key, value in srsly.msgpack_loads(bytes_data).items():
             self._tables[key] = Table(key)
             self._tables[key].update(value)
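As an aside, the get_table default and the table reset in from_bytes shown above combine roughly as follows. This is a hypothetical sketch against the Lookups API in this diff, with made-up table contents:

from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"going": "go"})
# A missing table no longer has to raise KeyError if a default is given.
assert lookups.get_table("lemma_rules", {}) == {}

# from_bytes starts from blank tables, so tables added before deserialization
# don't leak into the restored state.
other = Lookups()
other.add_table("lemma_rules", {"noun": [["s", ""]]})
other.from_bytes(lookups.to_bytes())
assert "lemma_rules" not in other
assert other.get_table("lemma_lookup")["going"] == "go"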

View File

@@ -5,13 +5,14 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Table
+from spacy.lookups import Lookups


 @pytest.fixture
 def lemmatizer():
-    lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
-    return Lemmatizer(lookup=lookup)
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
+    return Lemmatizer(lookups)


 @pytest.fixture

View File

@@ -5,11 +5,13 @@ import pytest
 from spacy.morphology import Morphology
 from spacy.strings import StringStore, get_string_id
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups


 @pytest.fixture
 def morphology():
-    return Morphology(StringStore(), {}, Lemmatizer())
+    lemmatizer = Lemmatizer(Lookups())
+    return Morphology(StringStore(), {}, lemmatizer)


 def test_init(morphology):

View File

@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.lang.en import English
-from spacy.lookups import Lookups
-
-
-def test_tagger_warns_no_lemma_lookups():
-    nlp = English()
-    nlp.vocab.lookups = Lookups()
-    assert not len(nlp.vocab.lookups)
-    tagger = nlp.create_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
-    nlp.add_pipe(tagger)
-    with pytest.warns(UserWarning):
-        nlp.begin_training()
-    nlp.vocab.lookups.add_table("lemma_lookup")
-    with pytest.warns(None) as record:
-        nlp.begin_training()
-        assert not record.list

View File

@@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
 from spacy.vocab import Vocab
 from spacy.language import Language
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span

 from ..util import get_doc, make_tempdir
@@ -173,8 +174,11 @@ def test_issue595():
     """Test lemmatization of base forms"""
     words = ["Do", "n't", "feed", "the", "dog"]
     tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
-    rules = {"verb": [["ed", "e"]]}
-    lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
+    lookups = Lookups()
+    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
+    lookups.add_table("lemma_index", {"verb": {}})
+    lookups.add_table("lemma_exc", {"verb": {}})
+    lemmatizer = Lemmatizer(lookups)
     vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
     doc = Doc(vocab, words=words)
     doc[2].tag_ = "VB"

View File

@@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
 from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@@ -91,10 +92,11 @@ def test_issue1375():
 def test_issue1387():
     tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
-    index = {"verb": ("cope", "cop")}
-    exc = {"verb": {"coping": ("cope",)}}
-    rules = {"verb": [["ing", ""]]}
-    lemmatizer = Lemmatizer(index, exc, rules)
+    lookups = Lookups()
+    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+    lemmatizer = Lemmatizer(lookups)
     vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
     doc = Doc(vocab, words=["coping"])
     doc[0].tag_ = "VBG"

View File

@@ -126,7 +126,8 @@ def test_issue1727():
     vectors = Vectors(data=data, keys=["I", "am", "Matt"])
     tagger = Tagger(Vocab())
     tagger.add_label("PRP")
-    tagger.begin_training()
+    with pytest.warns(UserWarning):
+        tagger.begin_training()
     assert tagger.cfg.get("pretrained_dims", 0) == 0
     tagger.vocab.vectors = vectors
     with make_tempdir() as path:

View File

@@ -22,7 +22,8 @@ def test_issue2564():
     """Test the tagger sets is_tagged correctly when used via Language.pipe."""
     nlp = Language()
     tagger = nlp.create_pipe("tagger")
-    tagger.begin_training()  # initialise weights
+    with pytest.warns(UserWarning):
+        tagger.begin_training()  # initialise weights
     nlp.add_pipe(tagger)
     doc = nlp("hello world")
     assert doc.is_tagged

View File

@@ -0,0 +1,49 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.tokens import Doc
+from spacy.language import Language
+from spacy.lookups import Lookups
+
+
+def test_lemmatizer_reflects_lookups_changes():
+    """Test for an issue that'd cause lookups available in a model loaded from
+    disk to not be reflected in the lemmatizer."""
+    nlp = Language()
+    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
+    table = nlp.vocab.lookups.add_table("lemma_lookup")
+    table["foo"] = "bar"
+    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+    table = nlp.vocab.lookups.get_table("lemma_lookup")
+    table["hello"] = "world"
+    # The update to the table should be reflected in the lemmatizer
+    assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
+    new_nlp = Language()
+    table = new_nlp.vocab.lookups.add_table("lemma_lookup")
+    table["hello"] = "hi"
+    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
+    nlp_bytes = nlp.to_bytes()
+    new_nlp.from_bytes(nlp_bytes)
+    # Make sure we have the previously saved lookup table
+    assert len(new_nlp.vocab.lookups) == 1
+    assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
+    assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
+    assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
+
+
+def test_tagger_warns_no_lemma_lookups():
+    nlp = Language()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    tagger = nlp.create_pipe("tagger")
+    with pytest.warns(UserWarning):
+        tagger.begin_training()
+    nlp.add_pipe(tagger)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lemma_lookup")
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+        assert not record.list

View File

@@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"]
 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    token_match = en_tokenizer.token_match
-    if token_match:
-        assert token_match(url)
+    assert en_tokenizer.token_match(url) is not None


 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    token_match = en_tokenizer.token_match
-    if token_match:
-        assert not token_match(url)
+    assert en_tokenizer.token_match(url) is None


 @pytest.mark.parametrize("url", URLS_BASIC)

View File

@@ -467,31 +467,6 @@ def expand_exc(excs, search, replace):
     return new_excs


-def get_lemma_tables(lookups):
-    """Load lemmatizer data from lookups table. Mostly used via
-    Language.Defaults.create_lemmatizer, but available as helper so it can be
-    reused in language classes that implement custom lemmatizers.
-
-    lookups (Lookups): The lookups table.
-    RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
-        tuple that can be used to initialize a Lemmatizer.
-    """
-    lemma_rules = {}
-    lemma_index = {}
-    lemma_exc = {}
-    lemma_lookup = None
-    if lookups is not None:
-        if "lemma_rules" in lookups:
-            lemma_rules = lookups.get_table("lemma_rules")
-        if "lemma_index" in lookups:
-            lemma_index = lookups.get_table("lemma_index")
-        if "lemma_exc" in lookups:
-            lemma_exc = lookups.get_table("lemma_exc")
-        if "lemma_lookup" in lookups:
-            lemma_lookup = lookups.get_table("lemma_lookup")
-    return (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
-
-
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError(Errors.E057)

View File

@@ -50,10 +50,10 @@ cdef class Vocab:
         """
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        tag_map = tag_map if tag_map is not None else {}
-       if lemmatizer in (None, True, False):
-           lemmatizer = Lemmatizer({}, {}, {})
        if lookups in (None, True, False):
            lookups = Lookups()
+       if lemmatizer in (None, True, False):
+           lemmatizer = Lemmatizer(lookups)
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()

View File

@@ -10,22 +10,40 @@ lookup tables.

 ## Lemmatizer.\_\_init\_\_ {#init tag="method"}

-Create a `Lemmatizer`.
+Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
+when a `Language` subclass and its `Vocab` is initialized.

 > #### Example
 >
 > ```python
 > from spacy.lemmatizer import Lemmatizer
-> lemmatizer = Lemmatizer()
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
 > ```
+>
+> For examples of the data format, see the
+> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.

 | Name | Type | Description |
-| ------------ | ------------- | ---------------------------------------------------------- |
-| `index` | dict / `None` | Inventory of lemmas in the language. |
-| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules` | dict / `None` | List of suffix rewrite rules. |
-| `lookup` | dict / `None` | Lookup table mapping string to their lemmas. |
-| **RETURNS** | `Lemmatizer` | The newly created object. |
+| -------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
+| **RETURNS** | `Lemmatizer` | The newly created object. |
+
+<Infobox title="Deprecation note" variant="danger">
+
+As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups)
+object containing tables for the different components. This makes it easier for
+spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows
+users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`.
+
+```diff
+- lemmatizer = Lemmatizer(rules=lemma_rules)
++ lemmatizer = Lemmatizer(lookups)
+```
+
+</Infobox>

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@@ -35,8 +53,10 @@ Lemmatize a string.
 >
 > ```python
 > from spacy.lemmatizer import Lemmatizer
-> rules = {"noun": [["s", ""]]}
-> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
+> from spacy.lookups import Lookups
+> lookups = Loookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
 > lemmas = lemmatizer("ducks", "NOUN")
 > assert lemmas == ["duck"]
 > ```
@@ -52,14 +72,13 @@ Lemmatize a string.

 Look up a lemma in the lookup table, if available. If no lemma is found, the
 original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
-the individual `Language` class.
+[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.

 > #### Example
 >
 > ```python
-> lookup = {"going": "go"}
-> lemmatizer = Lemmatizer(lookup=lookup)
+> lookups = Lookups()
+> lookups.add_table("lemma_lookup", {"going": "go"})
 > assert lemmatizer.lookup("going") == "go"
 > ```
@@ -91,9 +110,6 @@ lemmatization entirely.

 ## Attributes {#attributes}

 | Name | Type | Description |
-| ----------------------------------------- | ------------- | ---------------------------------------------------------- |
-| `index` | dict / `None` | Inventory of lemmas in the language. |
-| `exc` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules` | dict / `None` | List of suffix rewrite rules. |
-| `lookup_table` <Tag variant="new">2</Tag> | dict / `None` | The lemma lookup table, if available. |
+| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
+| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |