Refactor lemmatizer and data table integration (#4353)

* Move test

* Allow default in Lookups.get_table

* Start with blank tables in Lookups.from_bytes

* Refactor lemmatizer to hold an instance of Lookups (see the sketch below)

* Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk)
* Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency
* Remove old and unsupported Lemmatizer.load classmethod
* Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need

* Update tests and docs

* Fix more tests

* Fix lemmatizer

* Upgrade pytest to try and fix weird CI errors

* Try pytest 4.6.5
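
In short, the lemmatizer no longer takes separate table arguments but a single [`Lookups`](https://spacy.io/api/lookups) container, and it fetches the tables by name when it lemmatizes. A minimal sketch mirroring the updated docs further down in this diff:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# Old, now deprecated: Lemmatizer(index, exceptions, rules, lookup)
# New: one Lookups container holding the (optional) tables "lemma_rules",
# "lemma_index", "lemma_exc" and "lemma_lookup"
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_lookup", {"mice": "mouse"})
lemmatizer = Lemmatizer(lookups)

assert lemmatizer("ducks", "NOUN") == ["duck"]  # rule-based lemmatization
assert lemmatizer.lookup("mice") == "mouse"     # lookup-table lemmatization
```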
Author: Ines Montani, 2019-10-01 21:36:04 +02:00 (committed by Matthew Honnibal)
Parent: 3297a19545
Commit: cf65a80f36
27 changed files with 332 additions and 331 deletions

View File

@ -13,7 +13,6 @@ install:
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "pip install pytest pytest-timeout"
- "python -m pytest --tb=native spacy"
branches:
except:

View File

@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
jsonschema>=2.6.0,<3.1.0
# Development dependencies
cython>=0.25
pytest>=4.0.0,<4.1.0
pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0

View File

@ -96,3 +96,7 @@ exclude =
__pycache__,
_tokenizer_exceptions_list.py,
spacy/__init__.py
[tool:pytest]
markers =
slow

View File

@ -487,6 +487,12 @@ class Errors(object):
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid on_match callback argument: expected "
"callable or None, but got: {arg_type}")
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
"Lemmatizer, initialize the class directly. See the docs for "
"details: https://spacy.io/api/lemmatizer")
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")
@add_codes
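
A short sketch of what the two new error codes correspond to, assuming the refactored `Lemmatizer` shown later in this diff; the messages in the comments are abbreviated:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# E172: the old Lemmatizer.load classmethod is gone
try:
    Lemmatizer.load("/some/path")
except NotImplementedError as err:
    print(err)  # "[E172] The Lemmatizer.load classmethod is deprecated. ..."

# E173: passing the old positional tables instead of a Lookups instance
try:
    Lemmatizer({"noun": {}}, {"noun": {}}, {"noun": []})
except ValueError as err:
    print(err)  # "[E173] As of v2.2, the Lemmatizer is initialized with ..."

# Correct: pass a Lookups container
lemmatizer = Lemmatizer(Lookups())
```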

View File

@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class GreekDefaults(Language.Defaults):
@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return GreekLemmatizer(lookups)
class Greek(Language):

View File

@ -1,10 +1,10 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, VERB, ADJ, PUNCT
from ...lemmatizer import Lemmatizer
class GreekLemmatizer(object):
class GreekLemmatizer(Lemmatizer):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
@ -15,64 +15,26 @@ class GreekLemmatizer(object):
not applicable for Greek language.
"""
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"):
univ_pos = "adj"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
else:
return list(set([string.lower()]))
lemmas = lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
)
return lemmas
def lookup(self, string, orth=None):
key = orth if orth is not None else string
if key in self.lookup_table:
return self.lookup_table[key]
return string
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))
def lemmatize(self, string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))

View File

@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return FrenchLemmatizer(lookups)
class French(Language):

View File

@ -1,12 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class FrenchLemmatizer(object):
class FrenchLemmatizer(Lemmatizer):
"""
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
the lookup table.
"""
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
@ -56,12 +48,14 @@ class FrenchLemmatizer(object):
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
lemmas = lemmatize(
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lemmas = self.lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
self.lookup_table,
index_table.get(univ_pos, {}),
exc_table.get(univ_pos, {}),
rules_table.get(univ_pos, []),
)
return lemmas
@ -115,33 +109,34 @@ class FrenchLemmatizer(object):
return self(string, "punct", morphology)
def lookup(self, string, orth=None):
if orth is not None and orth in self.lookup_table:
return self.lookup_table[orth][0]
lookup_table = self.lookups.get_table("lemma_lookup", {})
if orth is not None and orth in lookup_table:
return lookup_table[orth][0]
return string
def lemmatize(string, index, exceptions, rules, lookup):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup.keys():
forms.append(lookup[string][0])
if not forms:
forms.append(string)
return list(set(forms))
def lemmatize(self, string, index, exceptions, rules):
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(lookup_table[string][0])
if not forms:
forms.append(string)
return list(set(forms))

View File

@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return DutchLemmatizer(lookups)
class Dutch(Language):

View File

@ -1,10 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
class DutchLemmatizer(object):
class DutchLemmatizer(Lemmatizer):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = {
NOUN: "noun",
@ -36,16 +37,6 @@ class DutchLemmatizer(object):
"num": "num",
}
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules or {}
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
# Difference 1: self.rules is assumed to be non-None, so no
# 'is None' check required.
@ -62,11 +53,13 @@ class DutchLemmatizer(object):
# are not lemmatized. They are lowercased, however.
return [string]
# if string in self.lemma_index.get(univ_pos)
lemma_index = self.index.get(univ_pos, {})
index_table = self.lookups.get_table("lemma_index", {})
lemma_index = index_table.get(univ_pos, {})
# string is already lemma
if string in lemma_index:
return [string]
exceptions = self.exc.get(univ_pos, {})
exc_table = self.lookups.get_table("lemma_exc", {})
exceptions = exc_table.get(univ_pos, {})
# string is irregular token contained in exceptions index.
try:
lemma = exceptions[string]
@ -74,15 +67,14 @@ class DutchLemmatizer(object):
except KeyError:
pass
# string corresponds to key in lookup table
lookup_table = self.lookup_table
lookup_table = self.lookups.get_table("lemma_lookup", {})
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
return [looked_up_lemma]
forms, is_known = lemmatize(
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
rules_table = self.lookups.get_table("lemma_rules", {})
forms, is_known = self.lemmatize(
string, lemma_index, exceptions, rules_table.get(univ_pos, [])
)
# Back-off through remaining return value candidates.
if forms:
if is_known:
@ -104,46 +96,25 @@ class DutchLemmatizer(object):
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, string, orth=None):
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
if orth is not None:
return self.lookup_table.get(orth, string)
return lookup_table.get(orth, string)
else:
return self.lookup_table.get(string, string)
return lookup_table.get(string, string)
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
def det(self, string, morphology=None):
return self(string, "det", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def adp(self, string, morphology=None):
return self(string, "adp", morphology)
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(string, index, exceptions, rules):
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index:
return [form], True # True = Is known (is lemma)
else:
oov_forms.append(form)
return list(set(oov_forms)), False
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(self, string, index, exceptions, rules):
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index:
return [form], True # True = Is known (is lemma)
else:
oov_forms.append(form)
return list(set(oov_forms)), False
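
Unlike the base class, `DutchLemmatizer.lemmatize` returns a `(forms, is_known)` tuple so that `__call__` can return early once a known lemma is found. A small sketch with made-up rule and index data:

```python
from spacy.lang.nl.lemmatizer import DutchLemmatizer
from spacy.lookups import Lookups

lemmatizer = DutchLemmatizer(Lookups())
rules = [["tten", "t"]]  # hypothetical suffix rule
index = {"kat": True}    # hypothetical index of known lemmas
forms, is_known = lemmatizer.lemmatize("katten", index, {}, rules)
assert forms == ["kat"]
assert is_known is True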

View File

@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
tag_map = TAG_MAP
@classmethod
def create_lemmatizer(cls, nlp=None, **kwargs):
return RussianLemmatizer()
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return RussianLemmatizer(lookups)
class Russian(Language):

View File

@ -9,8 +9,8 @@ from ...compat import unicode_
class RussianLemmatizer(Lemmatizer):
_morph = None
def __init__(self):
super(RussianLemmatizer, self).__init__()
def __init__(self, lookups=None):
super(RussianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@ -102,19 +102,6 @@ class RussianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
def is_base_form(self, univ_pos, morphology=None):
# TODO
raise NotImplementedError
def det(self, string, morphology=None):
return self(string, "det", morphology)
def num(self, string, morphology=None):
return self(string, "num", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:

View File

@ -41,8 +41,7 @@ class BaseDefaults(object):
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = cls.create_lookups(nlp=nlp)
rules, index, exc, lookup = util.get_lemma_tables(lookups)
return Lemmatizer(index, exc, rules, lookup)
return Lemmatizer(lookups=lookups)
@classmethod
def create_lookups(cls, nlp=None):

View File

@ -1,8 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
class Lemmatizer(object):
@ -14,18 +17,32 @@ class Lemmatizer(object):
"""
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def load(cls, *args, **kwargs):
raise NotImplementedError(Errors.E172)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules
self.lookup_table = lookup if lookup is not None else {}
def __init__(self, lookups, *args, **kwargs):
"""Initialize a Lemmatizer.
lookups (Lookups): The lookups object containing the (optional) tables
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
RETURNS (Lemmatizer): The newly constructed object.
"""
if args or kwargs or not isinstance(lookups, Lookups):
raise ValueError(Errors.E173)
self.lookups = lookups
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
"""Lemmatize a string.
string (unicode): The string to lemmatize, e.g. the token text.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
RETURNS (list): The available lemmas for the string.
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
@ -41,11 +58,14 @@ class Lemmatizer(object):
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return [string.lower()]
lemmas = lemmatize(
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lemmas = self.lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
index_table.get(univ_pos, {}),
exc_table.get(univ_pos, {}),
rules_table.get(univ_pos, []),
)
return lemmas
@ -53,6 +73,10 @@ class Lemmatizer(object):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
@ -90,6 +114,18 @@ class Lemmatizer(object):
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
def det(self, string, morphology=None):
return self(string, "det", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def adp(self, string, morphology=None):
return self(string, "adp", morphology)
def num(self, string, morphology=None):
return self(string, "num", morphology)
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
@ -103,37 +139,37 @@ class Lemmatizer(object):
RETURNS (unicode): The lemma if the string was found, otherwise the
original string.
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
key = orth if orth is not None else string
if key in self.lookup_table:
return self.lookup_table[key]
if key in lookup_table:
return lookup_table[key]
return string
def lemmatize(string, index, exceptions, rules):
orig = string
string = string.lower()
forms = []
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules"
forms = list(OrderedDict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,
# if they shadow more frequent analyses.
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(orig)
return forms
def lemmatize(self, string, index, exceptions, rules):
orig = string
string = string.lower()
forms = []
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules"
forms = list(OrderedDict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,
# if they shadow more frequent analyses.
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(orig)
return forms
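
Because the tables are now fetched inside `lookup()` and `__call__()` rather than stored on the instance, edits to the shared `Lookups` are picked up immediately, which is what the new regression test further down exercises. A minimal sketch:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
lemmatizer = Lemmatizer(lookups)
assert lemmatizer.lookup("dogs") == "dog"

# The table is re-fetched on every call, so later changes to the shared
# Lookups (e.g. after loading a model from disk) are reflected immediately
lookups.get_table("lemma_lookup")["cats"] = "cat"
assert lemmatizer.lookup("cats") == "cat"
```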

View File

@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path
from .strings import get_string_id
UNSET = object()
class Lookups(object):
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
@ -60,16 +63,20 @@ class Lookups(object):
self._tables[name] = table
return table
def get_table(self, name):
"""Get a table. Raises an error if the table doesn't exist.
def get_table(self, name, default=UNSET):
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (unicode): Name of the table.
default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
if default == UNSET:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return default
return self._tables[name]
def remove_table(self, name):
@ -111,6 +118,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#from_bytes
"""
self._tables = OrderedDict()
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key)
self._tables[key].update(value)
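
A quick sketch of the two `Lookups` changes above: `get_table` with a default, and `from_bytes` starting from blank tables (the round-trip here is illustrative only):

```python
from spacy.lookups import Lookups

lookups = Lookups()
# get_table now accepts a default instead of always raising KeyError
assert lookups.get_table("lemma_lookup", {}) == {}

lookups.add_table("lemma_lookup", {"going": "go"})
assert lookups.get_table("lemma_lookup")["going"] == "go"

# from_bytes resets to blank tables first, so stale tables don't survive
other = Lookups()
other.add_table("stale_table")
other.from_bytes(lookups.to_bytes())
assert "stale_table" not in other
assert other.get_table("lemma_lookup")["going"] == "go"
```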

View File

@ -5,13 +5,14 @@ import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Table
from spacy.lookups import Lookups
@pytest.fixture
def lemmatizer():
lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
return Lemmatizer(lookup=lookup)
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
return Lemmatizer(lookups)
@pytest.fixture

View File

@ -5,11 +5,13 @@ import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
@pytest.fixture
def morphology():
return Morphology(StringStore(), {}, Lemmatizer())
lemmatizer = Lemmatizer(Lookups())
return Morphology(StringStore(), {}, lemmatizer)
def test_init(morphology):

View File

@ -1,22 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
from spacy.lookups import Lookups
def test_tagger_warns_no_lemma_lookups():
nlp = English()
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
tagger = nlp.create_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training()
nlp.add_pipe(tagger)
with pytest.warns(UserWarning):
nlp.begin_training()
nlp.vocab.lookups.add_table("lemma_lookup")
with pytest.warns(None) as record:
nlp.begin_training()
assert not record.list

View File

@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
from ..util import get_doc, make_tempdir
@ -173,8 +174,11 @@ def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
rules = {"verb": [["ed", "e"]]}
lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
lookups = Lookups()
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
lookups.add_table("lemma_index", {"verb": {}})
lookups.add_table("lemma_exc", {"verb": {}})
lemmatizer = Lemmatizer(lookups)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=words)
doc[2].tag_ = "VB"

View File

@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@ -91,10 +92,11 @@ def test_issue1375():
def test_issue1387():
tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
index = {"verb": ("cope", "cop")}
exc = {"verb": {"coping": ("cope",)}}
rules = {"verb": [["ing", ""]]}
lemmatizer = Lemmatizer(index, exc, rules)
lookups = Lookups()
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
lemmatizer = Lemmatizer(lookups)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=["coping"])
doc[0].tag_ = "VBG"

View File

@ -126,7 +126,8 @@ def test_issue1727():
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
tagger = Tagger(Vocab())
tagger.add_label("PRP")
tagger.begin_training()
with pytest.warns(UserWarning):
tagger.begin_training()
assert tagger.cfg.get("pretrained_dims", 0) == 0
tagger.vocab.vectors = vectors
with make_tempdir() as path:

View File

@ -22,7 +22,8 @@ def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
nlp = Language()
tagger = nlp.create_pipe("tagger")
tagger.begin_training() # initialise weights
with pytest.warns(UserWarning):
tagger.begin_training() # initialise weights
nlp.add_pipe(tagger)
doc = nlp("hello world")
assert doc.is_tagged

View File

@ -0,0 +1,49 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.tokens import Doc
from spacy.language import Language
from spacy.lookups import Lookups
def test_lemmatizer_reflects_lookups_changes():
"""Test for an issue that'd cause lookups available in a model loaded from
disk to not be reflected in the lemmatizer."""
nlp = Language()
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
table = nlp.vocab.lookups.add_table("lemma_lookup")
table["foo"] = "bar"
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
table = nlp.vocab.lookups.get_table("lemma_lookup")
table["hello"] = "world"
# The update to the table should be reflected in the lemmatizer
assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
new_nlp = Language()
table = new_nlp.vocab.lookups.add_table("lemma_lookup")
table["hello"] = "hi"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
nlp_bytes = nlp.to_bytes()
new_nlp.from_bytes(nlp_bytes)
# Make sure we have the previously saved lookup table
assert len(new_nlp.vocab.lookups) == 1
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
def test_tagger_warns_no_lemma_lookups():
nlp = Language()
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
tagger = nlp.create_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training()
nlp.add_pipe(tagger)
with pytest.warns(UserWarning):
nlp.begin_training()
nlp.vocab.lookups.add_table("lemma_lookup")
with pytest.warns(None) as record:
nlp.begin_training()
assert not record.list

View File

@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
token_match = en_tokenizer.token_match
if token_match:
assert token_match(url)
assert en_tokenizer.token_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
token_match = en_tokenizer.token_match
if token_match:
assert not token_match(url)
assert en_tokenizer.token_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC)

View File

@ -467,31 +467,6 @@ def expand_exc(excs, search, replace):
return new_excs
def get_lemma_tables(lookups):
"""Load lemmatizer data from lookups table. Mostly used via
Language.Defaults.create_lemmatizer, but available as helper so it can be
reused in language classes that implement custom lemmatizers.
lookups (Lookups): The lookups table.
RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
tuple that can be used to initialize a Lemmatizer.
"""
lemma_rules = {}
lemma_index = {}
lemma_exc = {}
lemma_lookup = None
if lookups is not None:
if "lemma_rules" in lookups:
lemma_rules = lookups.get_table("lemma_rules")
if "lemma_index" in lookups:
lemma_index = lookups.get_table("lemma_index")
if "lemma_exc" in lookups:
lemma_exc = lookups.get_table("lemma_exc")
if "lemma_lookup" in lookups:
lemma_lookup = lookups.get_table("lemma_lookup")
return (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
def normalize_slice(length, start, stop, step=None):
if not (step is None or step == 1):
raise ValueError(Errors.E057)
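
With `util.get_lemma_tables` removed, a custom `create_lemmatizer` simply passes the `Lookups` container through, mirroring the language-defaults changes earlier in this diff (sketch only; in the language defaults this is a classmethod):

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# Before (removed helper):
#   rules, index, exc, lookup = util.get_lemma_tables(lookups)
#   lemmatizer = Lemmatizer(index, exc, rules, lookup)

# After: the Lookups container is passed through as-is
def create_lemmatizer(nlp=None, lookups=None):
    if lookups is None:
        lookups = Lookups()
    return Lemmatizer(lookups)

lemmatizer = create_lemmatizer()
```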

View File

@ -50,10 +50,10 @@ cdef class Vocab:
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {})
if lookups in (None, True, False):
lookups = Lookups()
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups)
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()

View File

@ -10,22 +10,40 @@ lookup tables.
## Lemmatizer.\_\_init\_\_ {#init tag="method"}
Create a `Lemmatizer`.
Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
when a `Language` subclass and its `Vocab` are initialized.
> #### Example
>
> ```python
> from spacy.lemmatizer import Lemmatizer
> lemmatizer = Lemmatizer()
> from spacy.lookups import Lookups
> lookups = Lookups()
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
> lemmatizer = Lemmatizer(lookups)
> ```
>
> For examples of the data format, see the
> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
| Name | Type | Description |
| ------------ | ------------- | ---------------------------------------------------------- |
| `index` | dict / `None` | Inventory of lemmas in the language. |
| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
| `rules` | dict / `None` | List of suffix rewrite rules. |
| `lookup` | dict / `None` | Lookup table mapping string to their lemmas. |
| **RETURNS** | `Lemmatizer` | The newly created object. |
| Name | Type | Description |
| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
| **RETURNS** | `Lemmatizer` | The newly created object. |
<Infobox title="Deprecation note" variant="danger">
As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups)
object containing tables for the different components. This makes it easier for
spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows
users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`.
```diff
- lemmatizer = Lemmatizer(rules=lemma_rules)
+ lemmatizer = Lemmatizer(lookups)
```
</Infobox>
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@ -35,8 +53,10 @@ Lemmatize a string.
>
> ```python
> from spacy.lemmatizer import Lemmatizer
> rules = {"noun": [["s", ""]]}
> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
> from spacy.lookups import Lookups
> lookups = Lookups()
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
> lemmatizer = Lemmatizer(lookups)
> lemmas = lemmatizer("ducks", "NOUN")
> assert lemmas == ["duck"]
> ```
@ -52,14 +72,13 @@ Lemmatize a string.
Look up a lemma in the lookup table, if available. If no lemma is found, the
original string is returned. Languages can provide a
[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
the individual `Language` class.
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
> #### Example
>
> ```python
> lookup = {"going": "go"}
> lemmatizer = Lemmatizer(lookup=lookup)
> lookups = Lookups()
> lookups.add_table("lemma_lookup", {"going": "go"})
> lemmatizer = Lemmatizer(lookups)
> assert lemmatizer.lookup("going") == "go"
> ```
@ -91,9 +110,6 @@ lemmatization entirely.
## Attributes {#attributes}
| Name | Type | Description |
| ----------------------------------------- | ------------- | ---------------------------------------------------------- |
| `index` | dict / `None` | Inventory of lemmas in the language. |
| `exc` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
| `rules` | dict / `None` | List of suffix rewrite rules. |
| `lookup_table` <Tag variant="new">2</Tag> | dict / `None` | The lemma lookup table, if available. |
| Name | Type | Description |
| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |