diff --git a/.travis.yml b/.travis.yml
index 957112e92..e3ce53024 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,6 @@ install:
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- - "pip install pytest pytest-timeout"
- "python -m pytest --tb=native spacy"
branches:
except:
diff --git a/requirements.txt b/requirements.txt
index ebe660b97..601b73559 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
jsonschema>=2.6.0,<3.1.0
# Development dependencies
cython>=0.25
-pytest>=4.0.0,<4.1.0
+pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
diff --git a/setup.cfg b/setup.cfg
index d188f123e..4d0a88c35 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -96,3 +96,7 @@ exclude =
__pycache__,
_tokenizer_exceptions_list.py,
spacy/__init__.py
+
+[tool:pytest]
+markers =
+ slow
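The new `[tool:pytest]` section registers the `slow` marker, presumably so the newer pytest allowed by requirements.txt (4.6+) doesn't emit unknown-mark warnings. A minimal sketch of how such a marker is typically declared and filtered; the test module below is hypothetical and not part of this diff:

```python
# hypothetical_test.py -- illustration only, not part of this change
import pytest


@pytest.mark.slow  # marker registered in setup.cfg under [tool:pytest]
def test_something_expensive():
    # long-running check; deselect with `pytest -m "not slow"`
    assert sum(range(1000)) == 499500
```

Running `pytest -m "not slow"` then skips the marked tests, while `pytest -m slow` runs only them.
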
diff --git a/spacy/errors.py b/spacy/errors.py
index 93d42aa4c..2ef5d1ce4 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -487,6 +487,12 @@ class Errors(object):
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid on_match callback argument: expected "
"callable or None, but got: {arg_type}")
+ E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
+ "Lemmatizer, initialize the class directly. See the docs for "
+ "details: https://spacy.io/api/lemmatizer")
+ E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
+ "Lookups containing the lemmatization tables. See the docs for "
+ "details: https://spacy.io/api/lemmatizer#init")
@add_codes
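E172 and E173 are the messages raised when the pre-v2.2 Lemmatizer API is used. A rough sketch of what now fails and what replaces it, based on the `Lemmatizer` and `Lookups` changes later in this diff:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# Old-style usage now fails loudly:
#   Lemmatizer.load("/some/path")  -> NotImplementedError (E172)
#   Lemmatizer({"dogs": "dog"})    -> ValueError (E173), argument is not a Lookups instance

# New-style initialization:
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
lemmatizer = Lemmatizer(lookups)
assert lemmatizer.lookup("dogs") == "dog"
```
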
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 5312e7474..16863e6d7 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
+from ...lookups import Lookups
from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups
class GreekDefaults(Language.Defaults):
@@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
- lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
- return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+ if lookups is None:
+ lookups = Lookups()
+ return GreekLemmatizer(lookups)
class Greek(Language):
diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py
index 647ea9c33..6f5b3999b 100644
--- a/spacy/lang/el/lemmatizer.py
+++ b/spacy/lang/el/lemmatizer.py
@@ -1,10 +1,10 @@
# coding: utf8
from __future__ import unicode_literals
-from ...symbols import NOUN, VERB, ADJ, PUNCT
+from ...lemmatizer import Lemmatizer
-class GreekLemmatizer(object):
+class GreekLemmatizer(Lemmatizer):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
@@ -15,64 +15,26 @@ class GreekLemmatizer(object):
not applicable for Greek language.
"""
- @classmethod
- def load(cls, path, index=None, exc=None, rules=None, lookup=None):
- return cls(index, exc, rules, lookup)
-
- def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
- self.index = index
- self.exc = exceptions
- self.rules = rules
- self.lookup_table = lookup if lookup is not None else {}
-
- def __call__(self, string, univ_pos, morphology=None):
- if not self.rules:
- return [self.lookup_table.get(string, string)]
- if univ_pos in (NOUN, "NOUN", "noun"):
- univ_pos = "noun"
- elif univ_pos in (VERB, "VERB", "verb"):
- univ_pos = "verb"
- elif univ_pos in (ADJ, "ADJ", "adj"):
- univ_pos = "adj"
- elif univ_pos in (PUNCT, "PUNCT", "punct"):
- univ_pos = "punct"
- else:
- return list(set([string.lower()]))
- lemmas = lemmatize(
- string,
- self.index.get(univ_pos, {}),
- self.exc.get(univ_pos, {}),
- self.rules.get(univ_pos, []),
- )
- return lemmas
-
- def lookup(self, string, orth=None):
- key = orth if orth is not None else string
- if key in self.lookup_table:
- return self.lookup_table[key]
- return string
-
-
-def lemmatize(string, index, exceptions, rules):
- string = string.lower()
- forms = []
- if string in index:
- forms.append(string)
- return forms
- forms.extend(exceptions.get(string, []))
- oov_forms = []
- if not forms:
- for old, new in rules:
- if string.endswith(old):
- form = string[: len(string) - len(old)] + new
- if not form:
- pass
- elif form in index or not form.isalpha():
- forms.append(form)
- else:
- oov_forms.append(form)
- if not forms:
- forms.extend(oov_forms)
- if not forms:
- forms.append(string)
- return list(set(forms))
+ def lemmatize(self, string, index, exceptions, rules):
+ string = string.lower()
+ forms = []
+ if string in index:
+ forms.append(string)
+ return forms
+ forms.extend(exceptions.get(string, []))
+ oov_forms = []
+ if not forms:
+ for old, new in rules:
+ if string.endswith(old):
+ form = string[: len(string) - len(old)] + new
+ if not form:
+ pass
+ elif form in index or not form.isalpha():
+ forms.append(form)
+ else:
+ oov_forms.append(form)
+ if not forms:
+ forms.extend(oov_forms)
+ if not forms:
+ forms.append(string)
+ return list(set(forms))
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index c9dd623fc..f56c8688a 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
+from ...lookups import Lookups
from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
@@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
- lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
- return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+ if lookups is None:
+ lookups = Lookups()
+ return FrenchLemmatizer(lookups)
class French(Language):
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index d98d3cb5b..79f4dd28d 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -1,12 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
+from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
-class FrenchLemmatizer(object):
+class FrenchLemmatizer(Lemmatizer):
"""
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
@@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
the lookup table.
"""
- @classmethod
- def load(cls, path, index=None, exc=None, rules=None, lookup=None):
- return cls(index, exc, rules, lookup)
-
- def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
- self.index = index
- self.exc = exceptions
- self.rules = rules
- self.lookup_table = lookup if lookup is not None else {}
-
def __call__(self, string, univ_pos, morphology=None):
- if not self.rules:
- return [self.lookup_table.get(string, string)]
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
+ if "lemma_rules" not in self.lookups:
+ return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
@@ -56,12 +48,14 @@ class FrenchLemmatizer(object):
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
- lemmas = lemmatize(
+ index_table = self.lookups.get_table("lemma_index", {})
+ exc_table = self.lookups.get_table("lemma_exc", {})
+ rules_table = self.lookups.get_table("lemma_rules", {})
+ lemmas = self.lemmatize(
string,
- self.index.get(univ_pos, {}),
- self.exc.get(univ_pos, {}),
- self.rules.get(univ_pos, []),
- self.lookup_table,
+ index_table.get(univ_pos, {}),
+ exc_table.get(univ_pos, {}),
+ rules_table.get(univ_pos, []),
)
return lemmas
@@ -115,33 +109,34 @@ class FrenchLemmatizer(object):
return self(string, "punct", morphology)
def lookup(self, string, orth=None):
- if orth is not None and orth in self.lookup_table:
- return self.lookup_table[orth][0]
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
+ if orth is not None and orth in lookup_table:
+ return lookup_table[orth][0]
return string
-
-def lemmatize(string, index, exceptions, rules, lookup):
- string = string.lower()
- forms = []
- if string in index:
- forms.append(string)
- return forms
- forms.extend(exceptions.get(string, []))
- oov_forms = []
- if not forms:
- for old, new in rules:
- if string.endswith(old):
- form = string[: len(string) - len(old)] + new
- if not form:
- pass
- elif form in index or not form.isalpha():
- forms.append(form)
- else:
- oov_forms.append(form)
- if not forms:
- forms.extend(oov_forms)
- if not forms and string in lookup.keys():
- forms.append(lookup[string][0])
- if not forms:
- forms.append(string)
- return list(set(forms))
+ def lemmatize(self, string, index, exceptions, rules):
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
+ string = string.lower()
+ forms = []
+ if string in index:
+ forms.append(string)
+ return forms
+ forms.extend(exceptions.get(string, []))
+ oov_forms = []
+ if not forms:
+ for old, new in rules:
+ if string.endswith(old):
+ form = string[: len(string) - len(old)] + new
+ if not form:
+ pass
+ elif form in index or not form.isalpha():
+ forms.append(form)
+ else:
+ oov_forms.append(form)
+ if not forms:
+ forms.extend(oov_forms)
+ if not forms and string in lookup_table.keys():
+ forms.append(lookup_table[string][0])
+ if not forms:
+ forms.append(string)
+ return list(set(forms))
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index f4037990b..074fd9133 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
+from ...lookups import Lookups
from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
@@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
- lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
- return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+ if lookups is None:
+ lookups = Lookups()
+ return DutchLemmatizer(lookups)
class Dutch(Language):
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py
index 08ae0b1f7..9a92bee44 100644
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -1,10 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
+from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
-class DutchLemmatizer(object):
+class DutchLemmatizer(Lemmatizer):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = {
NOUN: "noun",
@@ -36,16 +37,6 @@ class DutchLemmatizer(object):
"num": "num",
}
- @classmethod
- def load(cls, path, index=None, exc=None, rules=None, lookup=None):
- return cls(index, exc, rules, lookup)
-
- def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
- self.index = index
- self.exc = exceptions
- self.rules = rules or {}
- self.lookup_table = lookup if lookup is not None else {}
-
def __call__(self, string, univ_pos, morphology=None):
# Difference 1: self.rules is assumed to be non-None, so no
# 'is None' check required.
@@ -62,11 +53,13 @@ class DutchLemmatizer(object):
# are not lemmatized. They are lowercased, however.
return [string]
# if string in self.lemma_index.get(univ_pos)
- lemma_index = self.index.get(univ_pos, {})
+ index_table = self.lookups.get_table("lemma_index", {})
+ lemma_index = index_table.get(univ_pos, {})
# string is already lemma
if string in lemma_index:
return [string]
- exceptions = self.exc.get(univ_pos, {})
+ exc_table = self.lookups.get_table("lemma_exc", {})
+ exceptions = exc_table.get(univ_pos, {})
# string is irregular token contained in exceptions index.
try:
lemma = exceptions[string]
@@ -74,15 +67,14 @@ class DutchLemmatizer(object):
except KeyError:
pass
# string corresponds to key in lookup table
- lookup_table = self.lookup_table
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
return [looked_up_lemma]
-
- forms, is_known = lemmatize(
- string, lemma_index, exceptions, self.rules.get(univ_pos, [])
+ rules_table = self.lookups.get_table("lemma_rules", {})
+ forms, is_known = self.lemmatize(
+ string, lemma_index, exceptions, rules_table.get(univ_pos, [])
)
-
# Back-off through remaining return value candidates.
if forms:
if is_known:
@@ -104,46 +96,25 @@ class DutchLemmatizer(object):
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, string, orth=None):
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
if orth is not None:
- return self.lookup_table.get(orth, string)
+ return lookup_table.get(orth, string)
else:
- return self.lookup_table.get(string, string)
+ return lookup_table.get(string, string)
- def noun(self, string, morphology=None):
- return self(string, "noun", morphology)
-
- def verb(self, string, morphology=None):
- return self(string, "verb", morphology)
-
- def adj(self, string, morphology=None):
- return self(string, "adj", morphology)
-
- def det(self, string, morphology=None):
- return self(string, "det", morphology)
-
- def pron(self, string, morphology=None):
- return self(string, "pron", morphology)
-
- def adp(self, string, morphology=None):
- return self(string, "adp", morphology)
-
- def punct(self, string, morphology=None):
- return self(string, "punct", morphology)
-
-
-# Reimplemented to focus more on application of suffix rules and to return
-# as early as possible.
-def lemmatize(string, index, exceptions, rules):
- # returns (forms, is_known: bool)
- oov_forms = []
- for old, new in rules:
- if string.endswith(old):
- form = string[: len(string) - len(old)] + new
- if not form:
- pass
- elif form in index:
- return [form], True # True = Is known (is lemma)
- else:
- oov_forms.append(form)
- return list(set(oov_forms)), False
+ # Reimplemented to focus more on application of suffix rules and to return
+ # as early as possible.
+ def lemmatize(self, string, index, exceptions, rules):
+ # returns (forms, is_known: bool)
+ oov_forms = []
+ for old, new in rules:
+ if string.endswith(old):
+ form = string[: len(string) - len(old)] + new
+ if not form:
+ pass
+ elif form in index:
+ return [form], True # True = Is known (is lemma)
+ else:
+ oov_forms.append(form)
+ return list(set(oov_forms)), False
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 2699bad7e..f34fc5435 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
+from ...lookups import Lookups
from ...attrs import LANG, NORM
@@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
tag_map = TAG_MAP
@classmethod
- def create_lemmatizer(cls, nlp=None, **kwargs):
- return RussianLemmatizer()
+ def create_lemmatizer(cls, nlp=None, lookups=None):
+ if lookups is None:
+ lookups = Lookups()
+ return RussianLemmatizer(lookups)
class Russian(Language):
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 70120566b..96d32f59c 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -9,8 +9,8 @@ from ...compat import unicode_
class RussianLemmatizer(Lemmatizer):
_morph = None
- def __init__(self):
- super(RussianLemmatizer, self).__init__()
+ def __init__(self, lookups=None):
+ super(RussianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@@ -102,19 +102,6 @@ class RussianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
- def is_base_form(self, univ_pos, morphology=None):
- # TODO
- raise NotImplementedError
-
- def det(self, string, morphology=None):
- return self(string, "det", morphology)
-
- def num(self, string, morphology=None):
- return self(string, "num", morphology)
-
- def pron(self, string, morphology=None):
- return self(string, "pron", morphology)
-
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:
diff --git a/spacy/language.py b/spacy/language.py
index f7d530ad4..88022a1f2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -41,8 +41,7 @@ class BaseDefaults(object):
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = cls.create_lookups(nlp=nlp)
- rules, index, exc, lookup = util.get_lemma_tables(lookups)
- return Lemmatizer(index, exc, rules, lookup)
+ return Lemmatizer(lookups=lookups)
@classmethod
def create_lookups(cls, nlp=None):
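`BaseDefaults.create_lemmatizer` now hands the `Lookups` container straight to the `Lemmatizer` instead of unpacking individual tables. A small sketch of the invariant this preserves (the vocab's lookups and the lemmatizer share one object); treat it as an illustration, not an API guarantee:

```python
from spacy.language import Language

nlp = Language()
lemmatizer = nlp.Defaults.create_lemmatizer(nlp=nlp, lookups=nlp.vocab.lookups)
# The lemmatizer keeps a reference to the same Lookups container,
# so later changes to nlp.vocab.lookups are visible to it immediately.
assert lemmatizer.lookups is nlp.vocab.lookups
```
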
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 26c2227a0..d70e4cfc4 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -1,8 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
+
from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
+from .errors import Errors
+from .lookups import Lookups
class Lemmatizer(object):
@@ -14,18 +17,32 @@ class Lemmatizer(object):
"""
@classmethod
- def load(cls, path, index=None, exc=None, rules=None, lookup=None):
- return cls(index, exc, rules, lookup)
+ def load(cls, *args, **kwargs):
+ raise NotImplementedError(Errors.E172)
- def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
- self.index = index
- self.exc = exceptions
- self.rules = rules
- self.lookup_table = lookup if lookup is not None else {}
+ def __init__(self, lookups, *args, **kwargs):
+ """Initialize a Lemmatizer.
+
+ lookups (Lookups): The lookups object containing the (optional) tables
+ "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
+ RETURNS (Lemmatizer): The newly constructed object.
+ """
+ if args or kwargs or not isinstance(lookups, Lookups):
+ raise ValueError(Errors.E173)
+ self.lookups = lookups
def __call__(self, string, univ_pos, morphology=None):
- if not self.rules:
- return [self.lookup_table.get(string, string)]
+ """Lemmatize a string.
+
+ string (unicode): The string to lemmatize, e.g. the token text.
+ univ_pos (unicode / int): The token's universal part-of-speech tag.
+ morphology (dict): The token's morphological features following the
+ Universal Dependencies scheme.
+ RETURNS (list): The available lemmas for the string.
+ """
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
+ if "lemma_rules" not in self.lookups:
+ return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
@@ -41,11 +58,14 @@ class Lemmatizer(object):
# See Issue #435 for example of where this logic is requied.
if self.is_base_form(univ_pos, morphology):
return [string.lower()]
- lemmas = lemmatize(
+ index_table = self.lookups.get_table("lemma_index", {})
+ exc_table = self.lookups.get_table("lemma_exc", {})
+ rules_table = self.lookups.get_table("lemma_rules", {})
+ lemmas = self.lemmatize(
string,
- self.index.get(univ_pos, {}),
- self.exc.get(univ_pos, {}),
- self.rules.get(univ_pos, []),
+ index_table.get(univ_pos, {}),
+ exc_table.get(univ_pos, {}),
+ rules_table.get(univ_pos, []),
)
return lemmas
@@ -53,6 +73,10 @@ class Lemmatizer(object):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
+
+ univ_pos (unicode / int): The token's universal part-of-speech tag.
+ morphology (dict): The token's morphological features following the
+ Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
@@ -90,6 +114,18 @@ class Lemmatizer(object):
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
+ def det(self, string, morphology=None):
+ return self(string, "det", morphology)
+
+ def pron(self, string, morphology=None):
+ return self(string, "pron", morphology)
+
+ def adp(self, string, morphology=None):
+ return self(string, "adp", morphology)
+
+ def num(self, string, morphology=None):
+ return self(string, "num", morphology)
+
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
@@ -103,37 +139,37 @@ class Lemmatizer(object):
RETURNS (unicode): The lemma if the string was found, otherwise the
original string.
"""
+ lookup_table = self.lookups.get_table("lemma_lookup", {})
key = orth if orth is not None else string
- if key in self.lookup_table:
- return self.lookup_table[key]
+ if key in lookup_table:
+ return lookup_table[key]
return string
-
-def lemmatize(string, index, exceptions, rules):
- orig = string
- string = string.lower()
- forms = []
- oov_forms = []
- for old, new in rules:
- if string.endswith(old):
- form = string[: len(string) - len(old)] + new
- if not form:
- pass
- elif form in index or not form.isalpha():
- forms.append(form)
- else:
- oov_forms.append(form)
- # Remove duplicates but preserve the ordering of applied "rules"
- forms = list(OrderedDict.fromkeys(forms))
- # Put exceptions at the front of the list, so they get priority.
- # This is a dodgy heuristic -- but it's the best we can do until we get
- # frequencies on this. We can at least prune out problematic exceptions,
- # if they shadow more frequent analyses.
- for form in exceptions.get(string, []):
- if form not in forms:
- forms.insert(0, form)
- if not forms:
- forms.extend(oov_forms)
- if not forms:
- forms.append(orig)
- return forms
+ def lemmatize(self, string, index, exceptions, rules):
+ orig = string
+ string = string.lower()
+ forms = []
+ oov_forms = []
+ for old, new in rules:
+ if string.endswith(old):
+ form = string[: len(string) - len(old)] + new
+ if not form:
+ pass
+ elif form in index or not form.isalpha():
+ forms.append(form)
+ else:
+ oov_forms.append(form)
+ # Remove duplicates but preserve the ordering of applied "rules"
+ forms = list(OrderedDict.fromkeys(forms))
+ # Put exceptions at the front of the list, so they get priority.
+ # This is a dodgy heuristic -- but it's the best we can do until we get
+ # frequencies on this. We can at least prune out problematic exceptions,
+ # if they shadow more frequent analyses.
+ for form in exceptions.get(string, []):
+ if form not in forms:
+ forms.insert(0, form)
+ if not forms:
+ forms.extend(oov_forms)
+ if not forms:
+ forms.append(orig)
+ return forms
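The core of the change: `Lemmatizer` is constructed from a single `Lookups` object and pulls `lemma_index`, `lemma_exc`, `lemma_rules` and `lemma_lookup` out of it at call time. A minimal usage sketch, mirroring the table formats used by the tests in this diff:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_index", {"noun": ("duck",)})
lookups.add_table("lemma_exc", {"noun": {}})
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_lookup", {"going": "go"})

lemmatizer = Lemmatizer(lookups)
assert lemmatizer("ducks", "NOUN") == ["duck"]  # suffix rule "s" -> "" hits the index
assert lemmatizer.lookup("going") == "go"       # direct lookup-table access
```
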
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 05a60f289..bf250b4b4 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path
from .strings import get_string_id
+UNSET = object()
+
+
class Lookups(object):
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
@@ -60,16 +63,20 @@ class Lookups(object):
self._tables[name] = table
return table
- def get_table(self, name):
- """Get a table. Raises an error if the table doesn't exist.
+ def get_table(self, name, default=UNSET):
+ """Get a table. Raises an error if the table doesn't exist and no
+ default value is provided.
name (unicode): Name of the table.
+ default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
- raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+ if default == UNSET:
+ raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+ return default
return self._tables[name]
def remove_table(self, name):
@@ -111,6 +118,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#from_bytes
"""
+ self._tables = OrderedDict()
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key)
self._tables[key].update(value)
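`Lookups.get_table` gains an optional default so the lemmatizers above can ask for tables a language may not ship, without wrapping every access in a membership check. A small sketch of both behaviours:

```python
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})

assert lookups.get_table("lemma_lookup").get("dogs") == "dog"  # existing table
assert lookups.get_table("lemma_rules", {}) == {}              # missing -> default
# Without a default, the old behaviour is kept:
#   lookups.get_table("lemma_rules")  -> KeyError (E159)
```
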
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index b222f6bf0..120fb6e28 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -5,13 +5,14 @@ import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Table
+from spacy.lookups import Lookups
@pytest.fixture
def lemmatizer():
- lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
- return Lemmatizer(lookup=lookup)
+ lookups = Lookups()
+ lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
+ return Lemmatizer(lookups)
@pytest.fixture
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
index 4b8f0d754..41f807143 100644
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@@ -5,11 +5,13 @@ import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
@pytest.fixture
def morphology():
- return Morphology(StringStore(), {}, Lemmatizer())
+ lemmatizer = Lemmatizer(Lookups())
+ return Morphology(StringStore(), {}, lemmatizer)
def test_init(morphology):
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
deleted file mode 100644
index e843723e1..000000000
--- a/spacy/tests/pipeline/test_tagger.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.lang.en import English
-from spacy.lookups import Lookups
-
-
-def test_tagger_warns_no_lemma_lookups():
- nlp = English()
- nlp.vocab.lookups = Lookups()
- assert not len(nlp.vocab.lookups)
- tagger = nlp.create_pipe("tagger")
- with pytest.warns(UserWarning):
- tagger.begin_training()
- nlp.add_pipe(tagger)
- with pytest.warns(UserWarning):
- nlp.begin_training()
- nlp.vocab.lookups.add_table("lemma_lookup")
- with pytest.warns(None) as record:
- nlp.begin_training()
- assert not record.list
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index b3f347765..dca3d624f 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
from ..util import get_doc, make_tempdir
@@ -173,8 +174,11 @@ def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
- rules = {"verb": [["ed", "e"]]}
- lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
+ lookups = Lookups()
+ lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
+ lookups.add_table("lemma_index", {"verb": {}})
+ lookups.add_table("lemma_exc", {"verb": {}})
+ lemmatizer = Lemmatizer(lookups)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=words)
doc[2].tag_ = "VB"
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index a405d7b0f..889a5dc71 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@@ -91,10 +92,11 @@ def test_issue1375():
def test_issue1387():
tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
- index = {"verb": ("cope", "cop")}
- exc = {"verb": {"coping": ("cope",)}}
- rules = {"verb": [["ing", ""]]}
- lemmatizer = Lemmatizer(index, exc, rules)
+ lookups = Lookups()
+ lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+ lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+ lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+ lemmatizer = Lemmatizer(lookups)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=["coping"])
doc[0].tag_ = "VBG"
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 520090bb4..a9cf070cd 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -126,7 +126,8 @@ def test_issue1727():
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
tagger = Tagger(Vocab())
tagger.add_label("PRP")
- tagger.begin_training()
+ with pytest.warns(UserWarning):
+ tagger.begin_training()
assert tagger.cfg.get("pretrained_dims", 0) == 0
tagger.vocab.vectors = vectors
with make_tempdir() as path:
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index a0b1e2aac..e26ccbf4b 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -22,7 +22,8 @@ def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
nlp = Language()
tagger = nlp.create_pipe("tagger")
- tagger.begin_training() # initialise weights
+ with pytest.warns(UserWarning):
+ tagger.begin_training() # initialise weights
nlp.add_pipe(tagger)
doc = nlp("hello world")
assert doc.is_tagged
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
new file mode 100644
index 000000000..701222afc
--- /dev/null
+++ b/spacy/tests/test_lemmatizer.py
@@ -0,0 +1,49 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.tokens import Doc
+from spacy.language import Language
+from spacy.lookups import Lookups
+
+
+def test_lemmatizer_reflects_lookups_changes():
+ """Test for an issue that'd cause lookups available in a model loaded from
+ disk to not be reflected in the lemmatizer."""
+ nlp = Language()
+ assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
+ table = nlp.vocab.lookups.add_table("lemma_lookup")
+ table["foo"] = "bar"
+ assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+ table = nlp.vocab.lookups.get_table("lemma_lookup")
+ table["hello"] = "world"
+ # The update to the table should be reflected in the lemmatizer
+ assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
+ new_nlp = Language()
+ table = new_nlp.vocab.lookups.add_table("lemma_lookup")
+ table["hello"] = "hi"
+ assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
+ nlp_bytes = nlp.to_bytes()
+ new_nlp.from_bytes(nlp_bytes)
+ # Make sure we have the previously saved lookup table
+ assert len(new_nlp.vocab.lookups) == 1
+ assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
+ assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
+ assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+ assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
+
+
+def test_tagger_warns_no_lemma_lookups():
+ nlp = Language()
+ nlp.vocab.lookups = Lookups()
+ assert not len(nlp.vocab.lookups)
+ tagger = nlp.create_pipe("tagger")
+ with pytest.warns(UserWarning):
+ tagger.begin_training()
+ nlp.add_pipe(tagger)
+ with pytest.warns(UserWarning):
+ nlp.begin_training()
+ nlp.vocab.lookups.add_table("lemma_lookup")
+ with pytest.warns(None) as record:
+ nlp.begin_training()
+ assert not record.list
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ac64e99bd..59c2b3204 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
- token_match = en_tokenizer.token_match
- if token_match:
- assert token_match(url)
+ assert en_tokenizer.token_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
- token_match = en_tokenizer.token_match
- if token_match:
- assert not token_match(url)
+ assert en_tokenizer.token_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC)
diff --git a/spacy/util.py b/spacy/util.py
index ca2c416b1..c7ce38c3f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -467,31 +467,6 @@ def expand_exc(excs, search, replace):
return new_excs
-def get_lemma_tables(lookups):
- """Load lemmatizer data from lookups table. Mostly used via
- Language.Defaults.create_lemmatizer, but available as helper so it can be
- reused in language classes that implement custom lemmatizers.
-
- lookups (Lookups): The lookups table.
- RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
- tuple that can be used to initialize a Lemmatizer.
- """
- lemma_rules = {}
- lemma_index = {}
- lemma_exc = {}
- lemma_lookup = None
- if lookups is not None:
- if "lemma_rules" in lookups:
- lemma_rules = lookups.get_table("lemma_rules")
- if "lemma_index" in lookups:
- lemma_index = lookups.get_table("lemma_index")
- if "lemma_exc" in lookups:
- lemma_exc = lookups.get_table("lemma_exc")
- if "lemma_lookup" in lookups:
- lemma_lookup = lookups.get_table("lemma_lookup")
- return (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
-
-
def normalize_slice(length, start, stop, step=None):
if not (step is None or step == 1):
raise ValueError(Errors.E057)
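`util.get_lemma_tables` is removed; callers now pass the `Lookups` object through and let the lemmatizer fetch its own tables. A hedged migration sketch for external code that still unpacks the four tables:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})

# Before: rules, index, exc, lookup = util.get_lemma_tables(lookups)
#         lemmatizer = Lemmatizer(index, exc, rules, lookup)
# After:  the container is passed as-is; individual tables stay reachable.
lemmatizer = Lemmatizer(lookups)
assert lookups.get_table("lemma_rules", {}).get("noun") == [["s", ""]]
```
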
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 62c1791b9..c0d835553 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -50,10 +50,10 @@ cdef class Vocab:
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
- if lemmatizer in (None, True, False):
- lemmatizer = Lemmatizer({}, {}, {})
if lookups in (None, True, False):
lookups = Lookups()
+ if lemmatizer in (None, True, False):
+ lemmatizer = Lemmatizer(lookups)
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()
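In `Vocab.__init__`, the `Lookups` object is now created before the default `Lemmatizer` and handed to it, so the vocab and its lemmatizer can no longer hold separate tables. A small sketch of what that gives you; the assertion only touches the public `vocab.lookups` attribute:

```python
from spacy.vocab import Vocab
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
vocab = Vocab(lookups=lookups)  # default lemmatizer is built from these lookups
# Whatever is added to vocab.lookups later is what the lemmatizer reads at call time.
assert vocab.lookups.get_table("lemma_lookup")["dogs"] == "dog"
```
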
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 805e96b0f..7570e4ea2 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -10,22 +10,40 @@ lookup tables.
## Lemmatizer.\_\_init\_\_ {#init tag="method"}
-Create a `Lemmatizer`.
+Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
+when a `Language` subclass and its `Vocab` are initialized.
> #### Example
>
> ```python
> from spacy.lemmatizer import Lemmatizer
-> lemmatizer = Lemmatizer()
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
> ```
+>
+> For examples of the data format, see the
+> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
-| Name | Type | Description |
-| ------------ | ------------- | ---------------------------------------------------------- |
-| `index` | dict / `None` | Inventory of lemmas in the language. |
-| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules` | dict / `None` | List of suffix rewrite rules. |
-| `lookup` | dict / `None` | Lookup table mapping string to their lemmas. |
-| **RETURNS** | `Lemmatizer` | The newly created object. |
+| Name | Type | Description |
+| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
+| **RETURNS** | `Lemmatizer` | The newly created object. |
+
+
+
+As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups)
+object containing tables for the different components. This makes it easier for
+spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows
+users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`.
+
+```diff
+- lemmatizer = Lemmatizer(rules=lemma_rules)
++ lemmatizer = Lemmatizer(lookups)
+```
+
+
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@@ -35,8 +53,10 @@ Lemmatize a string.
>
> ```python
> from spacy.lemmatizer import Lemmatizer
-> rules = {"noun": [["s", ""]]}
-> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
> lemmas = lemmatizer("ducks", "NOUN")
> assert lemmas == ["duck"]
> ```
@@ -52,14 +72,13 @@ Lemmatize a string.
Look up a lemma in the lookup table, if available. If no lemma is found, the
original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
-the individual `Language` class.
+[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
> #### Example
>
> ```python
-> lookup = {"going": "go"}
-> lemmatizer = Lemmatizer(lookup=lookup)
+> lookups = Lookups()
+> lookups.add_table("lemma_lookup", {"going": "go"})
> assert lemmatizer.lookup("going") == "go"
> ```
@@ -91,9 +110,6 @@ lemmatization entirely.
## Attributes {#attributes}
-| Name | Type | Description |
-| ----------------------------------------- | ------------- | ---------------------------------------------------------- |
-| `index` | dict / `None` | Inventory of lemmas in the language. |
-| `exc` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules` | dict / `None` | List of suffix rewrite rules. |
-| `lookup_table` 2 | dict / `None` | The lemma lookup table, if available. |
+| Name | Type | Description |
+| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
+| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |