From cf65a80f367ced104f39edc0b44bbd9f069b6c96 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 1 Oct 2019 21:36:04 +0200 Subject: [PATCH] Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 --- .travis.yml | 1 - requirements.txt | 2 +- setup.cfg | 4 + spacy/errors.py | 6 + spacy/lang/el/__init__.py | 8 +- spacy/lang/el/lemmatizer.py | 88 ++++--------- spacy/lang/fr/__init__.py | 8 +- spacy/lang/fr/lemmatizer.py | 87 ++++++------ spacy/lang/nl/__init__.py | 8 +- spacy/lang/nl/lemmatizer.py | 85 ++++-------- spacy/lang/ru/__init__.py | 7 +- spacy/lang/ru/lemmatizer.py | 17 +-- spacy/language.py | 3 +- spacy/lemmatizer.py | 124 +++++++++++------- spacy/lookups.py | 14 +- spacy/tests/doc/test_creation.py | 7 +- spacy/tests/morphology/test_morph_features.py | 4 +- spacy/tests/pipeline/test_tagger.py | 22 ---- spacy/tests/regression/test_issue1-1000.py | 8 +- spacy/tests/regression/test_issue1001-1500.py | 10 +- spacy/tests/regression/test_issue1501-2000.py | 3 +- spacy/tests/regression/test_issue2501-3000.py | 3 +- spacy/tests/test_lemmatizer.py | 49 +++++++ spacy/tests/tokenizer/test_urls.py | 8 +- spacy/util.py | 25 ---- spacy/vocab.pyx | 4 +- website/docs/api/lemmatizer.md | 58 +++++--- 27 files changed, 332 insertions(+), 331 deletions(-) delete mode 
100644 spacy/tests/pipeline/test_tagger.py create mode 100644 spacy/tests/test_lemmatizer.py diff --git a/.travis.yml b/.travis.yml index 957112e92..e3ce53024 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,6 @@ install: - "pip install -e ." script: - "cat /proc/cpuinfo | grep flags | head -n 1" - - "pip install pytest pytest-timeout" - "python -m pytest --tb=native spacy" branches: except: diff --git a/requirements.txt b/requirements.txt index ebe660b97..601b73559 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4" jsonschema>=2.6.0,<3.1.0 # Development dependencies cython>=0.25 -pytest>=4.0.0,<4.1.0 +pytest>=4.6.5 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 diff --git a/setup.cfg b/setup.cfg index d188f123e..4d0a88c35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -96,3 +96,7 @@ exclude = __pycache__, _tokenizer_exceptions_list.py, spacy/__init__.py + +[tool:pytest] +markers = + slow diff --git a/spacy/errors.py b/spacy/errors.py index 93d42aa4c..2ef5d1ce4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -487,6 +487,12 @@ class Errors(object): E170 = ("Cannot apply transition {name}: invalid for the current state.") E171 = ("Matcher.add received invalid on_match callback argument: expected " "callable or None, but got: {arg_type}") + E172 = ("The Lemmatizer.load classmethod is deprecated. To create a " + "Lemmatizer, initialize the class directly. See the docs for " + "details: https://spacy.io/api/lemmatizer") + E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of " + "Lookups containing the lemmatization tables. 
See the docs for " + "details: https://spacy.io/api/lemmatizer#init") @add_codes diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 5312e7474..16863e6d7 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language +from ...lookups import Lookups from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups, get_lemma_tables +from ...util import update_exc, add_lookups class GreekDefaults(Language.Defaults): @@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): - lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups) - return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) + if lookups is None: + lookups = Lookups() + return GreekLemmatizer(lookups) class Greek(Language): diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 647ea9c33..6f5b3999b 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,10 +1,10 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import NOUN, VERB, ADJ, PUNCT +from ...lemmatizer import Lemmatizer -class GreekLemmatizer(object): +class GreekLemmatizer(Lemmatizer): """ Greek language lemmatizer applies the default rule based lemmatization procedure with some modifications for better Greek language support. @@ -15,64 +15,26 @@ class GreekLemmatizer(object): not applicable for Greek language. 
""" - @classmethod - def load(cls, path, index=None, exc=None, rules=None, lookup=None): - return cls(index, exc, rules, lookup) - - def __init__(self, index=None, exceptions=None, rules=None, lookup=None): - self.index = index - self.exc = exceptions - self.rules = rules - self.lookup_table = lookup if lookup is not None else {} - - def __call__(self, string, univ_pos, morphology=None): - if not self.rules: - return [self.lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - else: - return list(set([string.lower()])) - lemmas = lemmatize( - string, - self.index.get(univ_pos, {}), - self.exc.get(univ_pos, {}), - self.rules.get(univ_pos, []), - ) - return lemmas - - def lookup(self, string, orth=None): - key = orth if orth is not None else string - if key in self.lookup_table: - return self.lookup_table[key] - return string - - -def lemmatize(string, index, exceptions, rules): - string = string.lower() - forms = [] - if string in index: - forms.append(string) - return forms - forms.extend(exceptions.get(string, [])) - oov_forms = [] - if not forms: - for old, new in rules: - if string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) - if not forms: - forms.extend(oov_forms) - if not forms: - forms.append(string) - return list(set(forms)) + def lemmatize(self, string, index, exceptions, rules): + string = string.lower() + forms = [] + if string in index: + forms.append(string) + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index 
or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms: + forms.append(string) + return list(set(forms)) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index c9dd623fc..f56c8688a 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language +from ...lookups import Lookups from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups, get_lemma_tables +from ...util import update_exc, add_lookups class FrenchDefaults(Language.Defaults): @@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): - lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups) - return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) + if lookups is None: + lookups = Lookups() + return FrenchLemmatizer(lookups) class French(Language): diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index d98d3cb5b..79f4dd28d 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,12 +1,13 @@ # coding: utf8 from __future__ import unicode_literals +from ...lemmatizer import Lemmatizer from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import SCONJ, CCONJ from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos -class FrenchLemmatizer(object): +class FrenchLemmatizer(Lemmatizer): """ French language lemmatizer applies the default rule based lemmatization procedure with some modifications for better French language support. @@ -16,19 +17,10 @@ class FrenchLemmatizer(object): the lookup table. 
""" - @classmethod - def load(cls, path, index=None, exc=None, rules=None, lookup=None): - return cls(index, exc, rules, lookup) - - def __init__(self, index=None, exceptions=None, rules=None, lookup=None): - self.index = index - self.exc = exceptions - self.rules = rules - self.lookup_table = lookup if lookup is not None else {} - def __call__(self, string, univ_pos, morphology=None): - if not self.rules: - return [self.lookup_table.get(string, string)] + lookup_table = self.lookups.get_table("lemma_lookup", {}) + if "lemma_rules" not in self.lookups: + return [lookup_table.get(string, string)] if univ_pos in (NOUN, "NOUN", "noun"): univ_pos = "noun" elif univ_pos in (VERB, "VERB", "verb"): @@ -56,12 +48,14 @@ class FrenchLemmatizer(object): # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): return list(set([string.lower()])) - lemmas = lemmatize( + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lemmas = self.lemmatize( string, - self.index.get(univ_pos, {}), - self.exc.get(univ_pos, {}), - self.rules.get(univ_pos, []), - self.lookup_table, + index_table.get(univ_pos, {}), + exc_table.get(univ_pos, {}), + rules_table.get(univ_pos, []), ) return lemmas @@ -115,33 +109,34 @@ class FrenchLemmatizer(object): return self(string, "punct", morphology) def lookup(self, string, orth=None): - if orth is not None and orth in self.lookup_table: - return self.lookup_table[orth][0] + lookup_table = self.lookups.get_table("lemma_lookup", {}) + if orth is not None and orth in lookup_table: + return lookup_table[orth][0] return string - -def lemmatize(string, index, exceptions, rules, lookup): - string = string.lower() - forms = [] - if string in index: - forms.append(string) - return forms - forms.extend(exceptions.get(string, [])) - oov_forms = [] - if not forms: - for old, new in rules: - if 
string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) - if not forms: - forms.extend(oov_forms) - if not forms and string in lookup.keys(): - forms.append(lookup[string][0]) - if not forms: - forms.append(string) - return list(set(forms)) + def lemmatize(self, string, index, exceptions, rules): + lookup_table = self.lookups.get_table("lemma_lookup", {}) + string = string.lower() + forms = [] + if string in index: + forms.append(string) + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms and string in lookup_table.keys(): + forms.append(lookup_table[string][0]) + if not forms: + forms.append(string) + return list(set(forms)) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index f4037990b..074fd9133 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language +from ...lookups import Lookups from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups, get_lemma_tables +from ...util import update_exc, add_lookups class DutchDefaults(Language.Defaults): @@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): - lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups) - return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) + if lookups is None: + lookups = Lookups() + return 
DutchLemmatizer(lookups) class Dutch(Language): diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 08ae0b1f7..9a92bee44 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,10 +1,11 @@ # coding: utf8 from __future__ import unicode_literals +from ...lemmatizer import Lemmatizer from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV -class DutchLemmatizer(object): +class DutchLemmatizer(Lemmatizer): # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. univ_pos_name_variants = { NOUN: "noun", @@ -36,16 +37,6 @@ class DutchLemmatizer(object): "num": "num", } - @classmethod - def load(cls, path, index=None, exc=None, rules=None, lookup=None): - return cls(index, exc, rules, lookup) - - def __init__(self, index=None, exceptions=None, rules=None, lookup=None): - self.index = index - self.exc = exceptions - self.rules = rules or {} - self.lookup_table = lookup if lookup is not None else {} - def __call__(self, string, univ_pos, morphology=None): # Difference 1: self.rules is assumed to be non-None, so no # 'is None' check required. @@ -62,11 +53,13 @@ class DutchLemmatizer(object): # are not lemmatized. They are lowercased, however. return [string] # if string in self.lemma_index.get(univ_pos) - lemma_index = self.index.get(univ_pos, {}) + index_table = self.lookups.get_table("lemma_index", {}) + lemma_index = index_table.get(univ_pos, {}) # string is already lemma if string in lemma_index: return [string] - exceptions = self.exc.get(univ_pos, {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + exceptions = exc_table.get(univ_pos, {}) # string is irregular token contained in exceptions index. 
try: lemma = exceptions[string] @@ -74,15 +67,14 @@ class DutchLemmatizer(object): except KeyError: pass # string corresponds to key in lookup table - lookup_table = self.lookup_table + lookup_table = self.lookups.get_table("lemma_lookup", {}) looked_up_lemma = lookup_table.get(string) if looked_up_lemma and looked_up_lemma in lemma_index: return [looked_up_lemma] - - forms, is_known = lemmatize( - string, lemma_index, exceptions, self.rules.get(univ_pos, []) + rules_table = self.lookups.get_table("lemma_rules", {}) + forms, is_known = self.lemmatize( + string, lemma_index, exceptions, rules_table.get(univ_pos, []) ) - # Back-off through remaining return value candidates. if forms: if is_known: @@ -104,46 +96,25 @@ class DutchLemmatizer(object): # used to search the lookup table. This is necessary because our lookup # table consists entirely of lowercase keys. def lookup(self, string, orth=None): + lookup_table = self.lookups.get_table("lemma_lookup", {}) string = string.lower() if orth is not None: - return self.lookup_table.get(orth, string) + return lookup_table.get(orth, string) else: - return self.lookup_table.get(string, string) + return lookup_table.get(string, string) - def noun(self, string, morphology=None): - return self(string, "noun", morphology) - - def verb(self, string, morphology=None): - return self(string, "verb", morphology) - - def adj(self, string, morphology=None): - return self(string, "adj", morphology) - - def det(self, string, morphology=None): - return self(string, "det", morphology) - - def pron(self, string, morphology=None): - return self(string, "pron", morphology) - - def adp(self, string, morphology=None): - return self(string, "adp", morphology) - - def punct(self, string, morphology=None): - return self(string, "punct", morphology) - - -# Reimplemented to focus more on application of suffix rules and to return -# as early as possible. 
-def lemmatize(string, index, exceptions, rules): - # returns (forms, is_known: bool) - oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index: - return [form], True # True = Is known (is lemma) - else: - oov_forms.append(form) - return list(set(oov_forms)), False + # Reimplemented to focus more on application of suffix rules and to return + # as early as possible. + def lemmatize(self, string, index, exceptions, rules): + # returns (forms, is_known: bool) + oov_forms = [] + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index: + return [form], True # True = Is known (is lemma) + else: + oov_forms.append(form) + return list(set(oov_forms)), False diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 2699bad7e..f34fc5435 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...util import update_exc, add_lookups from ...language import Language +from ...lookups import Lookups from ...attrs import LANG, NORM @@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults): tag_map = TAG_MAP @classmethod - def create_lemmatizer(cls, nlp=None, **kwargs): - return RussianLemmatizer() + def create_lemmatizer(cls, nlp=None, lookups=None): + if lookups is None: + lookups = Lookups() + return RussianLemmatizer(lookups) class Russian(Language): diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 70120566b..96d32f59c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -9,8 +9,8 @@ from ...compat import unicode_ class RussianLemmatizer(Lemmatizer): _morph = None - def __init__(self): - super(RussianLemmatizer, self).__init__() + def __init__(self, lookups=None): + super(RussianLemmatizer, 
self).__init__(lookups) try: from pymorphy2 import MorphAnalyzer except ImportError: @@ -102,19 +102,6 @@ class RussianLemmatizer(Lemmatizer): return symbols_to_str[univ_pos] return None - def is_base_form(self, univ_pos, morphology=None): - # TODO - raise NotImplementedError - - def det(self, string, morphology=None): - return self(string, "det", morphology) - - def num(self, string, morphology=None): - return self(string, "num", morphology) - - def pron(self, string, morphology=None): - return self(string, "pron", morphology) - def lookup(self, string, orth=None): analyses = self._morph.parse(string) if len(analyses) == 1: diff --git a/spacy/language.py b/spacy/language.py index f7d530ad4..88022a1f2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -41,8 +41,7 @@ class BaseDefaults(object): def create_lemmatizer(cls, nlp=None, lookups=None): if lookups is None: lookups = cls.create_lookups(nlp=nlp) - rules, index, exc, lookup = util.get_lemma_tables(lookups) - return Lemmatizer(index, exc, rules, lookup) + return Lemmatizer(lookups=lookups) @classmethod def create_lookups(cls, nlp=None): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 26c2227a0..d70e4cfc4 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,8 +1,11 @@ # coding: utf8 from __future__ import unicode_literals + from collections import OrderedDict from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN +from .errors import Errors +from .lookups import Lookups class Lemmatizer(object): @@ -14,18 +17,32 @@ class Lemmatizer(object): """ @classmethod - def load(cls, path, index=None, exc=None, rules=None, lookup=None): - return cls(index, exc, rules, lookup) + def load(cls, *args, **kwargs): + raise NotImplementedError(Errors.E172) - def __init__(self, index=None, exceptions=None, rules=None, lookup=None): - self.index = index - self.exc = exceptions - self.rules = rules - self.lookup_table = lookup if lookup is not None else {} + def __init__(self, lookups, *args, **kwargs): + 
"""Initialize a Lemmatizer. + + lookups (Lookups): The lookups object containing the (optional) tables + "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". + RETURNS (Lemmatizer): The newly constructed object. + """ + if args or kwargs or not isinstance(lookups, Lookups): + raise ValueError(Errors.E173) + self.lookups = lookups def __call__(self, string, univ_pos, morphology=None): - if not self.rules: - return [self.lookup_table.get(string, string)] + """Lemmatize a string. + + string (unicode): The string to lemmatize, e.g. the token text. + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + RETURNS (list): The available lemmas for the string. + """ + lookup_table = self.lookups.get_table("lemma_lookup", {}) + if "lemma_rules" not in self.lookups: + return [lookup_table.get(string, string)] if univ_pos in (NOUN, "NOUN", "noun"): univ_pos = "noun" elif univ_pos in (VERB, "VERB", "verb"): @@ -41,11 +58,14 @@ class Lemmatizer(object): # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): return [string.lower()] - lemmas = lemmatize( + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lemmas = self.lemmatize( string, - self.index.get(univ_pos, {}), - self.exc.get(univ_pos, {}), - self.rules.get(univ_pos, []), + index_table.get(univ_pos, {}), + exc_table.get(univ_pos, {}), + rules_table.get(univ_pos, []), ) return lemmas @@ -53,6 +73,10 @@ class Lemmatizer(object): """ Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. 
""" if morphology is None: morphology = {} @@ -90,6 +114,18 @@ class Lemmatizer(object): def adj(self, string, morphology=None): return self(string, "adj", morphology) + def det(self, string, morphology=None): + return self(string, "det", morphology) + + def pron(self, string, morphology=None): + return self(string, "pron", morphology) + + def adp(self, string, morphology=None): + return self(string, "adp", morphology) + + def num(self, string, morphology=None): + return self(string, "num", morphology) + def punct(self, string, morphology=None): return self(string, "punct", morphology) @@ -103,37 +139,37 @@ class Lemmatizer(object): RETURNS (unicode): The lemma if the string was found, otherwise the original string. """ + lookup_table = self.lookups.get_table("lemma_lookup", {}) key = orth if orth is not None else string - if key in self.lookup_table: - return self.lookup_table[key] + if key in lookup_table: + return lookup_table[key] return string - -def lemmatize(string, index, exceptions, rules): - orig = string - string = string.lower() - forms = [] - oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) - # Remove duplicates but preserve the ordering of applied "rules" - forms = list(OrderedDict.fromkeys(forms)) - # Put exceptions at the front of the list, so they get priority. - # This is a dodgy heuristic -- but it's the best we can do until we get - # frequencies on this. We can at least prune out problematic exceptions, - # if they shadow more frequent analyses. 
- for form in exceptions.get(string, []): - if form not in forms: - forms.insert(0, form) - if not forms: - forms.extend(oov_forms) - if not forms: - forms.append(orig) - return forms + def lemmatize(self, string, index, exceptions, rules): + orig = string + string = string.lower() + forms = [] + oov_forms = [] + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + # Remove duplicates but preserve the ordering of applied "rules" + forms = list(OrderedDict.fromkeys(forms)) + # Put exceptions at the front of the list, so they get priority. + # This is a dodgy heuristic -- but it's the best we can do until we get + # frequencies on this. We can at least prune out problematic exceptions, + # if they shadow more frequent analyses. + for form in exceptions.get(string, []): + if form not in forms: + forms.insert(0, form) + if not forms: + forms.extend(oov_forms) + if not forms: + forms.append(orig) + return forms diff --git a/spacy/lookups.py b/spacy/lookups.py index 05a60f289..bf250b4b4 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path from .strings import get_string_id +UNSET = object() + + class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization data or tokenizer exception lists. Lookups are available via vocab.lookups, @@ -60,16 +63,20 @@ class Lookups(object): self._tables[name] = table return table - def get_table(self, name): - """Get a table. Raises an error if the table doesn't exist. + def get_table(self, name, default=UNSET): + """Get a table. Raises an error if the table doesn't exist and no + default value is provided. name (unicode): Name of the table. + default: Optional default value to return if table doesn't exist. RETURNS (Table): The table. 
DOCS: https://spacy.io/api/lookups#get_table """ if name not in self._tables: - raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + if default == UNSET: + raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + return default return self._tables[name] def remove_table(self, name): @@ -111,6 +118,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_bytes """ + self._tables = OrderedDict() for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) self._tables[key].update(value) diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index b222f6bf0..120fb6e28 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -5,13 +5,14 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Table +from spacy.lookups import Lookups @pytest.fixture def lemmatizer(): - lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"}) - return Lemmatizer(lookup=lookup) + lookups = Lookups() + lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"}) + return Lemmatizer(lookups) @pytest.fixture diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 4b8f0d754..41f807143 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -5,11 +5,13 @@ import pytest from spacy.morphology import Morphology from spacy.strings import StringStore, get_string_id from spacy.lemmatizer import Lemmatizer +from spacy.lookups import Lookups @pytest.fixture def morphology(): - return Morphology(StringStore(), {}, Lemmatizer()) + lemmatizer = Lemmatizer(Lookups()) + return Morphology(StringStore(), {}, lemmatizer) def test_init(morphology): diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py deleted file mode 
100644 index e843723e1..000000000 --- a/spacy/tests/pipeline/test_tagger.py +++ /dev/null @@ -1,22 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -from spacy.lang.en import English -from spacy.lookups import Lookups - - -def test_tagger_warns_no_lemma_lookups(): - nlp = English() - nlp.vocab.lookups = Lookups() - assert not len(nlp.vocab.lookups) - tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() - nlp.add_pipe(tagger) - with pytest.warns(UserWarning): - nlp.begin_training() - nlp.vocab.lookups.add_table("lemma_lookup") - with pytest.warns(None) as record: - nlp.begin_training() - assert not record.list diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index b3f347765..dca3d624f 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf from spacy.vocab import Vocab from spacy.language import Language from spacy.lemmatizer import Lemmatizer +from spacy.lookups import Lookups from spacy.tokens import Doc, Span from ..util import get_doc, make_tempdir @@ -173,8 +174,11 @@ def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] tag_map = {"VB": {POS: VERB, VerbForm_inf: True}} - rules = {"verb": [["ed", "e"]]} - lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules) + lookups = Lookups() + lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) + lookups.add_table("lemma_index", {"verb": {}}) + lookups.add_table("lemma_exc", {"verb": {}}) + lemmatizer = Lemmatizer(lookups) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=words) doc[2].tag_ = "VB" diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index a405d7b0f..889a5dc71 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ 
b/spacy/tests/regression/test_issue1001-1500.py @@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer from spacy.lemmatizer import Lemmatizer +from spacy.lookups import Lookups from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part @@ -91,10 +92,11 @@ def test_issue1375(): def test_issue1387(): tag_map = {"VBG": {POS: VERB, VerbForm_part: True}} - index = {"verb": ("cope", "cop")} - exc = {"verb": {"coping": ("cope",)}} - rules = {"verb": [["ing", ""]]} - lemmatizer = Lemmatizer(index, exc, rules) + lookups = Lookups() + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + lemmatizer = Lemmatizer(lookups) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=["coping"]) doc[0].tag_ = "VBG" diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 520090bb4..a9cf070cd 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -126,7 +126,8 @@ def test_issue1727(): vectors = Vectors(data=data, keys=["I", "am", "Matt"]) tagger = Tagger(Vocab()) tagger.add_label("PRP") - tagger.begin_training() + with pytest.warns(UserWarning): + tagger.begin_training() assert tagger.cfg.get("pretrained_dims", 0) == 0 tagger.vocab.vectors = vectors with make_tempdir() as path: diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index a0b1e2aac..e26ccbf4b 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -22,7 +22,8 @@ def test_issue2564(): """Test the tagger sets is_tagged correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.create_pipe("tagger") - tagger.begin_training() # initialise weights + with 
pytest.warns(UserWarning): + tagger.begin_training() # initialise weights nlp.add_pipe(tagger) doc = nlp("hello world") assert doc.is_tagged diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py new file mode 100644 index 000000000..701222afc --- /dev/null +++ b/spacy/tests/test_lemmatizer.py @@ -0,0 +1,49 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.tokens import Doc +from spacy.language import Language +from spacy.lookups import Lookups + + +def test_lemmatizer_reflects_lookups_changes(): + """Test for an issue that'd cause lookups available in a model loaded from + disk to not be reflected in the lemmatizer.""" + nlp = Language() + assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo" + table = nlp.vocab.lookups.add_table("lemma_lookup") + table["foo"] = "bar" + assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar" + table = nlp.vocab.lookups.get_table("lemma_lookup") + table["hello"] = "world" + # The update to the table should be reflected in the lemmatizer + assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world" + new_nlp = Language() + table = new_nlp.vocab.lookups.add_table("lemma_lookup") + table["hello"] = "hi" + assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi" + nlp_bytes = nlp.to_bytes() + new_nlp.from_bytes(nlp_bytes) + # Make sure we have the previously saved lookup table + assert len(new_nlp.vocab.lookups) == 1 + assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 + assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" + assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" + assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" + + +def test_tagger_warns_no_lemma_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + tagger = nlp.create_pipe("tagger") + with pytest.warns(UserWarning): + tagger.begin_training() + nlp.add_pipe(tagger) + with pytest.warns(UserWarning): + 
nlp.begin_training() + nlp.vocab.lookups.add_table("lemma_lookup") + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index ac64e99bd..59c2b3204 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - token_match = en_tokenizer.token_match - if token_match: - assert token_match(url) + assert en_tokenizer.token_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - token_match = en_tokenizer.token_match - if token_match: - assert not token_match(url) + assert en_tokenizer.token_match(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/util.py b/spacy/util.py index ca2c416b1..c7ce38c3f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -467,31 +467,6 @@ def expand_exc(excs, search, replace): return new_excs -def get_lemma_tables(lookups): - """Load lemmatizer data from lookups table. Mostly used via - Language.Defaults.create_lemmatizer, but available as helper so it can be - reused in language classes that implement custom lemmatizers. - - lookups (Lookups): The lookups table. - RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup) - tuple that can be used to initialize a Lemmatizer. 
- """ - lemma_rules = {} - lemma_index = {} - lemma_exc = {} - lemma_lookup = None - if lookups is not None: - if "lemma_rules" in lookups: - lemma_rules = lookups.get_table("lemma_rules") - if "lemma_index" in lookups: - lemma_index = lookups.get_table("lemma_index") - if "lemma_exc" in lookups: - lemma_exc = lookups.get_table("lemma_exc") - if "lemma_lookup" in lookups: - lemma_lookup = lookups.get_table("lemma_lookup") - return (lemma_rules, lemma_index, lemma_exc, lemma_lookup) - - def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError(Errors.E057) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 62c1791b9..c0d835553 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -50,10 +50,10 @@ cdef class Vocab: """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} - if lemmatizer in (None, True, False): - lemmatizer = Lemmatizer({}, {}, {}) if lookups in (None, True, False): lookups = Lookups() + if lemmatizer in (None, True, False): + lemmatizer = Lemmatizer(lookups) self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 805e96b0f..7570e4ea2 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -10,22 +10,40 @@ lookup tables. ## Lemmatizer.\_\_init\_\_ {#init tag="method"} -Create a `Lemmatizer`. +Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy +when a `Language` subclass and its `Vocab` is initialized. 
> #### Example > > ```python > from spacy.lemmatizer import Lemmatizer -> lemmatizer = Lemmatizer() +> from spacy.lookups import Lookups +> lookups = Lookups() +> lookups.add_table("lemma_rules", {"noun": [["s", ""]]}) +> lemmatizer = Lemmatizer(lookups) > ``` +> +> For examples of the data format, see the +> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo. -| Name | Type | Description | -| ------------ | ------------- | ---------------------------------------------------------- | -| `index` | dict / `None` | Inventory of lemmas in the language. | -| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. | -| `rules` | dict / `None` | List of suffix rewrite rules. | -| `lookup` | dict / `None` | Lookup table mapping string to their lemmas. | -| **RETURNS** | `Lemmatizer` | The newly created object. | +| Name | Type | Description | +| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | +| **RETURNS** | `Lemmatizer` | The newly created object. | + + + +As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups) +object containing tables for the different components. This makes it easier for +spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows +users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`. + +```diff +- lemmatizer = Lemmatizer(rules=lemma_rules) ++ lemmatizer = Lemmatizer(lookups) +``` + + ## Lemmatizer.\_\_call\_\_ {#call tag="method"} @@ -35,8 +53,10 @@ Lemmatize a string.
> > ```python > from spacy.lemmatizer import Lemmatizer -> rules = {"noun": [["s", ""]]} -> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules) +> from spacy.lookups import Lookups +> lookups = Lookups() +> lookups.add_table("lemma_rules", {"noun": [["s", ""]]}) +> lemmatizer = Lemmatizer(lookups) > lemmas = lemmatizer("ducks", "NOUN") > assert lemmas == ["duck"] > ``` @@ -52,14 +72,13 @@ Lemmatize a string. Look up a lemma in the lookup table, if available. If no lemma is found, the original string is returned. Languages can provide a -[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on -the individual `Language` class. +[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. > #### Example > > ```python -> lookup = {"going": "go"} -> lemmatizer = Lemmatizer(lookup=lookup) +> lookups = Lookups() +> lookups.add_table("lemma_lookup", {"going": "go"}) > assert lemmatizer.lookup("going") == "go" > ``` @@ -91,9 +110,6 @@ lemmatization entirely. ## Attributes {#attributes} -| Name | Type | Description | -| ----------------------------------------- | ------------- | ---------------------------------------------------------- | -| `index` | dict / `None` | Inventory of lemmas in the language. | -| `exc` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. | -| `rules` | dict / `None` | List of suffix rewrite rules. | -| `lookup_table` <Tag variant="new">2</Tag> | dict / `None` | The lemma lookup table, if available. | +| Name | Type | Description | +| -------------------------------------- | ------------------------- | --------------------------------------------------------------- | +| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |