mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Refactor lemmatizer and data table integration (#4353)
* Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5
This commit is contained in:
parent
3297a19545
commit
cf65a80f36
|
@ -13,7 +13,6 @@ install:
|
|||
- "pip install -e ."
|
||||
script:
|
||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
||||
- "pip install pytest pytest-timeout"
|
||||
- "python -m pytest --tb=native spacy"
|
||||
branches:
|
||||
except:
|
||||
|
|
|
@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
|
|||
jsonschema>=2.6.0,<3.1.0
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.0.0,<4.1.0
|
||||
pytest>=4.6.5
|
||||
pytest-timeout>=1.3.0,<2.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.5.0,<3.6.0
|
||||
|
|
|
@ -96,3 +96,7 @@ exclude =
|
|||
__pycache__,
|
||||
_tokenizer_exceptions_list.py,
|
||||
spacy/__init__.py
|
||||
|
||||
[tool:pytest]
|
||||
markers =
|
||||
slow
|
||||
|
|
|
@ -487,6 +487,12 @@ class Errors(object):
|
|||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
||||
"callable or None, but got: {arg_type}")
|
||||
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
||||
"Lemmatizer, initialize the class directly. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer")
|
||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||
"Lookups containing the lemmatization tables. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer#init")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups, get_lemma_tables
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class GreekDefaults(Language.Defaults):
|
||||
|
@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):
|
|||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
|
||||
return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return GreekLemmatizer(lookups)
|
||||
|
||||
|
||||
class Greek(Language):
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, VERB, ADJ, PUNCT
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
class GreekLemmatizer(object):
|
||||
class GreekLemmatizer(Lemmatizer):
|
||||
"""
|
||||
Greek language lemmatizer applies the default rule based lemmatization
|
||||
procedure with some modifications for better Greek language support.
|
||||
|
@ -15,45 +15,7 @@ class GreekLemmatizer(object):
|
|||
not applicable for Greek language.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
univ_pos = "verb"
|
||||
elif univ_pos in (ADJ, "ADJ", "adj"):
|
||||
univ_pos = "adj"
|
||||
elif univ_pos in (PUNCT, "PUNCT", "punct"):
|
||||
univ_pos = "punct"
|
||||
else:
|
||||
return list(set([string.lower()]))
|
||||
lemmas = lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
key = orth if orth is not None else string
|
||||
if key in self.lookup_table:
|
||||
return self.lookup_table[key]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
|
|
|
@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups, get_lemma_tables
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
|
@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):
|
|||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
|
||||
return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return FrenchLemmatizer(lookups)
|
||||
|
||||
|
||||
class French(Language):
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||
from ...symbols import SCONJ, CCONJ
|
||||
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
|
||||
|
||||
class FrenchLemmatizer(object):
|
||||
class FrenchLemmatizer(Lemmatizer):
|
||||
"""
|
||||
French language lemmatizer applies the default rule based lemmatization
|
||||
procedure with some modifications for better French language support.
|
||||
|
@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
|
|||
the lookup table.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if "lemma_rules" not in self.lookups:
|
||||
return [lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
|
@ -56,12 +48,14 @@ class FrenchLemmatizer(object):
|
|||
# See Issue #435 for example of where this logic is requied.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return list(set([string.lower()]))
|
||||
lemmas = lemmatize(
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
lemmas = self.lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
self.lookup_table,
|
||||
index_table.get(univ_pos, {}),
|
||||
exc_table.get(univ_pos, {}),
|
||||
rules_table.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
|
@ -115,12 +109,13 @@ class FrenchLemmatizer(object):
|
|||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
if orth is not None and orth in self.lookup_table:
|
||||
return self.lookup_table[orth][0]
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if orth is not None and orth in lookup_table:
|
||||
return lookup_table[orth][0]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules, lookup):
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
|
@ -140,8 +135,8 @@ def lemmatize(string, index, exceptions, rules, lookup):
|
|||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup.keys():
|
||||
forms.append(lookup[string][0])
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(lookup_table[string][0])
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
return list(set(forms))
|
||||
|
|
|
@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups, get_lemma_tables
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
|
@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):
|
|||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
|
||||
return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return DutchLemmatizer(lookups)
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
||||
|
||||
|
||||
class DutchLemmatizer(object):
|
||||
class DutchLemmatizer(Lemmatizer):
|
||||
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
|
||||
univ_pos_name_variants = {
|
||||
NOUN: "noun",
|
||||
|
@ -36,16 +37,6 @@ class DutchLemmatizer(object):
|
|||
"num": "num",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules or {}
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
# Difference 1: self.rules is assumed to be non-None, so no
|
||||
# 'is None' check required.
|
||||
|
@ -62,11 +53,13 @@ class DutchLemmatizer(object):
|
|||
# are not lemmatized. They are lowercased, however.
|
||||
return [string]
|
||||
# if string in self.lemma_index.get(univ_pos)
|
||||
lemma_index = self.index.get(univ_pos, {})
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
lemma_index = index_table.get(univ_pos, {})
|
||||
# string is already lemma
|
||||
if string in lemma_index:
|
||||
return [string]
|
||||
exceptions = self.exc.get(univ_pos, {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
exceptions = exc_table.get(univ_pos, {})
|
||||
# string is irregular token contained in exceptions index.
|
||||
try:
|
||||
lemma = exceptions[string]
|
||||
|
@ -74,15 +67,14 @@ class DutchLemmatizer(object):
|
|||
except KeyError:
|
||||
pass
|
||||
# string corresponds to key in lookup table
|
||||
lookup_table = self.lookup_table
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
looked_up_lemma = lookup_table.get(string)
|
||||
if looked_up_lemma and looked_up_lemma in lemma_index:
|
||||
return [looked_up_lemma]
|
||||
|
||||
forms, is_known = lemmatize(
|
||||
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
forms, is_known = self.lemmatize(
|
||||
string, lemma_index, exceptions, rules_table.get(univ_pos, [])
|
||||
)
|
||||
|
||||
# Back-off through remaining return value candidates.
|
||||
if forms:
|
||||
if is_known:
|
||||
|
@ -104,37 +96,16 @@ class DutchLemmatizer(object):
|
|||
# used to search the lookup table. This is necessary because our lookup
|
||||
# table consists entirely of lowercase keys.
|
||||
def lookup(self, string, orth=None):
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
if orth is not None:
|
||||
return self.lookup_table.get(orth, string)
|
||||
return lookup_table.get(orth, string)
|
||||
else:
|
||||
return self.lookup_table.get(string, string)
|
||||
return lookup_table.get(string, string)
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string, morphology=None):
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
# as early as possible.
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
# as early as possible.
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
# returns (forms, is_known: bool)
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
|
|
|
@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
|
||||
|
||||
|
@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
|
|||
tag_map = TAG_MAP
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, **kwargs):
|
||||
return RussianLemmatizer()
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return RussianLemmatizer(lookups)
|
||||
|
||||
|
||||
class Russian(Language):
|
||||
|
|
|
@ -9,8 +9,8 @@ from ...compat import unicode_
|
|||
class RussianLemmatizer(Lemmatizer):
|
||||
_morph = None
|
||||
|
||||
def __init__(self):
|
||||
super(RussianLemmatizer, self).__init__()
|
||||
def __init__(self, lookups=None):
|
||||
super(RussianLemmatizer, self).__init__(lookups)
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
except ImportError:
|
||||
|
@ -102,19 +102,6 @@ class RussianLemmatizer(Lemmatizer):
|
|||
return symbols_to_str[univ_pos]
|
||||
return None
|
||||
|
||||
def is_base_form(self, univ_pos, morphology=None):
|
||||
# TODO
|
||||
raise NotImplementedError
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def num(self, string, morphology=None):
|
||||
return self(string, "num", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
analyses = self._morph.parse(string)
|
||||
if len(analyses) == 1:
|
||||
|
|
|
@ -41,8 +41,7 @@ class BaseDefaults(object):
|
|||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = cls.create_lookups(nlp=nlp)
|
||||
rules, index, exc, lookup = util.get_lemma_tables(lookups)
|
||||
return Lemmatizer(index, exc, rules, lookup)
|
||||
return Lemmatizer(lookups=lookups)
|
||||
|
||||
@classmethod
|
||||
def create_lookups(cls, nlp=None):
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
|
||||
from .errors import Errors
|
||||
from .lookups import Lookups
|
||||
|
||||
|
||||
class Lemmatizer(object):
|
||||
|
@ -14,18 +17,32 @@ class Lemmatizer(object):
|
|||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
def load(cls, *args, **kwargs):
|
||||
raise NotImplementedError(Errors.E172)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
def __init__(self, lookups, *args, **kwargs):
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
||||
RETURNS (Lemmatizer): The newly constructed object.
|
||||
"""
|
||||
if args or kwargs or not isinstance(lookups, Lookups):
|
||||
raise ValueError(Errors.E173)
|
||||
self.lookups = lookups
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
"""Lemmatize a string.
|
||||
|
||||
string (unicode): The string to lemmatize, e.g. the token text.
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if "lemma_rules" not in self.lookups:
|
||||
return [lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
|
@ -41,11 +58,14 @@ class Lemmatizer(object):
|
|||
# See Issue #435 for example of where this logic is requied.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return [string.lower()]
|
||||
lemmas = lemmatize(
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
lemmas = self.lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
index_table.get(univ_pos, {}),
|
||||
exc_table.get(univ_pos, {}),
|
||||
rules_table.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
|
@ -53,6 +73,10 @@ class Lemmatizer(object):
|
|||
"""
|
||||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
"""
|
||||
if morphology is None:
|
||||
morphology = {}
|
||||
|
@ -90,6 +114,18 @@ class Lemmatizer(object):
|
|||
def adj(self, string, morphology=None):
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string, morphology=None):
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def num(self, string, morphology=None):
|
||||
return self(string, "num", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
|
@ -103,13 +139,13 @@ class Lemmatizer(object):
|
|||
RETURNS (unicode): The lemma if the string was found, otherwise the
|
||||
original string.
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
key = orth if orth is not None else string
|
||||
if key in self.lookup_table:
|
||||
return self.lookup_table[key]
|
||||
if key in lookup_table:
|
||||
return lookup_table[key]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
|
|
|
@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path
|
|||
from .strings import get_string_id
|
||||
|
||||
|
||||
UNSET = object()
|
||||
|
||||
|
||||
class Lookups(object):
|
||||
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||
data or tokenizer exception lists. Lookups are available via vocab.lookups,
|
||||
|
@ -60,16 +63,20 @@ class Lookups(object):
|
|||
self._tables[name] = table
|
||||
return table
|
||||
|
||||
def get_table(self, name):
|
||||
"""Get a table. Raises an error if the table doesn't exist.
|
||||
def get_table(self, name, default=UNSET):
|
||||
"""Get a table. Raises an error if the table doesn't exist and no
|
||||
default value is provided.
|
||||
|
||||
name (unicode): Name of the table.
|
||||
default: Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#get_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
if default == UNSET:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return default
|
||||
return self._tables[name]
|
||||
|
||||
def remove_table(self, name):
|
||||
|
@ -111,6 +118,7 @@ class Lookups(object):
|
|||
|
||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
||||
"""
|
||||
self._tables = OrderedDict()
|
||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||
self._tables[key] = Table(key)
|
||||
self._tables[key].update(value)
|
||||
|
|
|
@ -5,13 +5,14 @@ import pytest
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Table
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
|
||||
return Lemmatizer(lookup=lookup)
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
|
||||
return Lemmatizer(lookups)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
|
@ -5,11 +5,13 @@ import pytest
|
|||
from spacy.morphology import Morphology
|
||||
from spacy.strings import StringStore, get_string_id
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def morphology():
|
||||
return Morphology(StringStore(), {}, Lemmatizer())
|
||||
lemmatizer = Lemmatizer(Lookups())
|
||||
return Morphology(StringStore(), {}, lemmatizer)
|
||||
|
||||
|
||||
def test_init(morphology):
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.en import English
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
def test_tagger_warns_no_lemma_lookups():
|
||||
nlp = English()
|
||||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
nlp.add_pipe(tagger)
|
||||
with pytest.warns(UserWarning):
|
||||
nlp.begin_training()
|
||||
nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
with pytest.warns(None) as record:
|
||||
nlp.begin_training()
|
||||
assert not record.list
|
|
@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.tokens import Doc, Span
|
||||
|
||||
from ..util import get_doc, make_tempdir
|
||||
|
@ -173,8 +174,11 @@ def test_issue595():
|
|||
"""Test lemmatization of base forms"""
|
||||
words = ["Do", "n't", "feed", "the", "dog"]
|
||||
tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
|
||||
rules = {"verb": [["ed", "e"]]}
|
||||
lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
|
||||
lookups.add_table("lemma_index", {"verb": {}})
|
||||
lookups.add_table("lemma_exc", {"verb": {}})
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
doc = Doc(vocab, words=words)
|
||||
doc[2].tag_ = "VB"
|
||||
|
|
|
@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
||||
|
||||
|
||||
|
@ -91,10 +92,11 @@ def test_issue1375():
|
|||
|
||||
def test_issue1387():
|
||||
tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
|
||||
index = {"verb": ("cope", "cop")}
|
||||
exc = {"verb": {"coping": ("cope",)}}
|
||||
rules = {"verb": [["ing", ""]]}
|
||||
lemmatizer = Lemmatizer(index, exc, rules)
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||
doc = Doc(vocab, words=["coping"])
|
||||
doc[0].tag_ = "VBG"
|
||||
|
|
|
@ -126,6 +126,7 @@ def test_issue1727():
|
|||
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
|
||||
tagger = Tagger(Vocab())
|
||||
tagger.add_label("PRP")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
||||
tagger.vocab.vectors = vectors
|
||||
|
|
|
@ -22,6 +22,7 @@ def test_issue2564():
|
|||
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
|
||||
nlp = Language()
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training() # initialise weights
|
||||
nlp.add_pipe(tagger)
|
||||
doc = nlp("hello world")
|
||||
|
|
49
spacy/tests/test_lemmatizer.py
Normal file
49
spacy/tests/test_lemmatizer.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
|
||||
|
||||
def test_lemmatizer_reflects_lookups_changes():
|
||||
"""Test for an issue that'd cause lookups available in a model loaded from
|
||||
disk to not be reflected in the lemmatizer."""
|
||||
nlp = Language()
|
||||
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
|
||||
table = nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
table["foo"] = "bar"
|
||||
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
table = nlp.vocab.lookups.get_table("lemma_lookup")
|
||||
table["hello"] = "world"
|
||||
# The update to the table should be reflected in the lemmatizer
|
||||
assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
|
||||
new_nlp = Language()
|
||||
table = new_nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
table["hello"] = "hi"
|
||||
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
new_nlp.from_bytes(nlp_bytes)
|
||||
# Make sure we have the previously saved lookup table
|
||||
assert len(new_nlp.vocab.lookups) == 1
|
||||
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
|
||||
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
|
||||
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
|
||||
|
||||
|
||||
def test_tagger_warns_no_lemma_lookups():
|
||||
nlp = Language()
|
||||
nlp.vocab.lookups = Lookups()
|
||||
assert not len(nlp.vocab.lookups)
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training()
|
||||
nlp.add_pipe(tagger)
|
||||
with pytest.warns(UserWarning):
|
||||
nlp.begin_training()
|
||||
nlp.vocab.lookups.add_table("lemma_lookup")
|
||||
with pytest.warns(None) as record:
|
||||
nlp.begin_training()
|
||||
assert not record.list
|
|
@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"]
|
|||
|
||||
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
|
||||
def test_should_match(en_tokenizer, url):
|
||||
token_match = en_tokenizer.token_match
|
||||
if token_match:
|
||||
assert token_match(url)
|
||||
assert en_tokenizer.token_match(url) is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
|
||||
def test_should_not_match(en_tokenizer, url):
|
||||
token_match = en_tokenizer.token_match
|
||||
if token_match:
|
||||
assert not token_match(url)
|
||||
assert en_tokenizer.token_match(url) is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("url", URLS_BASIC)
|
||||
|
|
|
@ -467,31 +467,6 @@ def expand_exc(excs, search, replace):
|
|||
return new_excs
|
||||
|
||||
|
||||
def get_lemma_tables(lookups):
|
||||
"""Load lemmatizer data from lookups table. Mostly used via
|
||||
Language.Defaults.create_lemmatizer, but available as helper so it can be
|
||||
reused in language classes that implement custom lemmatizers.
|
||||
|
||||
lookups (Lookups): The lookups table.
|
||||
RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
|
||||
tuple that can be used to initialize a Lemmatizer.
|
||||
"""
|
||||
lemma_rules = {}
|
||||
lemma_index = {}
|
||||
lemma_exc = {}
|
||||
lemma_lookup = None
|
||||
if lookups is not None:
|
||||
if "lemma_rules" in lookups:
|
||||
lemma_rules = lookups.get_table("lemma_rules")
|
||||
if "lemma_index" in lookups:
|
||||
lemma_index = lookups.get_table("lemma_index")
|
||||
if "lemma_exc" in lookups:
|
||||
lemma_exc = lookups.get_table("lemma_exc")
|
||||
if "lemma_lookup" in lookups:
|
||||
lemma_lookup = lookups.get_table("lemma_lookup")
|
||||
return (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
|
||||
|
||||
|
||||
def normalize_slice(length, start, stop, step=None):
|
||||
if not (step is None or step == 1):
|
||||
raise ValueError(Errors.E057)
|
||||
|
|
|
@ -50,10 +50,10 @@ cdef class Vocab:
|
|||
"""
|
||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
tag_map = tag_map if tag_map is not None else {}
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer({}, {}, {})
|
||||
if lookups in (None, True, False):
|
||||
lookups = Lookups()
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer(lookups)
|
||||
self.cfg = {'oov_prob': oov_prob}
|
||||
self.mem = Pool()
|
||||
self._by_orth = PreshMap()
|
||||
|
|
|
@ -10,23 +10,41 @@ lookup tables.
|
|||
|
||||
## Lemmatizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a `Lemmatizer`.
|
||||
Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
|
||||
when a `Language` subclass and its `Vocab` is initialized.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lemmatizer import Lemmatizer
|
||||
> lemmatizer = Lemmatizer()
|
||||
> from spacy.lookups import Lookups
|
||||
> lookups = Lookups()
|
||||
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
|
||||
> lemmatizer = Lemmatizer(lookups)
|
||||
> ```
|
||||
>
|
||||
> For examples of the data format, see the
|
||||
> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ---------------------------------------------------------- |
|
||||
| `index` | dict / `None` | Inventory of lemmas in the language. |
|
||||
| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
|
||||
| `rules` | dict / `None` | List of suffix rewrite rules. |
|
||||
| `lookup` | dict / `None` | Lookup table mapping string to their lemmas. |
|
||||
| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
|
||||
| **RETURNS** | `Lemmatizer` | The newly created object. |
|
||||
|
||||
<Infobox title="Deprecation note" variant="danger">
|
||||
|
||||
As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups)
|
||||
object containing tables for the different components. This makes it easier for
|
||||
spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows
|
||||
users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`.
|
||||
|
||||
```diff
|
||||
- lemmatizer = Lemmatizer(rules=lemma_rules)
|
||||
+ lemmatizer = Lemmatizer(lookups)
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Lemmatize a string.
|
||||
|
@ -35,8 +53,10 @@ Lemmatize a string.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.lemmatizer import Lemmatizer
|
||||
> rules = {"noun": [["s", ""]]}
|
||||
> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
|
||||
> from spacy.lookups import Lookups
|
||||
> lookups = Loookups()
|
||||
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
|
||||
> lemmatizer = Lemmatizer(lookups)
|
||||
> lemmas = lemmatizer("ducks", "NOUN")
|
||||
> assert lemmas == ["duck"]
|
||||
> ```
|
||||
|
@ -52,14 +72,13 @@ Lemmatize a string.
|
|||
|
||||
Look up a lemma in the lookup table, if available. If no lemma is found, the
|
||||
original string is returned. Languages can provide a
|
||||
[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
|
||||
the individual `Language` class.
|
||||
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lookup = {"going": "go"}
|
||||
> lemmatizer = Lemmatizer(lookup=lookup)
|
||||
> lookups = Lookups()
|
||||
> lookups.add_table("lemma_lookup", {"going": "go"})
|
||||
> assert lemmatizer.lookup("going") == "go"
|
||||
> ```
|
||||
|
||||
|
@ -92,8 +111,5 @@ lemmatization entirely.
|
|||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------------- | ------------- | ---------------------------------------------------------- |
|
||||
| `index` | dict / `None` | Inventory of lemmas in the language. |
|
||||
| `exc` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
|
||||
| `rules` | dict / `None` | List of suffix rewrite rules. |
|
||||
| `lookup_table` <Tag variant="new">2</Tag> | dict / `None` | The lemma lookup table, if available. |
|
||||
| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
|
||||
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
|
||||
|
|
Loading…
Reference in New Issue
Block a user