Refactor lemmatizer and data table integration (#4353)
* Move test
* Allow default in Lookups.get_table
* Start with blank tables in Lookups.from_bytes
* Refactor lemmatizer to hold instance of Lookups
* Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk)
* Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency
* Remove old and unsupported Lemmatizer.load classmethod
* Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need
* Update tests and docs
* Fix more tests
* Fix lemmatizer
* Upgrade pytest to try and fix weird CI errors
* Try pytest 4.6.5
Parent: 3297a19545
Commit: cf65a80f36
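As a quick orientation before the diff: the core of this change is that a `Lemmatizer` now wraps a single `Lookups` container instead of separate index/exception/rule dicts. A minimal sketch of the new usage, based on the updated tests and docs further down (the table contents here are only illustrative):

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# All lemmatization data now lives in one Lookups container.
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_lookup", {"going": "go"})

# The Lemmatizer only takes the Lookups object; the old
# Lemmatizer(index, exc, rules, lookup) signature is deprecated (error E173).
lemmatizer = Lemmatizer(lookups)
assert lemmatizer("ducks", "NOUN") == ["duck"]  # rule-based lemmatization
assert lemmatizer.lookup("going") == "go"       # lookup-table lemmatization
```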
@@ -13,7 +13,6 @@ install:
   - "pip install -e ."
 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "pip install pytest pytest-timeout"
   - "python -m pytest --tb=native spacy"
 branches:
   except:
@@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
 cython>=0.25
-pytest>=4.0.0,<4.1.0
+pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
@@ -96,3 +96,7 @@ exclude =
     __pycache__,
     _tokenizer_exceptions_list.py,
     spacy/__init__.py
+
+[tool:pytest]
+markers =
+    slow
@@ -487,6 +487,12 @@ class Errors(object):
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
             "callable or None, but got: {arg_type}")
+    E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
+            "Lemmatizer, initialize the class directly. See the docs for "
+            "details: https://spacy.io/api/lemmatizer")
+    E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
+            "Lookups containing the lemmatization tables. See the docs for "
+            "details: https://spacy.io/api/lemmatizer#init")


 @add_codes
@@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class GreekDefaults(Language.Defaults):
@@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return GreekLemmatizer(lookups)


 class Greek(Language):
@@ -1,10 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import NOUN, VERB, ADJ, PUNCT
+from ...lemmatizer import Lemmatizer


-class GreekLemmatizer(object):
+class GreekLemmatizer(Lemmatizer):
     """
     Greek language lemmatizer applies the default rule based lemmatization
     procedure with some modifications for better Greek language support.
@@ -15,64 +15,26 @@ class GreekLemmatizer(object):
     not applicable for Greek language.
     """

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
-
-    def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        else:
-            return list(set([string.lower()]))
-        lemmas = lemmatize(
-            string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
-        )
-        return lemmas
-
-    def lookup(self, string, orth=None):
-        key = orth if orth is not None else string
-        if key in self.lookup_table:
-            return self.lookup_table[key]
-        return string
-
-
-def lemmatize(string, index, exceptions, rules):
-    string = string.lower()
-    forms = []
-    if string in index:
-        forms.append(string)
-        return forms
-    forms.extend(exceptions.get(string, []))
-    oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[: len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
-    if not forms:
-        forms.extend(oov_forms)
-    if not forms:
-        forms.append(string)
-    return list(set(forms))
+    def lemmatize(self, string, index, exceptions, rules):
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms:
+            forms.append(string)
+        return list(set(forms))
@@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class FrenchDefaults(Language.Defaults):
@@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return FrenchLemmatizer(lookups)


 class French(Language):
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

+from ...lemmatizer import Lemmatizer
 from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
 from ...symbols import SCONJ, CCONJ
 from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


-class FrenchLemmatizer(object):
+class FrenchLemmatizer(Lemmatizer):
     """
     French language lemmatizer applies the default rule based lemmatization
     procedure with some modifications for better French language support.
@@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
     the lookup table.
     """

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
-
     def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if "lemma_rules" not in self.lookups:
+            return [lookup_table.get(string, string)]
         if univ_pos in (NOUN, "NOUN", "noun"):
             univ_pos = "noun"
         elif univ_pos in (VERB, "VERB", "verb"):
@@ -56,12 +48,14 @@
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return list(set([string.lower()]))
-        lemmas = lemmatize(
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lemmas = self.lemmatize(
             string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
-            self.lookup_table,
+            index_table.get(univ_pos, {}),
+            exc_table.get(univ_pos, {}),
+            rules_table.get(univ_pos, []),
         )
         return lemmas

@@ -115,33 +109,34 @@
         return self(string, "punct", morphology)

     def lookup(self, string, orth=None):
-        if orth is not None and orth in self.lookup_table:
-            return self.lookup_table[orth][0]
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if orth is not None and orth in lookup_table:
+            return lookup_table[orth][0]
         return string

-
-def lemmatize(string, index, exceptions, rules, lookup):
+    def lemmatize(self, string, index, exceptions, rules):
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         forms = []
         if string in index:
             forms.append(string)
             return forms
         forms.extend(exceptions.get(string, []))
         oov_forms = []
         if not forms:
             for old, new in rules:
                 if string.endswith(old):
                     form = string[: len(string) - len(old)] + new
                     if not form:
                         pass
                     elif form in index or not form.isalpha():
                         forms.append(form)
                     else:
                         oov_forms.append(form)
         if not forms:
             forms.extend(oov_forms)
-        if not forms and string in lookup.keys():
-            forms.append(lookup[string][0])
+        if not forms and string in lookup_table.keys():
+            forms.append(lookup_table[string][0])
         if not forms:
             forms.append(string)
         return list(set(forms))
@@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups, get_lemma_tables
+from ...util import update_exc, add_lookups


 class DutchDefaults(Language.Defaults):
@@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
-        return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        if lookups is None:
+            lookups = Lookups()
+        return DutchLemmatizer(lookups)


 class Dutch(Language):
@@ -1,10 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

+from ...lemmatizer import Lemmatizer
 from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV


-class DutchLemmatizer(object):
+class DutchLemmatizer(Lemmatizer):
     # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
     univ_pos_name_variants = {
         NOUN: "noun",
@@ -36,16 +37,6 @@ class DutchLemmatizer(object):
         "num": "num",
     }

-    @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
-
-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules or {}
-        self.lookup_table = lookup if lookup is not None else {}
-
     def __call__(self, string, univ_pos, morphology=None):
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
@@ -62,11 +53,13 @@ class DutchLemmatizer(object):
         # are not lemmatized. They are lowercased, however.
             return [string]
         # if string in self.lemma_index.get(univ_pos)
-        lemma_index = self.index.get(univ_pos, {})
+        index_table = self.lookups.get_table("lemma_index", {})
+        lemma_index = index_table.get(univ_pos, {})
         # string is already lemma
         if string in lemma_index:
             return [string]
-        exceptions = self.exc.get(univ_pos, {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        exceptions = exc_table.get(univ_pos, {})
         # string is irregular token contained in exceptions index.
         try:
             lemma = exceptions[string]
@@ -74,15 +67,14 @@ class DutchLemmatizer(object):
         except KeyError:
             pass
         # string corresponds to key in lookup table
-        lookup_table = self.lookup_table
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         looked_up_lemma = lookup_table.get(string)
         if looked_up_lemma and looked_up_lemma in lemma_index:
             return [looked_up_lemma]
-        forms, is_known = lemmatize(
-            string, lemma_index, exceptions, self.rules.get(univ_pos, [])
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        forms, is_known = self.lemmatize(
+            string, lemma_index, exceptions, rules_table.get(univ_pos, [])
         )

         # Back-off through remaining return value candidates.
         if forms:
             if is_known:
@@ -104,46 +96,25 @@ class DutchLemmatizer(object):
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
     def lookup(self, string, orth=None):
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         if orth is not None:
-            return self.lookup_table.get(orth, string)
+            return lookup_table.get(orth, string)
         else:
-            return self.lookup_table.get(string, string)
+            return lookup_table.get(string, string)

-    def noun(self, string, morphology=None):
-        return self(string, "noun", morphology)
-
-    def verb(self, string, morphology=None):
-        return self(string, "verb", morphology)
-
-    def adj(self, string, morphology=None):
-        return self(string, "adj", morphology)
-
-    def det(self, string, morphology=None):
-        return self(string, "det", morphology)
-
-    def pron(self, string, morphology=None):
-        return self(string, "pron", morphology)
-
-    def adp(self, string, morphology=None):
-        return self(string, "adp", morphology)
-
-    def punct(self, string, morphology=None):
-        return self(string, "punct", morphology)
-
-
-# Reimplemented to focus more on application of suffix rules and to return
-# as early as possible.
-def lemmatize(string, index, exceptions, rules):
-    # returns (forms, is_known: bool)
-    oov_forms = []
-    for old, new in rules:
-        if string.endswith(old):
-            form = string[: len(string) - len(old)] + new
-            if not form:
-                pass
-            elif form in index:
-                return [form], True  # True = Is known (is lemma)
-            else:
-                oov_forms.append(form)
-    return list(set(oov_forms)), False
+    # Reimplemented to focus more on application of suffix rules and to return
+    # as early as possible.
+    def lemmatize(self, string, index, exceptions, rules):
+        # returns (forms, is_known: bool)
+        oov_forms = []
+        for old, new in rules:
+            if string.endswith(old):
+                form = string[: len(string) - len(old)] + new
+                if not form:
+                    pass
+                elif form in index:
+                    return [form], True  # True = Is known (is lemma)
+                else:
+                    oov_forms.append(form)
+        return list(set(oov_forms)), False
@@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...util import update_exc, add_lookups
 from ...language import Language
+from ...lookups import Lookups
 from ...attrs import LANG, NORM


@@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
     tag_map = TAG_MAP

     @classmethod
-    def create_lemmatizer(cls, nlp=None, **kwargs):
-        return RussianLemmatizer()
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        if lookups is None:
+            lookups = Lookups()
+        return RussianLemmatizer(lookups)


 class Russian(Language):
@@ -9,8 +9,8 @@ from ...compat import unicode_
 class RussianLemmatizer(Lemmatizer):
     _morph = None

-    def __init__(self):
-        super(RussianLemmatizer, self).__init__()
+    def __init__(self, lookups=None):
+        super(RussianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
@@ -102,19 +102,6 @@
             return symbols_to_str[univ_pos]
         return None

-    def is_base_form(self, univ_pos, morphology=None):
-        # TODO
-        raise NotImplementedError
-
-    def det(self, string, morphology=None):
-        return self(string, "det", morphology)
-
-    def num(self, string, morphology=None):
-        return self(string, "num", morphology)
-
-    def pron(self, string, morphology=None):
-        return self(string, "pron", morphology)
-
     def lookup(self, string, orth=None):
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
@@ -41,8 +41,7 @@ class BaseDefaults(object):
     def create_lemmatizer(cls, nlp=None, lookups=None):
         if lookups is None:
             lookups = cls.create_lookups(nlp=nlp)
-        rules, index, exc, lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(index, exc, rules, lookup)
+        return Lemmatizer(lookups=lookups)

     @classmethod
     def create_lookups(cls, nlp=None):
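The effect of this simplified factory is that the default lemmatizer and the vocab now share one `Lookups` instance, so tables added later (or loaded from disk) are picked up immediately. A hedged sketch of that behavior, essentially paraphrasing the regression test added further down in this commit:

```python
from spacy.language import Language
from spacy.tokens import Doc

nlp = Language()
# The default lemmatizer reads its tables from nlp.vocab.lookups at call
# time, so a table added after construction is reflected right away.
table = nlp.vocab.lookups.add_table("lemma_lookup")
table["going"] = "go"
assert Doc(nlp.vocab, words=["going"])[0].lemma_ == "go"
```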
@@ -1,8 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

 from collections import OrderedDict

 from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
+from .errors import Errors
+from .lookups import Lookups


 class Lemmatizer(object):
@@ -14,18 +17,32 @@ class Lemmatizer(object):
     """

     @classmethod
-    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
-        return cls(index, exc, rules, lookup)
+    def load(cls, *args, **kwargs):
+        raise NotImplementedError(Errors.E172)

-    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
-        self.lookup_table = lookup if lookup is not None else {}
+    def __init__(self, lookups, *args, **kwargs):
+        """Initialize a Lemmatizer.
+
+        lookups (Lookups): The lookups object containing the (optional) tables
+            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
+        RETURNS (Lemmatizer): The newly constructed object.
+        """
+        if args or kwargs or not isinstance(lookups, Lookups):
+            raise ValueError(Errors.E173)
+        self.lookups = lookups

     def __call__(self, string, univ_pos, morphology=None):
-        if not self.rules:
-            return [self.lookup_table.get(string, string)]
+        """Lemmatize a string.
+
+        string (unicode): The string to lemmatize, e.g. the token text.
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
+        RETURNS (list): The available lemmas for the string.
+        """
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if "lemma_rules" not in self.lookups:
+            return [lookup_table.get(string, string)]
         if univ_pos in (NOUN, "NOUN", "noun"):
             univ_pos = "noun"
         elif univ_pos in (VERB, "VERB", "verb"):
@@ -41,11 +58,14 @@ class Lemmatizer(object):
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return [string.lower()]
-        lemmas = lemmatize(
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lemmas = self.lemmatize(
             string,
-            self.index.get(univ_pos, {}),
-            self.exc.get(univ_pos, {}),
-            self.rules.get(univ_pos, []),
+            index_table.get(univ_pos, {}),
+            exc_table.get(univ_pos, {}),
+            rules_table.get(univ_pos, []),
         )
         return lemmas

@@ -53,6 +73,10 @@ class Lemmatizer(object):
         """
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
+
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
         """
         if morphology is None:
             morphology = {}
@@ -90,6 +114,18 @@ class Lemmatizer(object):
     def adj(self, string, morphology=None):
         return self(string, "adj", morphology)

+    def det(self, string, morphology=None):
+        return self(string, "det", morphology)
+
+    def pron(self, string, morphology=None):
+        return self(string, "pron", morphology)
+
+    def adp(self, string, morphology=None):
+        return self(string, "adp", morphology)
+
+    def num(self, string, morphology=None):
+        return self(string, "num", morphology)
+
     def punct(self, string, morphology=None):
         return self(string, "punct", morphology)
@@ -103,37 +139,37 @@ class Lemmatizer(object):
         RETURNS (unicode): The lemma if the string was found, otherwise the
             original string.
         """
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
         key = orth if orth is not None else string
-        if key in self.lookup_table:
-            return self.lookup_table[key]
+        if key in lookup_table:
+            return lookup_table[key]
         return string

-
-def lemmatize(string, index, exceptions, rules):
+    def lemmatize(self, string, index, exceptions, rules):
         orig = string
         string = string.lower()
         forms = []
         oov_forms = []
         for old, new in rules:
             if string.endswith(old):
                 form = string[: len(string) - len(old)] + new
                 if not form:
                     pass
                 elif form in index or not form.isalpha():
                     forms.append(form)
                 else:
                     oov_forms.append(form)
         # Remove duplicates but preserve the ordering of applied "rules"
         forms = list(OrderedDict.fromkeys(forms))
         # Put exceptions at the front of the list, so they get priority.
         # This is a dodgy heuristic -- but it's the best we can do until we get
         # frequencies on this. We can at least prune out problematic exceptions,
         # if they shadow more frequent analyses.
         for form in exceptions.get(string, []):
             if form not in forms:
                 forms.insert(0, form)
         if not forms:
             forms.extend(oov_forms)
         if not forms:
             forms.append(orig)
         return forms
@@ -10,6 +10,9 @@ from .util import SimpleFrozenDict, ensure_path
 from .strings import get_string_id


+UNSET = object()
+
+
 class Lookups(object):
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,
@@ -60,16 +63,20 @@ class Lookups(object):
         self._tables[name] = table
         return table

-    def get_table(self, name):
-        """Get a table. Raises an error if the table doesn't exist.
+    def get_table(self, name, default=UNSET):
+        """Get a table. Raises an error if the table doesn't exist and no
+        default value is provided.

         name (unicode): Name of the table.
+        default: Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.

         DOCS: https://spacy.io/api/lookups#get_table
         """
         if name not in self._tables:
-            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+            if default == UNSET:
+                raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+            return default
         return self._tables[name]

     def remove_table(self, name):
@@ -111,6 +118,7 @@ class Lookups(object):

         DOCS: https://spacy.io/api/lookups#from_bytes
         """
+        self._tables = OrderedDict()
         for key, value in srsly.msgpack_loads(bytes_data).items():
             self._tables[key] = Table(key)
             self._tables[key].update(value)
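The new `default` parameter is what lets the lemmatizer ask for tables that may not exist yet. A short sketch of the difference (assuming the `Lookups` API shown above):

```python
from spacy.lookups import Lookups

lookups = Lookups()
# Without a default, a missing table raises a KeyError (error E159).
try:
    lookups.get_table("lemma_lookup")
except KeyError:
    pass
# With a default, the caller gets a fallback value instead of an error.
assert lookups.get_table("lemma_lookup", {}) == {}
```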
@@ -5,13 +5,14 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Table
+from spacy.lookups import Lookups


 @pytest.fixture
 def lemmatizer():
-    lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
-    return Lemmatizer(lookup=lookup)
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
+    return Lemmatizer(lookups)


 @pytest.fixture
@@ -5,11 +5,13 @@ import pytest
 from spacy.morphology import Morphology
 from spacy.strings import StringStore, get_string_id
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups


 @pytest.fixture
 def morphology():
-    return Morphology(StringStore(), {}, Lemmatizer())
+    lemmatizer = Lemmatizer(Lookups())
+    return Morphology(StringStore(), {}, lemmatizer)


 def test_init(morphology):
@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.lang.en import English
-from spacy.lookups import Lookups
-
-
-def test_tagger_warns_no_lemma_lookups():
-    nlp = English()
-    nlp.vocab.lookups = Lookups()
-    assert not len(nlp.vocab.lookups)
-    tagger = nlp.create_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
-    nlp.add_pipe(tagger)
-    with pytest.warns(UserWarning):
-        nlp.begin_training()
-    nlp.vocab.lookups.add_table("lemma_lookup")
-    with pytest.warns(None) as record:
-        nlp.begin_training()
-    assert not record.list
@@ -9,6 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
 from spacy.vocab import Vocab
 from spacy.language import Language
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span

 from ..util import get_doc, make_tempdir
@@ -173,8 +174,11 @@ def test_issue595():
     """Test lemmatization of base forms"""
     words = ["Do", "n't", "feed", "the", "dog"]
     tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
-    rules = {"verb": [["ed", "e"]]}
-    lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
+    lookups = Lookups()
+    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
+    lookups.add_table("lemma_index", {"verb": {}})
+    lookups.add_table("lemma_exc", {"verb": {}})
+    lemmatizer = Lemmatizer(lookups)
     vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
     doc = Doc(vocab, words=words)
     doc[2].tag_ = "VB"
@@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Lookups
 from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part

@@ -91,10 +92,11 @@ def test_issue1375():

 def test_issue1387():
     tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
-    index = {"verb": ("cope", "cop")}
-    exc = {"verb": {"coping": ("cope",)}}
-    rules = {"verb": [["ing", ""]]}
-    lemmatizer = Lemmatizer(index, exc, rules)
+    lookups = Lookups()
+    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+    lemmatizer = Lemmatizer(lookups)
     vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
     doc = Doc(vocab, words=["coping"])
     doc[0].tag_ = "VBG"
@@ -126,7 +126,8 @@ def test_issue1727():
     vectors = Vectors(data=data, keys=["I", "am", "Matt"])
     tagger = Tagger(Vocab())
     tagger.add_label("PRP")
-    tagger.begin_training()
+    with pytest.warns(UserWarning):
+        tagger.begin_training()
     assert tagger.cfg.get("pretrained_dims", 0) == 0
     tagger.vocab.vectors = vectors
     with make_tempdir() as path:
@@ -22,7 +22,8 @@ def test_issue2564():
     """Test the tagger sets is_tagged correctly when used via Language.pipe."""
     nlp = Language()
     tagger = nlp.create_pipe("tagger")
-    tagger.begin_training()  # initialise weights
+    with pytest.warns(UserWarning):
+        tagger.begin_training()  # initialise weights
     nlp.add_pipe(tagger)
     doc = nlp("hello world")
     assert doc.is_tagged
spacy/tests/test_lemmatizer.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.tokens import Doc
+from spacy.language import Language
+from spacy.lookups import Lookups
+
+
+def test_lemmatizer_reflects_lookups_changes():
+    """Test for an issue that'd cause lookups available in a model loaded from
+    disk to not be reflected in the lemmatizer."""
+    nlp = Language()
+    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
+    table = nlp.vocab.lookups.add_table("lemma_lookup")
+    table["foo"] = "bar"
+    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+    table = nlp.vocab.lookups.get_table("lemma_lookup")
+    table["hello"] = "world"
+    # The update to the table should be reflected in the lemmatizer
+    assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
+    new_nlp = Language()
+    table = new_nlp.vocab.lookups.add_table("lemma_lookup")
+    table["hello"] = "hi"
+    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
+    nlp_bytes = nlp.to_bytes()
+    new_nlp.from_bytes(nlp_bytes)
+    # Make sure we have the previously saved lookup table
+    assert len(new_nlp.vocab.lookups) == 1
+    assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
+    assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
+    assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
+    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
+
+
+def test_tagger_warns_no_lemma_lookups():
+    nlp = Language()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    tagger = nlp.create_pipe("tagger")
+    with pytest.warns(UserWarning):
+        tagger.begin_training()
+    nlp.add_pipe(tagger)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lemma_lookup")
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+    assert not record.list
@@ -111,16 +111,12 @@ SUFFIXES = ['"', ":", ">"]

 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    token_match = en_tokenizer.token_match
-    if token_match:
-        assert token_match(url)
+    assert en_tokenizer.token_match(url) is not None


 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    token_match = en_tokenizer.token_match
-    if token_match:
-        assert not token_match(url)
+    assert en_tokenizer.token_match(url) is None


 @pytest.mark.parametrize("url", URLS_BASIC)
@@ -467,31 +467,6 @@ def expand_exc(excs, search, replace):
     return new_excs


-def get_lemma_tables(lookups):
-    """Load lemmatizer data from lookups table. Mostly used via
-    Language.Defaults.create_lemmatizer, but available as helper so it can be
-    reused in language classes that implement custom lemmatizers.
-
-    lookups (Lookups): The lookups table.
-    RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
-        tuple that can be used to initialize a Lemmatizer.
-    """
-    lemma_rules = {}
-    lemma_index = {}
-    lemma_exc = {}
-    lemma_lookup = None
-    if lookups is not None:
-        if "lemma_rules" in lookups:
-            lemma_rules = lookups.get_table("lemma_rules")
-        if "lemma_index" in lookups:
-            lemma_index = lookups.get_table("lemma_index")
-        if "lemma_exc" in lookups:
-            lemma_exc = lookups.get_table("lemma_exc")
-        if "lemma_lookup" in lookups:
-            lemma_lookup = lookups.get_table("lemma_lookup")
-    return (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
-
-
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError(Errors.E057)
@@ -50,10 +50,10 @@ cdef class Vocab:
         """
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        tag_map = tag_map if tag_map is not None else {}
-        if lemmatizer in (None, True, False):
-            lemmatizer = Lemmatizer({}, {}, {})
         if lookups in (None, True, False):
             lookups = Lookups()
+        if lemmatizer in (None, True, False):
+            lemmatizer = Lemmatizer(lookups)
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
@@ -10,22 +10,40 @@ lookup tables.

 ## Lemmatizer.\_\_init\_\_ {#init tag="method"}

-Create a `Lemmatizer`.
+Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
+when a `Language` subclass and its `Vocab` is initialized.

 > #### Example
 >
 > ```python
 > from spacy.lemmatizer import Lemmatizer
-> lemmatizer = Lemmatizer()
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
 > ```
+>
+> For examples of the data format, see the
+> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.

-| Name         | Type          | Description                                                |
-| ------------ | ------------- | ---------------------------------------------------------- |
-| `index`      | dict / `None` | Inventory of lemmas in the language.                       |
-| `exceptions` | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules`      | dict / `None` | List of suffix rewrite rules.                              |
-| `lookup`     | dict / `None` | Lookup table mapping string to their lemmas.               |
-| **RETURNS**  | `Lemmatizer`  | The newly created object.                                  |
+| Name                                   | Type                      | Description                                                                                                                |
+| -------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------|
+| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
+| **RETURNS**                            | `Lemmatizer`              | The newly created object.                                                                                                  |
+
+<Infobox title="Deprecation note" variant="danger">
+
+As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups)
+object containing tables for the different components. This makes it easier for
+spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows
+users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`.
+
+```diff
+- lemmatizer = Lemmatizer(rules=lemma_rules)
++ lemmatizer = Lemmatizer(lookups)
+```
+
+</Infobox>

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

@@ -35,8 +53,10 @@ Lemmatize a string.
 >
 > ```python
 > from spacy.lemmatizer import Lemmatizer
-> rules = {"noun": [["s", ""]]}
-> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
+> from spacy.lookups import Lookups
+> lookups = Loookups()
+> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
+> lemmatizer = Lemmatizer(lookups)
 > lemmas = lemmatizer("ducks", "NOUN")
 > assert lemmas == ["duck"]
 > ```

@@ -52,14 +72,13 @@ Lemmatize a string.

 Look up a lemma in the lookup table, if available. If no lemma is found, the
 original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
-the individual `Language` class.
+[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.

 > #### Example
 >
 > ```python
-> lookup = {"going": "go"}
-> lemmatizer = Lemmatizer(lookup=lookup)
+> lookups = Lookups()
+> lookups.add_table("lemma_lookup", {"going": "go"})
 > assert lemmatizer.lookup("going") == "go"
 > ```

@@ -91,9 +110,6 @@ lemmatization entirely.

 ## Attributes {#attributes}

 | Name                                      | Type          | Description                                                |
-| ----------------------------------------- | ------------- | ---------------------------------------------------------- |
-| `index`                                   | dict / `None` | Inventory of lemmas in the language.                       |
-| `exc`                                     | dict / `None` | Mapping of string forms to lemmas that bypass the `rules`. |
-| `rules`                                   | dict / `None` | List of suffix rewrite rules.                              |
-| `lookup_table` <Tag variant="new">2</Tag> | dict / `None` | The lemma lookup table, if available.                      |
+| -------------------------------------- | ------------------------- | ---------------------------------------------------------------- |
+| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available.  |