Move lemmatizer is_base_form to language settings (#5663)

Move `Lemmatizer.is_base_form` to the language settings so that each
language can provide a language-specific method as
`LanguageDefaults.is_base_form`.

The existing English-specific `Lemmatizer.is_base_form` is moved to
`EnglishDefaults`.
Adriane Boyd, 2020-06-29 14:16:57 +02:00 (committed by GitHub)
parent c4d0209472
commit 167df42cb6
5 changed files with 55 additions and 38 deletions

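To illustrate the new hook, here is a minimal sketch against the spaCy 2.x API this commit targets. `CustomDefaults`, `xx_is_base_form`, and the heuristic inside it are hypothetical names for illustration; the wiring through `create_lemmatizer` is what the diffs below add.

```python
from spacy.language import Language


def xx_is_base_form(univ_pos, morphology=None):
    # Hypothetical language-specific heuristic: treat singular nouns as
    # already uninflected and skip lemmatization for them.
    morphology = morphology or {}
    return univ_pos == "noun" and morphology.get("Number") == "sing"


class CustomDefaults(Language.Defaults):
    # BaseDefaults.create_lemmatizer() picks this attribute up and passes
    # it along as Lemmatizer(lookups=..., is_base_form=cls.is_base_form).
    is_base_form = xx_is_base_form
```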
spacy/lang/en/__init__.py

@@ -18,6 +18,41 @@ def _return_en(_):
     return "en"
 
 
+def en_is_base_form(univ_pos, morphology=None):
+    """
+    Check whether we're dealing with an uninflected paradigm, so we can
+    avoid lemmatization entirely.
+
+    univ_pos (unicode / int): The token's universal part-of-speech tag.
+    morphology (dict): The token's morphological features following the
+        Universal Dependencies scheme.
+    """
+    if morphology is None:
+        morphology = {}
+    if univ_pos == "noun" and morphology.get("Number") == "sing":
+        return True
+    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
+        return True
+    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+    # morphology
+    elif univ_pos == "verb" and (
+        morphology.get("VerbForm") == "fin"
+        and morphology.get("Tense") == "pres"
+        and morphology.get("Number") is None
+    ):
+        return True
+    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
+        return True
+    elif morphology.get("VerbForm") == "inf":
+        return True
+    elif morphology.get("VerbForm") == "none":
+        return True
+    elif morphology.get("Degree") == "pos":
+        return True
+    else:
+        return False
+
+
 class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
@@ -26,6 +61,7 @@ class EnglishDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     morph_rules = MORPH_RULES
+    is_base_form = en_is_base_form
     syntax_iterators = SYNTAX_ITERATORS
     single_orth_variants = [
         {"tags": ["NFP"], "variants": ["", "..."]},

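A quick sanity check of the moved heuristic (assuming a build with this commit applied; since `en_is_base_form` is a plain function assigned as a class attribute, `EnglishDefaults.is_base_form` can be called directly):

```python
from spacy.lang.en import EnglishDefaults

# Singular noun: uninflected, so lemmatization can be skipped.
assert EnglishDefaults.is_base_form("noun", {"Number": "sing"}) is True
# Finite present-tense verb with no Number feature (the VBP case).
assert EnglishDefaults.is_base_form("verb", {"VerbForm": "fin", "Tense": "pres"}) is True
# Past-tense verb: not a base form, fall through to the lookup tables.
assert EnglishDefaults.is_base_form("verb", {"VerbForm": "fin", "Tense": "past"}) is False
```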
spacy/language.py

@@ -46,7 +46,7 @@ class BaseDefaults(object):
     def create_lemmatizer(cls, nlp=None, lookups=None):
         if lookups is None:
             lookups = cls.create_lookups(nlp=nlp)
-        return Lemmatizer(lookups=lookups)
+        return Lemmatizer(lookups=lookups, is_base_form=cls.is_base_form)
 
     @classmethod
     def create_lookups(cls, nlp=None):
@@ -120,6 +120,7 @@ class BaseDefaults(object):
     tokenizer_exceptions = {}
     stop_words = set()
     morph_rules = {}
+    is_base_form = None
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = {}
     resources = {}

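Since `BaseDefaults.is_base_form` now defaults to `None`, only languages that override it (currently English) get the shortcut. A sketch of the effect (assumes lookup tables can be created locally, e.g. via the spacy-lookups-data package; `nb` is spaCy's Norwegian module):

```python
from spacy.lang.en import EnglishDefaults
from spacy.lang.nb import NorwegianDefaults

en_lemmatizer = EnglishDefaults.create_lemmatizer()
nb_lemmatizer = NorwegianDefaults.create_lemmatizer()

# English wires in its own check; Norwegian gets no base-form check at all.
assert en_lemmatizer.is_base_form is EnglishDefaults.is_base_form
assert nb_lemmatizer.is_base_form is None
```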
spacy/lemmatizer.py

@@ -21,7 +21,7 @@ class Lemmatizer(object):
     def load(cls, *args, **kwargs):
         raise NotImplementedError(Errors.E172)
 
-    def __init__(self, lookups, *args, **kwargs):
+    def __init__(self, lookups, *args, is_base_form=None, **kwargs):
         """Initialize a Lemmatizer.
 
         lookups (Lookups): The lookups object containing the (optional) tables
@@ -31,6 +31,7 @@ class Lemmatizer(object):
         if args or kwargs or not isinstance(lookups, Lookups):
             raise ValueError(Errors.E173)
         self.lookups = lookups
+        self.is_base_form = is_base_form
 
     def __call__(self, string, univ_pos, morphology=None):
         """Lemmatize a string.
@@ -51,7 +52,7 @@ class Lemmatizer(object):
         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
         # See Issue #435 for example of where this logic is required.
-        if self.is_base_form(univ_pos, morphology):
+        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
             return [string.lower()]
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
@@ -69,40 +70,6 @@ class Lemmatizer(object):
         )
         return lemmas
 
-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
     def noun(self, string, morphology=None):
         return self(string, "noun", morphology)

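The `callable(self.is_base_form)` guard above is what lets a lemmatizer run with no base-form check at all. A sketch of the difference it makes; the expected outputs assume spaCy 2.x rule behavior, where applying the ed→e rule to "walked" against an empty index yields "walke":

```python
from spacy.lang.en import EnglishDefaults
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
lookups.add_table("lemma_index", {"verb": {}})
lookups.add_table("lemma_exc", {"verb": {}})

# Without is_base_form, the guard is False and the rules always apply.
plain = Lemmatizer(lookups)
assert plain("walked", "verb", {"VerbForm": "fin", "Tense": "pres"}) == ["walke"]

# With the English check, VBP-style morphology short-circuits to the
# lowercased surface form before the tables are consulted.
english = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form)
assert english("walked", "verb", {"VerbForm": "fin", "Tense": "pres"}) == ["walked"]
```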
spacy/tests/regression/test_issue1-1000.py

@@ -11,6 +11,7 @@ from spacy.language import Language
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span
+from spacy.lang.en import EnglishDefaults
 
 from ..util import get_doc, make_tempdir
@@ -172,7 +173,7 @@ def test_issue595():
     lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
     lookups.add_table("lemma_index", {"verb": {}})
     lookups.add_table("lemma_exc", {"verb": {}})
-    lemmatizer = Lemmatizer(lookups)
+    lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form)
     vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
     doc = Doc(vocab, words=words)
     doc[2].tag_ = "VB"

spacy/tests/test_lemmatizer.py

@@ -5,6 +5,7 @@ import pytest
 from spacy.tokens import Doc
 from spacy.language import Language
 from spacy.lookups import Lookups
+from spacy.lemmatizer import Lemmatizer
 
 
 def test_lemmatizer_reflects_lookups_changes():
@@ -47,3 +48,14 @@ def test_tagger_warns_no_lookups():
     with pytest.warns(None) as record:
         nlp.begin_training()
     assert not record.list
+
+
+def test_lemmatizer_without_is_base_form_implementation():
+    # Norwegian example from #5658
+    lookups = Lookups()
+    lookups.add_table("lemma_rules", {"noun": []})
+    lookups.add_table("lemma_index", {"noun": {}})
+    lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
+    lemmatizer = Lemmatizer(lookups, is_base_form=None)
+    assert lemmatizer("Formuesskatten", "noun", {"Definite": "def", "Gender": "masc", "Number": "sing"}) == ["formuesskatt"]
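For contrast, a sketch of the pre-fix failure mode from #5658: with the English check wired in, the same Norwegian call short-circuits on noun plus `Number=sing` and returns the inflected surface form without ever consulting the exception table.

```python
from spacy.lang.en import EnglishDefaults
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": []})
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})

# The English heuristic wrongly treats the Norwegian singular noun as a
# base form, so the exception table is never reached.
lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form)
assert lemmatizer("Formuesskatten", "noun", {"Definite": "def", "Gender": "masc", "Number": "sing"}) == ["formuesskatten"]
```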