diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 4304b3c6a..d52f3dfd8 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,6 +18,41 @@ def _return_en(_): return "en" +def en_is_base_form(univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + + class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -26,6 +61,7 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES + is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, diff --git a/spacy/language.py b/spacy/language.py index 2058def8a..faa0447a4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -46,7 +46,7 @@ class BaseDefaults(object): def create_lemmatizer(cls, nlp=None, lookups=None): if lookups is None: lookups = cls.create_lookups(nlp=nlp) - return Lemmatizer(lookups=lookups) + return Lemmatizer(lookups=lookups, is_base_form=cls.is_base_form) @classmethod def create_lookups(cls, nlp=None): @@ -120,6 +120,7 @@ class BaseDefaults(object): tokenizer_exceptions = {} stop_words = set() morph_rules = {} + is_base_form = None lex_attr_getters = LEX_ATTRS syntax_iterators = {} resources = {} diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1f0f0da3f..f72eae128 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, **kwargs): + def __init__(self, lookups, *args, is_base_form=None, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables @@ -31,6 +31,7 @@ class Lemmatizer(object): if args or kwargs or not isinstance(lookups, Lookups): raise ValueError(Errors.E173) self.lookups = lookups + self.is_base_form = is_base_form def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. @@ -51,7 +52,7 @@ class Lemmatizer(object): if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. - if self.is_base_form(univ_pos, morphology): + if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology): return [string.lower()] index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) @@ -69,40 +70,6 @@ class Lemmatizer(object): ) return lemmas - def is_base_form(self, univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6d88d68c2..38a99371e 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -11,6 +11,7 @@ from spacy.language import Language from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.tokens import Doc, Span +from spacy.lang.en import EnglishDefaults from ..util import get_doc, make_tempdir @@ -172,7 +173,7 @@ def test_issue595(): lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) lookups.add_table("lemma_exc", {"verb": {}}) - lemmatizer = Lemmatizer(lookups) + lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=words) doc[2].tag_ = "VB" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index fce3772c4..e7736b042 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -5,6 +5,7 @@ import pytest from spacy.tokens import Doc from spacy.language import Language from spacy.lookups import Lookups +from spacy.lemmatizer import Lemmatizer def test_lemmatizer_reflects_lookups_changes(): @@ -47,3 +48,14 @@ def test_tagger_warns_no_lookups(): with pytest.warns(None) as record: nlp.begin_training() assert not record.list + + +def test_lemmatizer_without_is_base_form_implementation(): + # Norwegian example from #5658 + lookups = Lookups() + lookups.add_table("lemma_rules", {"noun": []}) + lookups.add_table("lemma_index", {"noun": {}}) + lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) + + lemmatizer = Lemmatizer(lookups, is_base_form=None) + assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"]