mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Move lemmatizer is_base_form to language settings (#5663)
Move `Lemmatizer.is_base_form` to the language settings so that each language can provide a language-specific method as `LanguageDefaults.is_base_form`. The existing English-specific `Lemmatizer.is_base_form` is moved to `EnglishDefaults`.
This commit is contained in:
parent
c4d0209472
commit
167df42cb6
|
@ -18,6 +18,41 @@ def _return_en(_):
|
||||||
return "en"
|
return "en"
|
||||||
|
|
||||||
|
|
||||||
|
def en_is_base_form(univ_pos, morphology=None):
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos (unicode / int): The token's universal part-of-speech tag.
        Only the lowercase strings "noun" and "verb" receive POS-specific
        handling; any other value is judged on its morphology alone.
    morphology (dict): The token's morphological features following the
        Universal Dependencies scheme. None is treated as an empty dict.
    RETURNS (bool): True if the features describe a base form, else False.
    """
    if morphology is None:
        morphology = {}
    verb_form = morphology.get("VerbForm")
    if univ_pos == "noun" and morphology.get("Number") == "sing":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    if (
        univ_pos == "verb"
        and verb_form == "fin"
        and morphology.get("Tense") == "pres"
        and morphology.get("Number") is None
    ):
        return True
    # The remaining checks do not depend on the POS tag: a VerbForm of "inf"
    # or "none", or a positive Degree, marks a base form for any tag. The
    # former verb-only and adj-only variants of these checks were redundant.
    if verb_form in ("inf", "none"):
        return True
    return morphology.get("Degree") == "pos"
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
@ -26,6 +61,7 @@ class EnglishDefaults(Language.Defaults):
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
|
is_base_form = en_is_base_form
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
single_orth_variants = [
|
single_orth_variants = [
|
||||||
{"tags": ["NFP"], "variants": ["…", "..."]},
|
{"tags": ["NFP"], "variants": ["…", "..."]},
|
||||||
|
|
|
@ -46,7 +46,7 @@ class BaseDefaults(object):
|
||||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||||
if lookups is None:
|
if lookups is None:
|
||||||
lookups = cls.create_lookups(nlp=nlp)
|
lookups = cls.create_lookups(nlp=nlp)
|
||||||
return Lemmatizer(lookups=lookups)
|
return Lemmatizer(lookups=lookups, is_base_form=cls.is_base_form)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_lookups(cls, nlp=None):
|
def create_lookups(cls, nlp=None):
|
||||||
|
@ -120,6 +120,7 @@ class BaseDefaults(object):
|
||||||
tokenizer_exceptions = {}
|
tokenizer_exceptions = {}
|
||||||
stop_words = set()
|
stop_words = set()
|
||||||
morph_rules = {}
|
morph_rules = {}
|
||||||
|
is_base_form = None
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
syntax_iterators = {}
|
syntax_iterators = {}
|
||||||
resources = {}
|
resources = {}
|
||||||
|
|
|
@ -21,7 +21,7 @@ class Lemmatizer(object):
|
||||||
def load(cls, *args, **kwargs):
|
def load(cls, *args, **kwargs):
|
||||||
raise NotImplementedError(Errors.E172)
|
raise NotImplementedError(Errors.E172)
|
||||||
|
|
||||||
def __init__(self, lookups, *args, **kwargs):
|
def __init__(self, lookups, *args, is_base_form=None, **kwargs):
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
|
||||||
lookups (Lookups): The lookups object containing the (optional) tables
|
lookups (Lookups): The lookups object containing the (optional) tables
|
||||||
|
@ -31,6 +31,7 @@ class Lemmatizer(object):
|
||||||
if args or kwargs or not isinstance(lookups, Lookups):
|
if args or kwargs or not isinstance(lookups, Lookups):
|
||||||
raise ValueError(Errors.E173)
|
raise ValueError(Errors.E173)
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
|
self.is_base_form = is_base_form
|
||||||
|
|
||||||
def __call__(self, string, univ_pos, morphology=None):
|
def __call__(self, string, univ_pos, morphology=None):
|
||||||
"""Lemmatize a string.
|
"""Lemmatize a string.
|
||||||
|
@ -51,7 +52,7 @@ class Lemmatizer(object):
|
||||||
if univ_pos in ("", "eol", "space"):
|
if univ_pos in ("", "eol", "space"):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
index_table = self.lookups.get_table("lemma_index", {})
|
index_table = self.lookups.get_table("lemma_index", {})
|
||||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||||
|
@ -69,40 +70,6 @@ class Lemmatizer(object):
|
||||||
)
|
)
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def is_base_form(self, univ_pos, morphology=None):
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos (unicode / int): The token's universal part-of-speech tag.
        Only the lowercase strings "noun" and "verb" receive POS-specific
        handling; any other value is judged on its morphology alone.
    morphology (dict): The token's morphological features following the
        Universal Dependencies scheme. None is treated as an empty dict.
    RETURNS (bool): True if the features describe a base form, else False.
    """
    if morphology is None:
        morphology = {}
    verb_form = morphology.get("VerbForm")
    if univ_pos == "noun" and morphology.get("Number") == "sing":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    if (
        univ_pos == "verb"
        and verb_form == "fin"
        and morphology.get("Tense") == "pres"
        and morphology.get("Number") is None
    ):
        return True
    # POS-independent checks: these already cover the verb+"inf" and
    # adj+"pos" cases, so no separate POS-qualified branches are needed.
    if verb_form in ("inf", "none"):
        return True
    return morphology.get("Degree") == "pos"
|
|
||||||
|
|
||||||
def noun(self, string, morphology=None):
|
def noun(self, string, morphology=None):
|
||||||
return self(string, "noun", morphology)
|
return self(string, "noun", morphology)
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ from spacy.language import Language
|
||||||
from spacy.lemmatizer import Lemmatizer
|
from spacy.lemmatizer import Lemmatizer
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
|
from spacy.lang.en import EnglishDefaults
|
||||||
|
|
||||||
from ..util import get_doc, make_tempdir
|
from ..util import get_doc, make_tempdir
|
||||||
|
|
||||||
|
@ -172,7 +173,7 @@ def test_issue595():
|
||||||
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
|
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
|
||||||
lookups.add_table("lemma_index", {"verb": {}})
|
lookups.add_table("lemma_index", {"verb": {}})
|
||||||
lookups.add_table("lemma_exc", {"verb": {}})
|
lookups.add_table("lemma_exc", {"verb": {}})
|
||||||
lemmatizer = Lemmatizer(lookups)
|
lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form)
|
||||||
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
|
||||||
doc = Doc(vocab, words=words)
|
doc = Doc(vocab, words=words)
|
||||||
doc[2].tag_ = "VB"
|
doc[2].tag_ = "VB"
|
||||||
|
|
|
@ -5,6 +5,7 @@ import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
|
from spacy.lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
def test_lemmatizer_reflects_lookups_changes():
|
def test_lemmatizer_reflects_lookups_changes():
|
||||||
|
@ -47,3 +48,14 @@ def test_tagger_warns_no_lookups():
|
||||||
with pytest.warns(None) as record:
|
with pytest.warns(None) as record:
|
||||||
nlp.begin_training()
|
nlp.begin_training()
|
||||||
assert not record.list
|
assert not record.list
|
||||||
|
|
||||||
|
|
||||||
|
def test_lemmatizer_without_is_base_form_implementation():
    """Call a Lemmatizer built with is_base_form=None and expect the lemma
    listed for the word in the lemma_exc table."""
    # Norwegian example from #5658
    tables = Lookups()
    for table_name, table_data in (
        ("lemma_rules", {"noun": []}),
        ("lemma_index", {"noun": {}}),
        ("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}),
    ):
        tables.add_table(table_name, table_data)

    morph = {"Definite": "def", "Gender": "masc", "Number": "sing"}
    lemmatizer = Lemmatizer(tables, is_base_form=None)
    lemmas = lemmatizer("Formuesskatten", "noun", morph)
    assert lemmas == ["formuesskatt"]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user