Fix lemmatizer is_base_form for python2.7 (#5734)

* Fix lemmatizer init args for python2.7

* Move English is_base_form to a class method

* Skip test pickling PhraseMatcher for python2
This commit is contained in:
Adriane Boyd 2020-07-09 22:11:24 +02:00 committed by GitHub
parent 923affd091
commit 0a62098c5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 37 deletions

View File

@ -18,41 +18,6 @@ def _return_en(_):
return "en"
def en_is_base_form(univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
morph_rules = MORPH_RULES
is_base_form = en_is_base_form
syntax_iterators = SYNTAX_ITERATORS
single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]},
@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults):
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
]
@classmethod
def is_base_form(cls, univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
class English(Language):
lang = "en"

View File

@ -21,7 +21,7 @@ class Lemmatizer(object):
def load(cls, *args, **kwargs):
raise NotImplementedError(Errors.E172)
def __init__(self, lookups, *args, is_base_form=None, **kwargs):
def __init__(self, lookups, is_base_form=None, *args, **kwargs):
"""Initialize a Lemmatizer.
lookups (Lookups): The lookups object containing the (optional) tables

View File

@ -121,6 +121,7 @@ def test_issue3248_1():
assert len(matcher) == 2
@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form")
def test_issue3248_2():
"""Test that the PhraseMatcher can be pickled correctly."""
nlp = English()