Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Fix lemmatizer is_base_form for python2.7 (#5734)
* Fix lemmatizer init args for python2.7
* Move English is_base_form to a class method
* Skip test pickling PhraseMatcher for python2
This commit is contained in:
parent
923affd091
commit
0a62098c5f
|
@@ -18,41 +18,6 @@ def _return_en(_):
|
||||||
return "en"
|
return "en"
|
||||||
|
|
||||||
|
|
||||||
def en_is_base_form(univ_pos, morphology=None):
|
|
||||||
"""
|
|
||||||
Check whether we're dealing with an uninflected paradigm, so we can
|
|
||||||
avoid lemmatization entirely.
|
|
||||||
|
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
|
||||||
morphology (dict): The token's morphological features following the
|
|
||||||
Universal Dependencies scheme.
|
|
||||||
"""
|
|
||||||
if morphology is None:
|
|
||||||
morphology = {}
|
|
||||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
|
||||||
return True
|
|
||||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
|
||||||
return True
|
|
||||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
|
||||||
# morphology
|
|
||||||
elif univ_pos == "verb" and (
|
|
||||||
morphology.get("VerbForm") == "fin"
|
|
||||||
and morphology.get("Tense") == "pres"
|
|
||||||
and morphology.get("Number") is None
|
|
||||||
):
|
|
||||||
return True
|
|
||||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
|
||||||
return True
|
|
||||||
elif morphology.get("VerbForm") == "inf":
|
|
||||||
return True
|
|
||||||
elif morphology.get("VerbForm") == "none":
|
|
||||||
return True
|
|
||||||
elif morphology.get("Degree") == "pos":
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
@@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults):
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
is_base_form = en_is_base_form
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
single_orth_variants = [
|
single_orth_variants = [
|
||||||
{"tags": ["NFP"], "variants": ["…", "..."]},
|
{"tags": ["NFP"], "variants": ["…", "..."]},
|
||||||
|
@@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults):
|
||||||
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def is_base_form(cls, univ_pos, morphology=None):
|
||||||
|
"""
|
||||||
|
Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
|
avoid lemmatization entirely.
|
||||||
|
|
||||||
|
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||||
|
morphology (dict): The token's morphological features following the
|
||||||
|
Universal Dependencies scheme.
|
||||||
|
"""
|
||||||
|
if morphology is None:
|
||||||
|
morphology = {}
|
||||||
|
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||||
|
return True
|
||||||
|
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||||
|
return True
|
||||||
|
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||||
|
# morphology
|
||||||
|
elif univ_pos == "verb" and (
|
||||||
|
morphology.get("VerbForm") == "fin"
|
||||||
|
and morphology.get("Tense") == "pres"
|
||||||
|
and morphology.get("Number") is None
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||||
|
return True
|
||||||
|
elif morphology.get("VerbForm") == "inf":
|
||||||
|
return True
|
||||||
|
elif morphology.get("VerbForm") == "none":
|
||||||
|
return True
|
||||||
|
elif morphology.get("Degree") == "pos":
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
lang = "en"
|
lang = "en"
|
||||||
|
|
|
@@ -21,7 +21,7 @@ class Lemmatizer(object):
|
||||||
def load(cls, *args, **kwargs):
|
def load(cls, *args, **kwargs):
|
||||||
raise NotImplementedError(Errors.E172)
|
raise NotImplementedError(Errors.E172)
|
||||||
|
|
||||||
def __init__(self, lookups, *args, is_base_form=None, **kwargs):
|
def __init__(self, lookups, is_base_form=None, *args, **kwargs):
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
|
||||||
lookups (Lookups): The lookups object containing the (optional) tables
|
lookups (Lookups): The lookups object containing the (optional) tables
|
||||||
|
|
|
@@ -121,6 +121,7 @@ def test_issue3248_1():
|
||||||
assert len(matcher) == 2
|
assert len(matcher) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form")
|
||||||
def test_issue3248_2():
|
def test_issue3248_2():
|
||||||
"""Test that the PhraseMatcher can be pickled correctly."""
|
"""Test that the PhraseMatcher can be pickled correctly."""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user