From 0a62098c5f0e0abe640a76776ddf6ea7094e2c23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 22:11:24 +0200 Subject: [PATCH] Fix lemmatizer is_base_form for python2.7 (#5734) * Fix lemmatizer init args for python2.7 * Move English is_base_form to a class method * Skip test pickling PhraseMatcher for python2 --- spacy/lang/en/__init__.py | 71 +++++++++---------- spacy/lemmatizer.py | 2 +- spacy/tests/regression/test_issue3001-3500.py | 1 + 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index d52f3dfd8..f58ae4a4e 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,41 +18,6 @@ def _return_en(_): return "en" -def en_is_base_form(univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - - class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES - is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, @@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults): {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, ] + @classmethod + def is_base_form(cls, univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + class English(Language): lang = "en" diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index f72eae128..8b2375257 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, is_base_form=None, **kwargs): + def __init__(self, lookups, is_base_form=None, *args, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index effbebb92..a10225390 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -121,6 +121,7 @@ def test_issue3248_1(): assert len(matcher) == 2 +@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form") def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English()