Fix lemmatizer is_base_form for python2.7 (#5734)

* Fix lemmatizer init args for python2.7
* Move English is_base_form to a class method
* Skip test pickling PhraseMatcher for python2
2025-12-23 18:13:13 +03:00 · 2020-07-09 22:11:24 +02:00 · 2020-07-09 22:11:24 +02:00 · 0a62098c5f
commit 0a62098c5f
parent 923affd091
3 changed files with 37 additions and 37 deletions
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@ -18,41 +18,6 @@ def _return_en(_):
    return "en"
 def en_is_base_form(univ_pos, morphology=None):
    """
    Tell whether a token belongs to an uninflected paradigm, in which case
    lemmatization can be skipped entirely.

    univ_pos (unicode / int): The token's universal part-of-speech tag.
    morphology (dict): The token's morphological features following the
        Universal Dependencies scheme. Treated as empty when None.
    RETURNS (bool): True if the features describe a base form.
    """
    feats = {} if morphology is None else morphology
    verb_form = feats.get("VerbForm")
    # These features mark a base form whatever the part-of-speech tag is,
    # so they also cover the "verb + inf" and "adj + pos" combinations.
    if verb_form == "inf" or verb_form == "none":
        return True
    if feats.get("Degree") == "pos":
        return True
    if univ_pos == "noun":
        return feats.get("Number") == "sing"
    if univ_pos == "verb":
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        return (
            verb_form == "fin"
            and feats.get("Tense") == "pres"
            and feats.get("Number") is None
        )
    return False
 class EnglishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults):
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    is_base_form = en_is_base_form
    syntax_iterators = SYNTAX_ITERATORS
    single_orth_variants = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults):
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
    ]
    @classmethod
    def is_base_form(cls, univ_pos, morphology=None):
        """
        Tell whether a token belongs to an uninflected paradigm, in which
        case lemmatization can be skipped entirely.

        univ_pos (unicode / int): The token's universal part-of-speech tag.
        morphology (dict): The token's morphological features following the
            Universal Dependencies scheme. Treated as empty when None.
        RETURNS (bool): True if the features describe a base form.
        """
        features = morphology if morphology is not None else {}
        verb_form = features.get("VerbForm")
        degree = features.get("Degree")
        # These features mark a base form whatever the part-of-speech tag
        # is, so they also cover "verb + inf" and "adj + pos".
        if verb_form == "inf" or verb_form == "none" or degree == "pos":
            return True
        if univ_pos == "noun":
            return features.get("Number") == "sing"
        if univ_pos == "verb":
            # This maps 'VBP' to base form -- probably just need 'IS_BASE'
            # morphology
            return (
                verb_form == "fin"
                and features.get("Tense") == "pres"
                and features.get("Number") is None
            )
        return False
 class English(Language):
    lang = "en"
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -21,7 +21,7 @@ class Lemmatizer(object):
    def load(cls, *args, **kwargs):
        raise NotImplementedError(Errors.E172)
-    def __init__(self, lookups, *args, is_base_form=None, **kwargs):
+    def __init__(self, lookups, is_base_form=None, *args, **kwargs):
        """Initialize a Lemmatizer.
        lookups (Lookups): The lookups object containing the (optional) tables
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -121,6 +121,7 @@ def test_issue3248_1():
    assert len(matcher) == 2
@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form")
 def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()