From e81a608173e78b10da5984cf0d2632de29f407f1 Mon Sep 17 00:00:00 2001
From: Orion Montoya
Date: Thu, 5 Oct 2017 10:47:48 -0400
Subject: [PATCH 1/3] Regression test for lemmatizer exceptions -- demonstrate issue #1387

---
 spacy/tests/regression/test_issue1387.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1387.py

Reviewer note: the new test builds a Lemmatizer whose verb exception
table maps "coping" to ("cope",), whose verb index holds both "cope" and
"cop", and whose single verb rule strips the suffix "ing" (which applied
to "coping" yields "cop"). It tags the only token as VBG and asserts
that its lemma_ is "cope". `pytest` is imported by the new file but is
not referenced anywhere in the added lines.

diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py
new file mode 100644
index 000000000..c5f01d145
--- /dev/null
+++ b/spacy/tests/regression/test_issue1387.py
@@ -0,0 +1,22 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...symbols import POS, VERB, VerbForm_part
+from ...vocab import Vocab
+from ...lemmatizer import Lemmatizer
+from ..util import get_doc
+
+import pytest
+
+def test_issue1387():
+    tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
+    index = {"verb": ("cope","cop")}
+    exc = {"verb": {"coping": ("cope",)}}
+    rules = {"verb": [["ing", ""]]}
+    lemmatizer = Lemmatizer(index, exc, rules)
+    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    doc = get_doc(vocab, ["coping"])
+    doc[0].tag_ = 'VBG'
+    assert doc[0].text == "coping"
+    assert doc[0].lemma_ == "cope"
+

From ffb50d21a043a1028a7a8ac3f354483ec100fce6 Mon Sep 17 00:00:00 2001
From: Orion Montoya
Date: Thu, 5 Oct 2017 10:49:02 -0400
Subject: [PATCH 2/3] Lemmatizer honors exceptions: Fix #1387

---
 spacy/lemmatizer.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

Reviewer note: in lemmatize(), the suffix-rule loop is now nested under
`if not forms:`, so it only runs when the preceding
`forms.extend(exceptions.get(string, []))` left `forms` empty. When an
exception entry matched, no rule-derived form is added to `forms` any
more. The loop body itself is unchanged apart from one extra level of
indentation. NOTE(review): only the hunk context is visible here;
confirm against the full function that nothing after the hunk relies on
`oov_forms` having been filled when an exception matched.

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index d7541c56b..1112bcee3 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules):
     #    forms.append(string)
     forms.extend(exceptions.get(string, []))
     oov_forms = []
-    for old, new in rules:
-        if string.endswith(old):
-            form = string[:len(string) - len(old)] + new
-            if not form:
-                pass
-            elif form in index or not form.isalpha():
-                forms.append(form)
-            else:
-                oov_forms.append(form)
+    if not forms:
+        for old, new in rules:
+            if string.endswith(old):
+                form = string[:len(string) - len(old)] + new
+                if not form:
+                    pass
+                elif form in index or not form.isalpha():
+                    forms.append(form)
+                else:
+                    oov_forms.append(form)
     if not forms:
         forms.extend(oov_forms)
     if not forms:

From b0d271809dab5146fdc45cfcfab2e467b8a9347e Mon Sep 17 00:00:00 2001
From: Orion Montoya
Date: Thu, 5 Oct 2017 10:49:28 -0400
Subject: [PATCH 3/3] Unit test for lemmatizer exceptions -- copied from regression test for #1387

---
 spacy/tests/tagger/test_lemmatizer.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

Reviewer note: appends test_tagger_lemmatizer_exceptions, which sets up
the same index / exception / rule tables and makes the same two
assertions as test_issue1387 in patch 1/3. The four `from ... import`
lines are added at the hunk position, after the existing
test_tagger_lemmatizer_lemma_assignment, rather than at the top of the
module.

diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 5db0d0b2c..91ed7d2f1 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN):
     assert all(t.lemma_ == '' for t in doc)
     EN.tagger(doc)
     assert all(t.lemma_ != '' for t in doc)
+
+
+from ...symbols import POS, VERB, VerbForm_part
+from ...vocab import Vocab
+from ...lemmatizer import Lemmatizer
+from ..util import get_doc
+def test_tagger_lemmatizer_exceptions():
+    index = {"verb": ("cope","cop")}
+    exc = {"verb": {"coping": ("cope",)}}
+    rules = {"verb": [["ing", ""]]}
+    tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
+    lemmatizer = Lemmatizer(index, exc, rules)
+    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    doc = get_doc(vocab, ["coping"])
+    doc[0].tag_ = 'VBG'
+    assert doc[0].text == "coping"
+    assert doc[0].lemma_ == "cope"