Merge pull request #1389 from mdcclv/lemmatizer_obey_exceptions

Lemmatizer obey exceptions
This commit is contained in:
Matthew Honnibal 2017-10-05 22:11:21 +02:00 committed by GitHub
commit dea81f113d
3 changed files with 49 additions and 9 deletions

View File

@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules):
# forms.append(string) # forms.append(string)
forms.extend(exceptions.get(string, [])) forms.extend(exceptions.get(string, []))
oov_forms = [] oov_forms = []
for old, new in rules: if not forms:
if string.endswith(old): for old, new in rules:
form = string[:len(string) - len(old)] + new if string.endswith(old):
if not form: form = string[:len(string) - len(old)] + new
pass if not form:
elif form in index or not form.isalpha(): pass
forms.append(form) elif form in index or not form.isalpha():
else: forms.append(form)
oov_forms.append(form) else:
oov_forms.append(form)
if not forms: if not forms:
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms: if not forms:

View File

@ -0,0 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
from ...symbols import POS, VERB, VerbForm_part
from ...vocab import Vocab
from ...lemmatizer import Lemmatizer
from ..util import get_doc
import pytest
def test_issue1387():
    """Regression test for issue #1387: lemmatizer exceptions must be obeyed.

    The suffix rule ("ing" -> "") turns "coping" into "cop", which is also
    present in the verb index. The exception table maps "coping" to "cope",
    and that entry is the lemma the token is expected to end up with.
    """
    verb_index = ("cope", "cop")
    verb_exceptions = {"coping": ("cope",)}
    verb_rules = [["ing", ""]]
    lemmatizer = Lemmatizer(
        {"verb": verb_index},
        {"verb": verb_exceptions},
        {"verb": verb_rules},
    )
    tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = get_doc(vocab, ["coping"])
    token = doc[0]
    # The lemma is only checked after the tag has been assigned.
    token.tag_ = 'VBG'
    assert token.text == "coping"
    assert token.lemma_ == "cope"

View File

@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN):
assert all(t.lemma_ == '' for t in doc) assert all(t.lemma_ == '' for t in doc)
EN.tagger(doc) EN.tagger(doc)
assert all(t.lemma_ != '' for t in doc) assert all(t.lemma_ != '' for t in doc)
from ...symbols import POS, VERB, VerbForm_part
from ...vocab import Vocab
from ...lemmatizer import Lemmatizer
from ..util import get_doc
def test_tagger_lemmatizer_exceptions():
    """Check that a lemmatizer exception entry decides the token's lemma.

    "coping" is listed in the verb exceptions with lemma "cope", while the
    only suffix rule ("ing" -> "") would produce "cop", a form that is in
    the verb index as well. The expected lemma is the exception's "cope".
    """
    word = "coping"
    expected_lemma = "cope"

    lookup_index = {"verb": ("cope", "cop")}
    exception_table = {"verb": {"coping": ("cope",)}}
    suffix_rules = {"verb": [["ing", ""]]}
    vbg_tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}

    lemmatizer = Lemmatizer(lookup_index, exception_table, suffix_rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=vbg_tag_map)
    doc = get_doc(vocab, [word])

    # Tag the token as VBG before inspecting its lemma.
    first = doc[0]
    first.tag_ = 'VBG'

    assert first.text == "coping"
    assert first.lemma_ == expected_lemma