mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Merge pull request #1389 from mdcclv/lemmatizer_obey_exceptions
Lemmatizer obey exceptions
This commit is contained in:
commit
dea81f113d
|
@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
# forms.append(string)
|
# forms.append(string)
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
oov_forms = []
|
oov_forms = []
|
||||||
for old, new in rules:
|
if not forms:
|
||||||
if string.endswith(old):
|
for old, new in rules:
|
||||||
form = string[:len(string) - len(old)] + new
|
if string.endswith(old):
|
||||||
if not form:
|
form = string[:len(string) - len(old)] + new
|
||||||
pass
|
if not form:
|
||||||
elif form in index or not form.isalpha():
|
pass
|
||||||
forms.append(form)
|
elif form in index or not form.isalpha():
|
||||||
else:
|
forms.append(form)
|
||||||
oov_forms.append(form)
|
else:
|
||||||
|
oov_forms.append(form)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.extend(oov_forms)
|
forms.extend(oov_forms)
|
||||||
if not forms:
|
if not forms:
|
||||||
|
|
22
spacy/tests/regression/test_issue1387.py
Normal file
22
spacy/tests/regression/test_issue1387.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import POS, VERB, VerbForm_part
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...lemmatizer import Lemmatizer
|
||||||
|
from ..util import get_doc
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
def test_issue1387():
    """Regression test for issue 1387: the lemma of "coping" must be "cope".

    The exceptions table maps "coping" to "cope", while the only suffix rule
    ("ing" -> "") would turn "coping" into "cop", which is also listed in the
    index. The test asserts that the exception entry is the lemma reported.
    """
    # Lookup tables handed to the Lemmatizer, all keyed by "verb".
    verb_rules = {"verb": [["ing", ""]]}
    verb_exceptions = {"verb": {"coping": ("cope",)}}
    verb_index = {"verb": ("cope", "cop")}

    # Tag map entry for the 'VBG' tag assigned to the token below.
    vbg_tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}

    vocab = Vocab(
        lemmatizer=Lemmatizer(verb_index, verb_exceptions, verb_rules),
        tag_map=vbg_tag_map,
    )
    doc = get_doc(vocab, ["coping"])
    doc[0].tag_ = 'VBG'

    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
|
||||||
|
|
|
@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN):
|
||||||
assert all(t.lemma_ == '' for t in doc)
|
assert all(t.lemma_ == '' for t in doc)
|
||||||
EN.tagger(doc)
|
EN.tagger(doc)
|
||||||
assert all(t.lemma_ != '' for t in doc)
|
assert all(t.lemma_ != '' for t in doc)
|
||||||
|
|
||||||
|
|
||||||
|
from ...symbols import POS, VERB, VerbForm_part
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...lemmatizer import Lemmatizer
|
||||||
|
from ..util import get_doc
|
||||||
|
def test_tagger_lemmatizer_exceptions():
    """Check that a token tagged 'VBG' gets its lemma from the exceptions table.

    "coping" has an explicit exception ("cope"); the single suffix rule
    ("ing" -> "") would instead produce "cop", which is also in the index.
    The expected lemma is the exception entry.
    """
    lemmatizer = Lemmatizer(
        {"verb": ("cope", "cop")},          # index
        {"verb": {"coping": ("cope",)}},    # exceptions
        {"verb": [["ing", ""]]},            # suffix rules
    )
    vocab = Vocab(
        lemmatizer=lemmatizer,
        tag_map={'VBG': {POS: VERB, VerbForm_part: True}},
    )

    doc = get_doc(vocab, ["coping"])
    # The lemma is read only after the tag has been assigned.
    doc[0].tag_ = 'VBG'

    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user