Add test for #1250: Tokenizer cache clobbered special-case attrs

This commit is contained in:
Matthew Honnibal 2017-10-24 16:07:18 +02:00
parent 18f1c1d0ba
commit 63f0bde749

View File

@@ -0,0 +1,13 @@
from __future__ import unicode_literals
from ...tokenizer import Tokenizer
from ...symbols import ORTH, LEMMA, POS
from ...lang.en import English
def test_issue1250_cached_special_cases():
    """Regression test for issue #1250: special-case attrs must survive re-use.

    Registers a tokenizer special case for 'reimbur' that carries custom
    LEMMA and POS attributes, then runs the pipeline over the same text
    twice and checks the resulting lemmas after each run.
    """
    pipeline = English()
    special_case = [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}]
    pipeline.tokenizer.add_special_case(u'reimbur', special_case)

    expected = ['reimburse', ',', 'reimburse', '...']
    # The second pass repeats the first on identical input; presumably it is
    # served from the tokenizer cache, which is where the special-case
    # attributes were reported lost.
    for _ in range(2):
        doc = pipeline(u'reimbur, reimbur...')
        assert [token.lemma_ for token in doc] == expected