From 63f0bde749018909812de4f1cf3ec12cf6770483 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 24 Oct 2017 16:07:18 +0200
Subject: [PATCH] Add test for #1250: Tokenizer cache clobbered special-case
 attrs

---
 spacy/tests/regression/test_issue1250.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1250.py

diff --git a/spacy/tests/regression/test_issue1250.py b/spacy/tests/regression/test_issue1250.py
new file mode 100644
index 000000000..3b6e0bbf2
--- /dev/null
+++ b/spacy/tests/regression/test_issue1250.py
@@ -0,0 +1,13 @@
+from __future__ import unicode_literals
+from ...tokenizer import Tokenizer
+from ...symbols import ORTH, LEMMA, POS
+from ...lang.en import English
+
+def test_issue1250_cached_special_cases():
+    nlp = English()
+    nlp.tokenizer.add_special_case(u'reimbur', [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}])
+
+    lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')]
+    assert lemmas == ['reimburse', ',', 'reimburse', '...']
+    lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')]
+    assert lemmas == ['reimburse', ',', 'reimburse', '...']
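
Note: for context, a minimal standalone sketch of the behaviour this regression
test guards against, using spaCy's public English/add_special_case API with
absolute imports (the `reimbur` special case is the one from the test itself).
The tokenizer caches chunks it has already tokenized, and per the subject line
the cached entry clobbered the LEMMA/POS attrs attached via add_special_case,
so only the first pass over a text produced the expected lemmas:

    from __future__ import unicode_literals
    from spacy.symbols import ORTH, LEMMA, POS
    from spacy.lang.en import English

    nlp = English()
    # Attach LEMMA/POS attrs to a custom special-case token.
    nlp.tokenizer.add_special_case(
        u'reimbur', [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}])

    # The first pass fills the tokenizer cache; the bug only surfaced on
    # the second pass, when the cached tokens were reused.
    for attempt in range(2):
        lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')]
        assert lemmas == ['reimburse', ',', 'reimburse', '...'], (attempt, lemmas)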