From 50878ef598e7978f280026cb939a0c76ba97a766 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 16 Jan 2017 13:10:38 +0100
Subject: [PATCH] Exclude "were" and "Were" from tokenizer exceptions and add
 regression test (resolves #744)

---
 spacy/en/tokenizer_exceptions.py        |  2 +-
 spacy/tests/regression/test_issue744.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/regression/test_issue744.py

diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index 798f9ec53..36bb0d7f0 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -7,7 +7,7 @@ from ..language_data import PRON_LEMMA
 
 EXC = {}
 
-EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Well", "well", "Whore", "whore"]
+EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "were", "Were", "Well", "well", "Whore", "whore"]
 
 
 # Pronouns
diff --git a/spacy/tests/regression/test_issue744.py b/spacy/tests/regression/test_issue744.py
new file mode 100644
index 000000000..4e5eb2e10
--- /dev/null
+++ b/spacy/tests/regression/test_issue744.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
+def test_issue744(en_tokenizer, text):
+    """Test that 'were' and 'Were' are excluded from the contractions
+    generated by the English tokenizer exceptions."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text.lower() == "were"
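
For reviewers who want to check the fix by hand outside the test suite, a minimal sketch follows; it is not part of the patch. It assumes a spaCy 1.x installation with the English data downloaded. spacy.load() and the token attributes are standard spaCy API; everything else is purely illustrative of what the regression test above asserts.

# Illustrative sketch only, not part of the patch. Assumes spaCy 1.x with the
# English data available (e.g. via `python -m spacy.en.download`).
import spacy

nlp = spacy.load('en')

doc = nlp(u"We were scared")
print([t.text for t in doc])
# With this patch applied the expected output is ['We', 'were', 'scared'],
# i.e. 3 tokens. Without the exclusion, "were" was apparently caught by the
# apostrophe-free "we" + "'re" contraction exception and split in two,
# which is the behaviour reported in #744.
assert len(doc) == 3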