From 6f7e7d88b929054bdae342c6a83b66415bed8c6f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 6 Jan 2021 05:30:30 +0100
Subject: [PATCH] remove cause without apostrophe from norm exceptions (#6636)

---
 spacy/lang/en/tokenizer_exceptions.py  |  1 -
 spacy/tests/lang/en/test_exceptions.py | 10 +++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 964a714ae..dbe1b2a51 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -319,7 +319,6 @@ for exc_data in [
 
 # Other contractions with leading apostrophe
 for exc_data in [
-    {ORTH: "cause", NORM: "because"},
     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
     {ORTH: "ll", LEMMA: "will", NORM: "will"},
     {ORTH: "nuff", LEMMA: "enough", NORM: "enough"},
diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py
index 1ff64eff2..6f747c550 100644
--- a/spacy/tests/lang/en/test_exceptions.py
+++ b/spacy/tests/lang/en/test_exceptions.py
@@ -111,7 +111,15 @@ def test_en_tokenizer_handles_times(en_tokenizer, text):
 
 
 @pytest.mark.parametrize(
-    "text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
+    "text,norms",
+    [
+        ("I'm", ["i", "am"]),
+        ("shan't", ["shall", "not"]),
+        (
+            "Many factors cause cancer 'cause it is complex",
+            ["many", "factors", "cause", "cancer", "because", "it", "is", "complex"],
+        ),
+    ],
 )
 def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
     tokens = en_tokenizer(text)