diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 07d4ff34c..aae3e5e01 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -69,7 +69,10 @@ yet you your yours yourself yourselves
 """.split()
 )
 
 
-for hyphen in ["'", "`", "‘", "´", "’"]:
-    for stopword in u"n't 'd 'll 'm 're 's 've".split():
-        STOP_WORDS.add(stopword.replace("'", hyphen))
+contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+STOP_WORDS.update(contractions)
+
+for apostrophe in ["‘", "’"]:
+    for stopword in contractions:
+        STOP_WORDS.add(stopword.replace("'", apostrophe))
diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
index 9f670d5aa..61a76334a 100644
--- a/spacy/tests/regression/test_issue3449.py
+++ b/spacy/tests/regression/test_issue3449.py
@@ -1,4 +1,6 @@
 # coding: utf8
+from __future__ import unicode_literals
+
 import pytest
 
 from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index 12f285099..6d841894a 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -1,14 +1,16 @@
 # coding: utf8
+from __future__ import unicode_literals
+
 import pytest
 
 
 @pytest.mark.parametrize(
     "word",
     [
-        u"don't",
-        u"don’t",
-        u"I'd",
-        u"I’d",
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
     ],
 )
 def test_issue3521(en_tokenizer, word):