addressed all comments by Ines

svlandeg 2019-04-03 13:50:33 +02:00
parent 85b4319f33
commit 4ff786e113
3 changed files with 14 additions and 7 deletions

@@ -69,7 +69,10 @@ yet you your yours yourself yourselves
 """.split()
 )
-for hyphen in ["'", "`", "‘", "´", "’"]:
-    for stopword in u"n't 'd 'll 'm 're 's 've".split():
-        STOP_WORDS.add(stopword.replace("'", hyphen))
+contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+STOP_WORDS.update(contractions)
+for apostrophe in ["‘", "’"]:
+    for stopword in contractions:
+        STOP_WORDS.add(stopword.replace("'", apostrophe))
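
As a quick sanity check on the new block, the result can be verified directly against the exported STOP_WORDS set. The snippet below is only an illustrative sketch mirroring the added lines, not part of the commit:

# Illustrative sketch (not part of the commit): the contractions and their
# curly-apostrophe variants added above should all be present in STOP_WORDS.
from spacy.lang.en.stop_words import STOP_WORDS

contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
for stopword in contractions:
    assert stopword in STOP_WORDS                    # straight apostrophe form
    assert stopword.replace("'", "‘") in STOP_WORDS  # left single quotation mark
    assert stopword.replace("'", "’") in STOP_WORDS  # right single quotation mark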

@@ -1,4 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
 import pytest
 from spacy.lang.en import English

@@ -1,14 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
 import pytest
 @pytest.mark.parametrize(
     "word",
     [
-        u"don't",
-        u"don’t",
-        u"I'd",
-        u"I’d",
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
     ],
 )
 def test_issue3521(en_tokenizer, word):
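
The hunk is cut off before the assertion body. End-to-end, the behaviour this regression test guards can be sketched with the public API as follows; this is only an assumed illustration, not the commit's actual test code:

# Hedged sketch, not the actual test body: after the stop-word fix, the
# contraction token produced by the English tokenizer is flagged as a stop word.
import spacy

nlp = spacy.blank("en")
doc = nlp("I'd")
assert doc[1].text == "'d"  # English tokenizer exceptions split off the contraction
assert doc[1].is_stop       # "'d" is now in STOP_WORDS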