fixing Issue #3521 by adding all hyphen variants for each stopword

This commit is contained in:
svlandeg 2019-04-02 13:24:59 +02:00
parent e7062cf699
commit eca9cc5417
2 changed files with 9 additions and 10 deletions

View File

@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere n't
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
'd 'll 'm 're 's 've
""".split()
)
# Issue #3521: the contracted stopwords above are written with a plain ASCII
# apostrophe, but tokenized text may carry typographic variants. Register each
# contraction under every common apostrophe/accent character.
# NOTE: the two Unicode apostrophes U+2018 / U+2019 had been corrupted to empty
# strings by an encoding round-trip; an empty replacement would have *stripped*
# the apostrophe (adding bogus entries like "nt", "d", "ll"). Restored here.
for hyphen in ["'", "`", "\u2018", "\u00b4", "\u2019"]:  # ' ` ‘ ´ ’
    for stopword in "n't 'd 'll 'm 're 's 've".split():
        STOP_WORDS.add(stopword.replace("'", hyphen))

View File

@ -1,7 +1,5 @@
import pytest
from spacy.lang.en import English
@pytest.mark.parametrize(
"word",
@ -12,9 +10,7 @@ from spacy.lang.en import English
"Id",
],
)
def test_issue3521(fr_tokenizer, word):
nlp = English()
tok = nlp(word)[1]
def test_issue3521(en_tokenizer, word):
    """Regression test for issue #3521.

    ``word`` (supplied by the parametrize decorator above) is a two-token
    string whose second token is a contracted stopword such as "n't" or "'d",
    possibly spelled with a non-ASCII apostrophe variant. The second token
    must be recognized as a stop word.
    """
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop