fixing Issue #3521 by adding all hyphen variants for each stopword

svlandeg 2019-04-02 13:24:59 +02:00
parent e7062cf699
commit eca9cc5417
2 changed files with 9 additions and 10 deletions


@@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself
 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere n't
+nothing now nowhere
 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
-'d 'll 'm 're 's 've
 """.split()
 )
+
+for hyphen in ["'", "`", "‘", "´", "’"]:
+    for stopword in "n't 'd 'll 'm 're 's 've".split():
+        STOP_WORDS.add(stopword.replace("'", hyphen))
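
The loop replaces the ASCII apostrophe in each contracted stop word with every apostrophe-like character in the list, so typographic variants such as "n’t" or "´d" are flagged in addition to the plain forms. A minimal standalone sketch of the same expansion (STOP_WORDS here is a stand-in set, not spaCy's actual module-level set):

# Standalone sketch of the expansion performed by the loop above.
STOP_WORDS = set()

for hyphen in ["'", "`", "‘", "´", "’"]:          # apostrophe-like characters
    for stopword in "n't 'd 'll 'm 're 's 've".split():
        STOP_WORDS.add(stopword.replace("'", hyphen))

assert "n't" in STOP_WORDS   # plain ASCII apostrophe
assert "n’t" in STOP_WORDS   # typographic apostrophe (U+2019)
assert "´d" in STOP_WORDS    # acute accent used as an apostrophe
print(len(STOP_WORDS))       # 7 contractions x 5 characters = 35 variants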


@@ -1,7 +1,5 @@
 import pytest
-from spacy.lang.en import English
 @pytest.mark.parametrize(
     "word",
@@ -12,9 +10,7 @@ from spacy.lang.en import English
         "I’d",
     ],
 )
-def test_issue3521(fr_tokenizer, word):
-    nlp = English()
-    # 'not' and 'would' should be stopwords, also in their abbreviated forms
-    tok = nlp(word)[1]
+def test_issue3521(en_tokenizer, word):
+    tok = en_tokenizer(word)[1]
     assert tok.is_stop
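
The en_tokenizer argument is the pytest fixture from spaCy's test suite that provides the English tokenizer, so the test no longer has to build an English() pipeline itself. Outside the test suite, roughly the same check can be reproduced with a blank English pipeline; a sketch, assuming a spaCy version that includes this change:

import spacy

nlp = spacy.blank("en")           # tokenizer-only English pipeline
for text in ["I’d", "don’t"]:     # illustrative inputs, not the test's exact parameter list
    doc = nlp(text)               # e.g. "don’t" -> ["do", "n’t"]
    # the contracted second token should now be flagged as a stop word,
    # whichever apostrophe character the text uses
    assert doc[1].is_stop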