fixing Issue #3521 by adding all hyphen variants for each stopword

svlandeg 2019-04-02 13:24:59 +02:00
parent e7062cf699
commit eca9cc5417
2 changed files with 9 additions and 10 deletions


@@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself
 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere n't
+nothing now nowhere
 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
-'d 'll 'm 're 's 've
 """.split()
 )
+
+for hyphen in ["'", "`", "‘", "´", "’"]:
+    for stopword in "n't 'd 'll 'm 're 's 've".split():
+        STOP_WORDS.add(stopword.replace("'", hyphen))
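
The loop replaces the ASCII apostrophe in each contracted stop word with every apostrophe-like character in the list, so typographic variants such as "n’t" or "´d" are flagged in addition to the plain forms. A minimal standalone sketch of the same expansion (STOP_WORDS here is a stand-in set, not spaCy's actual module-level set):

# Standalone sketch of the expansion performed by the loop above.
STOP_WORDS = set()

for hyphen in ["'", "`", "‘", "´", "’"]:          # apostrophe-like characters
    for stopword in "n't 'd 'll 'm 're 's 've".split():
        STOP_WORDS.add(stopword.replace("'", hyphen))

assert "n't" in STOP_WORDS   # plain ASCII apostrophe
assert "n’t" in STOP_WORDS   # typographic apostrophe (U+2019)
assert "´d" in STOP_WORDS    # acute accent used as an apostrophe
print(len(STOP_WORDS))       # 7 contractions x 5 characters = 35 variants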


@@ -1,7 +1,5 @@
 import pytest
-from spacy.lang.en import English
 @pytest.mark.parametrize(
     "word",
@@ -12,9 +10,7 @@ from spacy.lang.en import English
         "I’d",
     ],
 )
-def test_issue3521(fr_tokenizer, word):
-    nlp = English()
-    # 'not' and 'would' should be stopwords, also in their abbreviated forms
-    tok = nlp(word)[1]
+def test_issue3521(en_tokenizer, word):
+    tok = en_tokenizer(word)[1]
     assert tok.is_stop
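
The en_tokenizer argument is the pytest fixture from spaCy's test suite that provides the English tokenizer, so the test no longer has to build an English() pipeline itself. Outside the test suite, roughly the same check can be reproduced with a blank English pipeline; a sketch, assuming a spaCy version that includes this change:

import spacy

nlp = spacy.blank("en")           # tokenizer-only English pipeline
for text in ["I’d", "don’t"]:     # illustrative inputs, not the test's exact parameter list
    doc = nlp(text)               # e.g. "don’t" -> ["do", "n’t"]
    # the contracted second token should now be flagged as a stop word,
    # whichever apostrophe character the text uses
    assert doc[1].is_stop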