mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix issue #3521 by adding all apostrophe variants of each contracted stop word (n't, 'd, 'll, 'm, 're, 's, 've)
This commit is contained in:
parent
e7062cf699
commit
eca9cc5417
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
||||||
must my myself
|
must my myself
|
||||||
|
|
||||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||||
nothing now nowhere n't
|
nothing now nowhere
|
||||||
|
|
||||||
of off often on once one only onto or other others otherwise our ours ourselves
|
of off often on once one only onto or other others otherwise our ours ourselves
|
||||||
out over own
|
out over own
|
||||||
|
@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
||||||
whither who whoever whole whom whose why will with within without would
|
whither who whoever whole whom whose why will with within without would
|
||||||
|
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
|
|
||||||
'd 'll 'm 're 's 've
|
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for hyphen in ["'", "`", "‘", "´", "’"]:
|
||||||
|
for stopword in "n't 'd 'll 'm 're 's 've".split():
|
||||||
|
STOP_WORDS.add(stopword.replace("'", hyphen))
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"word",
|
"word",
|
||||||
|
@ -12,9 +10,7 @@ from spacy.lang.en import English
|
||||||
"I’d",
|
"I’d",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_issue3521(fr_tokenizer, word):
|
def test_issue3521(en_tokenizer, word):
|
||||||
nlp = English()
|
tok = en_tokenizer(word)[1]
|
||||||
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
tok = nlp(word)[1]
|
|
||||||
assert tok.is_stop
|
assert tok.is_stop
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user