mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Fix issue #3521 by adding all apostrophe variants for each contracted stop word
This commit is contained in:
parent
e7062cf699
commit
eca9cc5417
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
|||
must my myself
|
||||
|
||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||
nothing now nowhere n't
|
||||
nothing now nowhere
|
||||
|
||||
of off often on once one only onto or other others otherwise our ours ourselves
|
||||
out over own
|
||||
|
@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
|||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
|
||||
'd 'll 'm 're 's 've
|
||||
""".split()
|
||||
)
|
||||
|
||||
# Contractions can be written with several apostrophe-like characters
# (straight quote, backtick, curly quotes, acute accent). Add every
# contraction to STOP_WORDS once per variant.
contractions = "n't 'd 'll 'm 're 's 've".split()
for apostrophe in ["'", "`", "‘", "´", "’"]:
    for contraction in contractions:
        STOP_WORDS.add(contraction.replace("'", apostrophe))
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
import pytest
|
||||
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word",
|
||||
|
@ -12,9 +10,7 @@ from spacy.lang.en import English
|
|||
"I’d",
|
||||
],
|
||||
)
|
||||
def test_issue3521(fr_tokenizer, word):
|
||||
nlp = English()
|
||||
|
||||
tok = nlp(word)[1]
|
||||
def test_issue3521(en_tokenizer, word):
|
||||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
assert tok.is_stop
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user