addressed all comments by Ines

svlandeg 2019-04-03 13:50:33 +02:00
parent 85b4319f33
commit 4ff786e113
3 changed files with 14 additions and 7 deletions

@@ -69,7 +69,10 @@ yet you your yours yourself yourselves
 """.split()
 )
-for hyphen in ["'", "`", "‘", "´", "’"]:
-    for stopword in u"n't 'd 'll 'm 're 's 've".split():
-        STOP_WORDS.add(stopword.replace("'", hyphen))
+contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+STOP_WORDS.update(contractions)
+for apostrophe in ["‘", "’"]:
+    for stopword in contractions:
+        STOP_WORDS.add(stopword.replace("'", apostrophe))
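
As a quick sanity check on the new block, the result can be verified directly against the exported STOP_WORDS set. The snippet below is only an illustrative sketch mirroring the added lines, not part of the commit:

# Illustrative sketch (not part of the commit): the contractions and their
# curly-apostrophe variants added above should all be present in STOP_WORDS.
from spacy.lang.en.stop_words import STOP_WORDS

contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
for stopword in contractions:
    assert stopword in STOP_WORDS                    # straight apostrophe form
    assert stopword.replace("'", "‘") in STOP_WORDS  # left single quotation mark
    assert stopword.replace("'", "’") in STOP_WORDS  # right single quotation mark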

@@ -1,4 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
 import pytest
 from spacy.lang.en import English

@@ -1,14 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
 import pytest
 @pytest.mark.parametrize(
     "word",
     [
-        u"don't",
-        u"don’t",
-        u"I'd",
-        u"I’d",
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
     ],
 )
 def test_issue3521(en_tokenizer, word):
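
The hunk is cut off before the assertion body. End-to-end, the behaviour this regression test guards can be sketched with the public API as follows; this is only an assumed illustration, not the commit's actual test code:

# Hedged sketch, not the actual test body: after the stop-word fix, the
# contraction token produced by the English tokenizer is flagged as a stop word.
import spacy

nlp = spacy.blank("en")
doc = nlp("I'd")
assert doc[1].text == "'d"  # English tokenizer exceptions split off the contraction
assert doc[1].is_stop       # "'d" is now in STOP_WORDS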