addressed all comments by Ines

This commit is contained in:
svlandeg 2019-04-03 13:50:33 +02:00
parent 85b4319f33
commit 4ff786e113
3 changed files with 14 additions and 7 deletions

View File

@ -69,7 +69,10 @@ yet you your yours yourself yourselves
""".split() """.split()
) )
for hyphen in ["'", "`", "‘", "´", "’"]: contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
for stopword in u"n't 'd 'll 'm 're 's 've".split(): STOP_WORDS.update(contractions)
STOP_WORDS.add(stopword.replace("'", hyphen))
for apostrophe in ["‘", "’"]:
for stopword in contractions:
STOP_WORDS.add(stopword.replace("'", apostrophe))

View File

@ -1,4 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals
import pytest import pytest
from spacy.lang.en import English from spacy.lang.en import English

View File

@ -1,14 +1,16 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"word", "word",
[ [
u"don't", "don't",
u"dont", "dont",
u"I'd", "I'd",
u"Id", "Id",
], ],
) )
def test_issue3521(en_tokenizer, word): def test_issue3521(en_tokenizer, word):