Merge pull request #806 from wallinm1/fix/swedish-tokenizer-exceptions

Fix issue #805
This commit is contained in:
Ines Montani 2017-02-04 17:40:40 +01:00 committed by GitHub
commit cf529f4774
4 changed files with 24 additions and 3 deletions

View File

@@ -5,12 +5,14 @@ from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
STOP_WORDS = set(STOP_WORDS)

# Copy the imported Swedish exceptions so the update_exc calls below do not
# mutate the shared dict from .tokenizer_exceptions. (Previously this line
# reassigned TOKENIZER_EXCEPTIONS to strings_to_exc(base.EMOTICONS), which
# discarded the language-specific exceptions entirely — emoticons are merged
# in below instead.)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)

# Merge in orthography-only abbreviations, shared emoticons, and shared
# abbreviations on top of the Swedish-specific exceptions.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

View File

@@ -107,7 +107,6 @@ ORTH_ONLY = [
"p.g.a.",
"ref.",
"resp.",
"s.",
"s.a.s.",
"s.k.",
"st.",

View File

@@ -68,6 +68,11 @@ def fi_tokenizer():
return Finnish.Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():
    """Provide a Swedish tokenizer built from the language defaults."""
    tokenizer = Swedish.Defaults.create_tokenizer()
    return tokenizer
@pytest.fixture
def stringstore():
    """Provide a fresh, empty StringStore for each test."""
    store = StringStore()
    return store

View File

@@ -0,0 +1,15 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
# (text, expected tokens) pairs exercising Swedish abbreviation handling:
# "bl.a.", "kl." and "p.g.a." must each survive tokenization as one token.
SV_TOKEN_EXCEPTION_TESTS = [
    (
        'Smörsåsen används bl.a. till fisk',
        ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk'],
    ),
    (
        'Jag kommer först kl. 13 p.g.a. diverse förseningar',
        ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'],
    ),
]
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
def test_issue805(sv_tokenizer, text, expected_tokens):
    """Swedish abbreviations should tokenize as single tokens (issue #805)."""
    doc = sv_tokenizer(text)
    actual = [token.text for token in doc if not token.is_space]
    assert actual == expected_tokens