Fix formatting and add comment on languages

This commit is contained in:
ines 2017-10-14 13:11:18 +02:00
parent a4d974d97b
commit 9b3f8f9ec3

View File

@ -11,8 +11,12 @@ from ..strings import StringStore
from .. import util
# These languages are used for generic tokenizer tests. Only add a language
# here if it's using spaCy's tokenizer (not a different library).
# TODO: re-implement generic tokenizer tests
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx']
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
@ -42,6 +46,7 @@ def FR(request):
#lang = util.get_lang_class(request.param)
#return lang.Defaults.create_tokenizer()
@pytest.fixture
def tokenizer():
    """Return a tokenizer created from the defaults of the 'xx' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('xx').Defaults.create_tokenizer()
@ -87,10 +92,12 @@ def hu_tokenizer():
def fi_tokenizer():
    """Return a tokenizer created from the defaults of the 'fi' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture
def id_tokenizer():
    """Return a tokenizer created from the defaults of the 'id' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('id').Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():
    """Return a tokenizer created from the defaults of the 'sv' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('sv').Defaults.create_tokenizer()
@ -105,6 +112,7 @@ def bn_tokenizer():
def he_tokenizer():
    """Return a tokenizer created from the defaults of the 'he' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('he').Defaults.create_tokenizer()
@pytest.fixture
def nb_tokenizer():
    """Return a tokenizer created from the defaults of the 'nb' language class."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return util.get_lang_class('nb').Defaults.create_tokenizer()
@ -129,6 +137,7 @@ def en_entityrecognizer():
def text_file():
    """Return a new, empty in-memory text buffer (``StringIO``)."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return StringIO()
@pytest.fixture
def text_file_b():
    """Return a new, empty in-memory bytes buffer (``BytesIO``)."""
    # The function body must be indented; at column 0 the `return` is a
    # syntax error.
    return BytesIO()