spaCy/spacy/tests/conftest.py
Stanisław Giziński 1448ad100c Improved polish tokenizer and stop words. (#2974)
* Improved stop words list

* Removed some wrong stop words from list

* Improved Polish Tokenizer (#38)

* Add tests for polish tokenizer

* Add polish tokenizer exceptions

* Don't split any words containing hyphens

* Fix test case with wrong model answer

* Remove commented out line of code until better solution is found

* Add source srx's license

* Rename exception_list.py to match spaCy conventionality

* Add a brief explanation of where the exception list comes from

* Add newline after each exception

* Rename COPYING.txt to LICENSE

* Delete old files

* Add header to the license

* Agreements signed

* Stanisław Giziński agreement

* Krzysztof Kowalczyk - signed agreement

* Mateusz Olko agreement

* Add DoomCoder's contributor agreement

* Improve like number checking in Polish lang

* Like num tests added

* All from the SI system added

* Final license and removed splitting exceptions

* Added Polish stop words to LEX_ATTRS

* Add encoding info to pl tokenizer exceptions
2019-02-08 14:27:21 +11:00

# coding: utf-8
from __future__ import unicode_literals

from io import StringIO, BytesIO
from pathlib import Path

import pytest

from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
# These languages are used for generic tokenizer tests; only add a language
# here if it's using spaCy's tokenizer (not a different library)
# TODO: re-implement generic tokenizer tests
_languages = ['bn', 'ca', 'da', 'de', 'el', 'en', 'es', 'fi', 'fr', 'ga', 'he',
              'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr',
              'ar', 'ur', 'tt', 'uk', 'xx']
_models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_sm'],
           'fr': ['fr_core_news_sm'],
           'xx': ['xx_ent_web_sm'],
           'en_core_web_md': ['en_core_web_md'],
           'es_core_news_md': ['es_core_news_md']}
# only used for tests that require loading the models
# in all other cases, use specific instances

@pytest.fixture(params=_models['en'])
def EN(request):
    return load_test_model(request.param)


@pytest.fixture(params=_models['de'])
def DE(request):
    return load_test_model(request.param)


@pytest.fixture(params=_models['fr'])
def FR(request):
    return load_test_model(request.param)
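
# A minimal sketch (not part of the original file) of how a test might use
# the parametrized EN fixture above: it runs once per model in _models['en']
# and is skipped unless the matching command-line flags are given (see
# pytest_runtest_setup below). The sample text and token count are
# illustrative assumptions.
# @pytest.mark.models('en')
# def test_en_model_tokenizes(EN):
#     doc = EN(u"This is a sentence.")
#     assert len(doc) == 5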
# importorskip() makes any test that requests these fixtures be skipped
# automatically when the optional dependency (pymorphy2 / MeCab) is missing

@pytest.fixture()
def RU(request):
    pytest.importorskip('pymorphy2')
    return util.get_lang_class('ru')()


@pytest.fixture()
def UK(request):
    pytest.importorskip('pymorphy2')
    return util.get_lang_class('uk')()


@pytest.fixture()
def JA(request):
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja')()
# @pytest.fixture(params=_languages)
# def tokenizer(request):
#     lang = util.get_lang_class(request.param)
#     return lang.Defaults.create_tokenizer()


@pytest.fixture
def tokenizer():
    return util.get_lang_class('xx').Defaults.create_tokenizer()
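
# Illustrative only (not in the original file): the generic ``tokenizer``
# fixture above returns the language-independent 'xx' tokenizer. The sample
# text and expected split are assumptions.
# def test_tokenizer_splits_trailing_period(tokenizer):
#     tokens = tokenizer(u"Hello world.")
#     assert [t.text for t in tokens] == [u"Hello", u"world", u"."]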
@pytest.fixture(scope='session')
def en_tokenizer():
    return util.get_lang_class('en').Defaults.create_tokenizer()


@pytest.fixture
def en_vocab():
    return util.get_lang_class('en').Defaults.create_vocab()


@pytest.fixture
def en_parser(en_vocab):
    nlp = util.get_lang_class('en')(en_vocab)
    return nlp.create_pipe('parser')


@pytest.fixture(scope='session')
def es_tokenizer():
    return util.get_lang_class('es').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def de_tokenizer():
    return util.get_lang_class('de').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def hu_tokenizer():
    return util.get_lang_class('hu').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def fi_tokenizer():
    return util.get_lang_class('fi').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ro_tokenizer():
    return util.get_lang_class('ro').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def id_tokenizer():
    return util.get_lang_class('id').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def sv_tokenizer():
    return util.get_lang_class('sv').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def bn_tokenizer():
    return util.get_lang_class('bn').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ga_tokenizer():
    return util.get_lang_class('ga').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def he_tokenizer():
    return util.get_lang_class('he').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def nb_tokenizer():
    return util.get_lang_class('nb').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def da_tokenizer():
    return util.get_lang_class('da').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ja_tokenizer():
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def th_tokenizer():
    pytest.importorskip("pythainlp")
    return util.get_lang_class('th').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def tr_tokenizer():
    return util.get_lang_class('tr').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def tt_tokenizer():
    return util.get_lang_class('tt').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def el_tokenizer():
    return util.get_lang_class('el').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ar_tokenizer():
    return util.get_lang_class('ar').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ur_tokenizer():
    return util.get_lang_class('ur').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ru_tokenizer():
    pytest.importorskip('pymorphy2')
    return util.get_lang_class('ru').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def uk_tokenizer():
    pytest.importorskip('pymorphy2')
    return util.get_lang_class('uk').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def ca_tokenizer():
    return util.get_lang_class('ca').Defaults.create_tokenizer()


@pytest.fixture(scope='session')
def pl_tokenizer():
    return util.get_lang_class('pl').Defaults.create_tokenizer()
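
# Illustrative only (not in the original file): a test consuming the
# session-scoped ``pl_tokenizer`` fixture above, in the spirit of the Polish
# tokenizer tests this commit adds. Sample text and token count are
# assumptions.
# def test_pl_tokenizer_splits_sentence(pl_tokenizer):
#     tokens = pl_tokenizer(u"To jest zdanie.")
#     assert len(tokens) == 4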
@pytest.fixture
def stringstore():
    return StringStore()


@pytest.fixture
def en_entityrecognizer():
    return util.get_lang_class('en').Defaults.create_entity()


@pytest.fixture
def text_file():
    return StringIO()


@pytest.fixture
def text_file_b():
    return BytesIO()
def pytest_addoption(parser):
    parser.addoption("--models", action="store_true",
                     help="include tests that require full models")
    parser.addoption("--vectors", action="store_true",
                     help="include word vectors tests")
    parser.addoption("--slow", action="store_true",
                     help="include slow tests")

    for lang in _languages + ['all']:
        parser.addoption("--%s" % lang, action="store_true",
                         help="Use %s models" % lang)
    for model in _models:
        if model not in _languages:
            parser.addoption("--%s" % model, action="store_true",
                             help="Use %s model" % model)
def pytest_runtest_setup(item):
    def getopt(opt):
        # When using 'pytest --pyargs spacy' to test an installed copy of
        # spacy, pytest skips running our pytest_addoption() hook. Later, when
        # we call getoption(), pytest raises an error, because it doesn't
        # recognize the option we're asking about. To avoid this, we need to
        # pass a default value. We default to False, i.e., we act like all the
        # options weren't given.
        return item.config.getoption("--%s" % opt, False)

    for opt in ['models', 'vectors', 'slow']:
        if opt in item.keywords and not getopt(opt):
            pytest.skip("need --%s option to run" % opt)

    # Check if test is marked with models and has arguments set, i.e. specific
    # language. If so, skip test if flag not set.
    if item.get_marker('models'):
        for arg in item.get_marker('models').args:
            if not getopt(arg) and not getopt("all"):
                pytest.skip("need --%s or --all option to run" % arg)