spaCy/spacy/tests/conftest.py

# coding: utf-8
from __future__ import unicode_literals

from ..en import English
from ..de import German
from ..es import Spanish
from ..it import Italian
from ..ja import Japanese
from ..fr import French
from ..pt import Portuguese
from ..nl import Dutch
from ..sv import Swedish
from ..hu import Hungarian
from ..fi import Finnish
from ..bn import Bengali
from ..he import Hebrew
from ..nb import Norwegian


from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP

from io import StringIO, BytesIO
from pathlib import Path
import os
import pytest

# Language classes fed to the parametrized ``tokenizer`` fixture, i.e. the
# ones that get run through the generic tokenizer tests.
# Japanese and Hebrew are imported above but are not part of this list.
LANGUAGES = [
    English,
    German,
    Spanish,
    Italian,
    French,
    Portuguese,
    Dutch,
    Swedish,
    Hungarian,
    Finnish,
    Bengali,
    Norwegian,
]


@pytest.fixture(params=LANGUAGES)
def tokenizer(request):
    """Tokenizer fixture parametrized over every class in LANGUAGES.

    A test requesting this fixture is run once per entry; each run receives
    the result of that language's ``Defaults.create_tokenizer()``.
    """
    language_cls = request.param
    defaults = language_cls.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def en_tokenizer():
    """Return the tokenizer built by ``English.Defaults.create_tokenizer()``."""
    defaults = English.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def en_vocab():
    """Return the vocab built by ``English.Defaults.create_vocab()``."""
    vocab = English.Defaults.create_vocab()
    return vocab


@pytest.fixture
def en_parser():
    """Return the parser built by ``English.Defaults.create_parser()``."""
    parser = English.Defaults.create_parser()
    return parser

@pytest.fixture
def es_tokenizer():
    """Return the tokenizer built by ``Spanish.Defaults.create_tokenizer()``."""
    defaults = Spanish.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def de_tokenizer():
    """Return the tokenizer built by ``German.Defaults.create_tokenizer()``."""
    defaults = German.Defaults
    return defaults.create_tokenizer()


@pytest.fixture(scope='module')
def fr_tokenizer():
    """Return the tokenizer built by ``French.Defaults.create_tokenizer()``.

    Declared with module scope, so one tokenizer is shared by all tests in
    the same test module.
    """
    defaults = French.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def hu_tokenizer():
    """Return the tokenizer built by ``Hungarian.Defaults.create_tokenizer()``."""
    defaults = Hungarian.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def fi_tokenizer():
    """Return the tokenizer built by ``Finnish.Defaults.create_tokenizer()``."""
    defaults = Finnish.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def ja_tokenizer():
    """Return the tokenizer built by ``Japanese.Defaults.create_tokenizer()``.

    Tests using this fixture are skipped when the ``MeCab`` module cannot be
    imported.
    """
    # Must run before the tokenizer is created: importorskip skips the
    # requesting test if the import fails.
    pytest.importorskip("MeCab")
    defaults = Japanese.Defaults
    return defaults.create_tokenizer()

@pytest.fixture
def japanese():
    """Return a new ``Japanese()`` language instance."""
    # NOTE(review): unlike ja_tokenizer, nothing here calls
    # pytest.importorskip("MeCab") -- confirm whether Japanese() needs it.
    nlp = Japanese()
    return nlp

@pytest.fixture
def sv_tokenizer():
    """Return the tokenizer built by ``Swedish.Defaults.create_tokenizer()``."""
    defaults = Swedish.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def bn_tokenizer():
    """Return the tokenizer built by ``Bengali.Defaults.create_tokenizer()``."""
    defaults = Bengali.Defaults
    return defaults.create_tokenizer()


@pytest.fixture
def he_tokenizer():
    """Return the tokenizer built by ``Hebrew.Defaults.create_tokenizer()``."""
    defaults = Hebrew.Defaults
    return defaults.create_tokenizer()

@pytest.fixture
def nb_tokenizer():
    """Return the tokenizer built by ``Norwegian.Defaults.create_tokenizer()``."""
    defaults = Norwegian.Defaults
    return defaults.create_tokenizer()

@pytest.fixture
def stringstore():
    """Return a ``StringStore`` constructed with no arguments."""
    store = StringStore()
    return store


@pytest.fixture
def en_entityrecognizer():
    """Return the object built by ``English.Defaults.create_entity()``."""
    recognizer = English.Defaults.create_entity()
    return recognizer


@pytest.fixture
def lemmatizer():
    """Return the lemmatizer built by ``English.Defaults.create_lemmatizer()``."""
    english_lemmatizer = English.Defaults.create_lemmatizer()
    return english_lemmatizer


@pytest.fixture
def text_file():
    """Return an empty in-memory text buffer (``io.StringIO``)."""
    buffer = StringIO()
    return buffer

@pytest.fixture
def text_file_b():
    """Return an empty in-memory bytes buffer (``io.BytesIO``)."""
    buffer = BytesIO()
    return buffer


# The session-scoped EN and DE fixtures are only meant for tests that require
# loading the models. In all other cases, use the more specific fixtures.
@pytest.fixture(scope="session")
def EN():
    """Return an ``English()`` instance shared by the whole test session."""
    nlp = English()
    return nlp


@pytest.fixture(scope="session")
def DE():
    """Return a ``German()`` instance shared by the whole test session."""
    nlp = German()
    return nlp


def pytest_addoption(parser):
    """Register the opt-in command-line flags of the test suite.

    Adds three ``store_true`` options, in this order: ``--models``,
    ``--vectors`` and ``--slow``. ``pytest_runtest_setup`` consults them to
    decide whether tests carrying the matching keyword are run.

    parser: object passed in by pytest; only ``parser.addoption`` is used.
    """
    opt_in_flags = (
        ("--models", "include tests that require full models"),
        ("--vectors", "include word vectors tests"),
        ("--slow", "include slow tests"),
    )
    for flag, description in opt_in_flags:
        parser.addoption(flag, action="store_true", help=description)


def pytest_runtest_setup(item):
    """Skip tests that need an opt-in flag which was not given.

    For each of ``models``, ``vectors`` and ``slow`` (checked in that order):
    if that name is among ``item.keywords`` and the corresponding ``--<name>``
    option is falsy, the test is skipped via ``pytest.skip``.

    item: the test item about to be set up; ``item.keywords`` and
        ``item.config.getoption`` are the only attributes used.
    """
    for name in ('models', 'vectors', 'slow'):
        if name not in item.keywords:
            # Test does not carry this keyword; nothing to gate on.
            continue
        if item.config.getoption("--%s" % name):
            # The flag was supplied, so the test may run.
            continue
        pytest.skip("need --%s option to run" % name)
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`
* Add conftest.py to tests/, to allow session-global pipeline. This allows much faster tests. 2015-06-07 18:53:14 +03:00
Test with the non-loaded versions of the English and German pipelines. 2016-10-12 20:13:31 +03:00			`from ..en import English`
			`from ..de import German`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`from ..es import Spanish`
			`from ..it import Italian`
Add basic Japanese tokenizer test 2017-06-27 19:24:25 +03:00			`from ..ja import Japanese`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`from ..fr import French`
			`from ..pt import Portuguese`
			`from ..nl import Dutch`
			`from ..sv import Swedish`
			`from ..hu import Hungarian`
[finnish] Add initial tests for tokenizer 2017-02-04 14:47:29 +03:00			`from ..fi import Finnish`
add tests for Bengali 2017-03-05 04:11:26 +03:00			`from ..bn import Bengali`
add hebrew tokenizer 2017-03-24 18:27:44 +03:00			`from ..he import Hebrew`
Hooked up tokenizer tests 2017-04-27 00:21:41 +03:00			`from ..nb import Norwegian`

Merge branch 'master' into master 2017-03-25 12:03:47 +03:00
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`from ..tokens import Doc`
Add fixture for StringStore 2017-01-12 17:05:40 +03:00			`from ..strings import StringStore`
Add Lemmatizer fixture 2017-01-13 01:38:55 +03:00			`from ..lemmatizer import Lemmatizer`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`from ..attrs import ORTH, TAG, HEAD, DEP`

Add text_file_b fixture using BytesIO 2017-01-13 04:23:50 +03:00			`from io import StringIO, BytesIO`
Add path fixture for spaCy data path 2017-01-13 01:38:47 +03:00			`from pathlib import Path`
			`import os`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`import pytest`

Add comment clarifying what LANGUAGES does 2017-07-09 10:28:55 +03:00			`# These languages get run through generic tokenizer tests`
Remove Japanese from LANGUAGES LANGUAGES is a list of languages whose tokenizers get run through a variety of generic tests. Since the generic tests don't check the JA fixture, it blows up when it can't find janome. -POLM 2017-07-09 10:23:26 +03:00			`LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,`
Hooked up tokenizer tests 2017-04-27 00:21:41 +03:00			`Swedish, Hungarian, Finnish, Bengali, Norwegian]`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00

			`@pytest.fixture(params=LANGUAGES)`
			`def tokenizer(request):`
			`lang = request.param`
			`return lang.Defaults.create_tokenizer()`


			`@pytest.fixture`
			`def en_tokenizer():`
			`return English.Defaults.create_tokenizer()`
* Add conftest.py to tests/, to allow session-global pipeline. This allows much faster tests. 2015-06-07 18:53:14 +03:00
Add .blank() method to Parser. Start housing default dep labels and entity types within the Defaults class. 2016-09-26 12:57:54 +03:00
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`@pytest.fixture`
			`def en_vocab():`
			`return English.Defaults.create_vocab()`


Add en_parser fixture 2017-01-11 23:29:59 +03:00			`@pytest.fixture`
			`def en_parser():`
			`return English.Defaults.create_parser()`

feature(model): Add support for creating the Spanish model, including rich tagset, configuration, and basich tests 2017-04-06 19:48:45 +03:00			`@pytest.fixture`
			`def es_tokenizer():`
			`return Spanish.Defaults.create_tokenizer()`

Add en_parser fixture 2017-01-11 23:29:59 +03:00
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`@pytest.fixture`
			`def de_tokenizer():`
			`return German.Defaults.create_tokenizer()`


Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions"" This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a. 2017-02-10 15:17:05 +03:00			`@pytest.fixture(scope='module')`
Add fr tokenization unit tests 2017-01-24 12:55:02 +03:00			`def fr_tokenizer():`
			`return French.Defaults.create_tokenizer()`


Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`@pytest.fixture`
			`def hu_tokenizer():`
			`return Hungarian.Defaults.create_tokenizer()`

Adjust formatting 2017-01-12 18:49:19 +03:00
[finnish] Add initial tests for tokenizer 2017-02-04 14:47:29 +03:00			`@pytest.fixture`
			`def fi_tokenizer():`
			`return Finnish.Defaults.create_tokenizer()`


Add basic Japanese tokenizer test 2017-06-27 19:24:25 +03:00			`@pytest.fixture`
			`def ja_tokenizer():`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`pytest.importorskip("MeCab")`
Add basic Japanese tokenizer test 2017-06-27 19:24:25 +03:00			`return Japanese.Defaults.create_tokenizer()`

Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`@pytest.fixture`
			`def japanese():`
			`return Japanese()`
Add basic Japanese tokenizer test 2017-06-27 19:24:25 +03:00
[issue 805] Add regression test and the required fixture 2017-02-04 17:21:34 +03:00			`@pytest.fixture`
			`def sv_tokenizer():`
			`return Swedish.Defaults.create_tokenizer()`


add hebrew tokenizer 2017-03-24 18:27:44 +03:00			`@pytest.fixture`
add tests for Bengali 2017-03-05 04:11:26 +03:00			`def bn_tokenizer():`
			`return Bengali.Defaults.create_tokenizer()`

feature(model): Add support for creating the Spanish model, including rich tagset, configuration, and basich tests 2017-04-06 19:48:45 +03:00
			`@pytest.fixture`
add hebrew tokenizer 2017-03-24 18:27:44 +03:00			`def he_tokenizer():`
			`return Hebrew.Defaults.create_tokenizer()`

Hooked up tokenizer tests 2017-04-27 00:21:41 +03:00			`@pytest.fixture`
			`def nb_tokenizer():`
			`return Norwegian.Defaults.create_tokenizer()`
add hebrew tokenizer 2017-03-24 18:27:44 +03:00
Add fixture for StringStore 2017-01-12 17:05:40 +03:00			`@pytest.fixture`
			`def stringstore():`
			`return StringStore()`
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00
Adjust formatting 2017-01-12 18:49:19 +03:00
Add fixture for entity recognizer 2017-01-12 23:56:32 +03:00			`@pytest.fixture`
			`def en_entityrecognizer():`
			`return English.Defaults.create_entity()`


Add Lemmatizer fixture 2017-01-13 01:38:55 +03:00			`@pytest.fixture`
Fix tests 2017-03-17 03:48:00 +03:00			`def lemmatizer():`
Fix typo in tests 2017-03-17 04:50:36 +03:00			`return English.Defaults.create_lemmatizer()`
Add Lemmatizer fixture 2017-01-13 01:38:55 +03:00

Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00			`@pytest.fixture`
			`def text_file():`
			`return StringIO()`

Add text_file_b fixture using BytesIO 2017-01-13 04:23:50 +03:00			`@pytest.fixture`
			`def text_file_b():`
			`return BytesIO()`

Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00
Update comments on EN and DE fixtures 2017-01-13 00:03:07 +03:00			`# only used for tests that require loading the models`
			`# in all other cases, use specific instances`
* Add conftest.py to tests/, to allow session-global pipeline. This allows much faster tests. 2015-06-07 18:53:14 +03:00			`@pytest.fixture(scope="session")`
			`def EN():`
Set default path in EN/DE tests. 2016-10-17 02:52:49 +03:00			`return English()`
add model sanity test 2016-05-03 13:51:47 +03:00
Merge conftests into one cohesive file 2017-01-11 15:56:32 +03:00
reformulate noun chunk tests for English 2016-05-03 15:24:35 +03:00			`@pytest.fixture(scope="session")`
add model sanity test 2016-05-03 13:51:47 +03:00			`def DE():`
Set default path in EN/DE tests. 2016-10-17 02:52:49 +03:00			`return German()`
* Add flags to pytest to tests requiring models, vectors or slow functions to be toggled. 2015-07-23 02:19:03 +03:00

			`def pytest_addoption(parser):`
			`parser.addoption("--models", action="store_true",`
			`help="include tests that require full models")`
			`parser.addoption("--vectors", action="store_true",`
			`help="include word vectors tests")`
			`parser.addoption("--slow", action="store_true",`
			`help="include slow tests")`


			`def pytest_runtest_setup(item):`
			`for opt in ['models', 'vectors', 'slow']:`
			`if opt in item.keywords and not item.config.getoption("--%s" % opt):`
			`pytest.skip("need --%s option to run" % opt)`