Modernize and merge tokenizer tests for text from file

Ines Montani 2017-01-05 13:15:52 +01:00
parent 2c2e878653
commit 8b284fc6f1
3 changed files with 7 additions and 21 deletions


@@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
+from os import path
 
 import pytest
 import io
@@ -9,6 +10,7 @@ import tempfile
 
 from ... import util
 from ...language_data import TOKENIZER_PREFIXES
+from spacy.util import utf8open
 
 en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 
@@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
+    loc = path.join(path.dirname(__file__), file_name)
+    text = utf8open(loc).read()
+    assert len(text) != 0
     tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
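
The helper utf8open is imported above but not shown in this diff. In spaCy's util module of this period it was, as far as I recall, a thin wrapper around io.open that pins the encoding; a minimal sketch under that assumption:

import io

def utf8open(loc, mode='r'):
    # Assumed behavior: open `loc` as UTF-8 text so the tokenizer
    # always receives unicode, on Python 2 and 3 alike.
    return io.open(loc, mode, encoding='utf8')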


@@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, '..', 'sun.txt')
-    return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
-    assert len(sun_txt) != 0
-    tokens = en_tokenizer(sun_txt)
-    assert len(tokens) > 100
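
With the fixture-based file removed, the merged test can be selected on its own through pytest's -k filter. A typical invocation, assuming the modified file is the tokenizer test module implied by the diff (the exact path is not shown above):

python -m pytest spacy/tests/tokenizer/test_tokenizer.py -k test_tokenizer_handle_text_from_file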