Mirror of https://github.com/explosion/spaCy.git
Modernize and merge tokenizer tests for text from file

commit 8b284fc6f1
parent 2c2e878653
Changed file: the main tokenizer test module picks up the merged text-from-file test.

@@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
+from os import path
 
 import pytest
 import io
@@ -9,6 +10,7 @@ import tempfile
 
 from ... import util
 from ...language_data import TOKENIZER_PREFIXES
+from spacy.util import utf8open
 
 en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 
@@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
+    loc = path.join(path.dirname(__file__), file_name)
+    text = utf8open(loc).read()
+    assert len(text) != 0
     tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
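The merged test reads the file through utf8open, newly imported from spacy.util. For readers without the source at hand, an equivalent helper looks roughly like the sketch below; the exact signature in spacy.util is an assumption, not something shown in this diff.

import io


def utf8open(loc, mode='r'):
    # Open `loc` as UTF-8 text so that utf8open(loc).read() returns a
    # unicode string on both Python 2 and Python 3 (assumed equivalent
    # of the spacy.util helper the test imports).
    return io.open(loc, mode, encoding='utf8')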
Deleted file: the old standalone test for tokenizing the text of sun.txt, now covered by the parametrized test above.

@@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, '..', 'sun.txt')
-    return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
-    assert len(sun_txt) != 0
-    tokens = en_tokenizer(sun_txt)
-    assert len(tokens) > 100
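For the "modernize" part of the commit message: the deleted module wires its input in through a dedicated sun_txt fixture, while the merged test passes the file name via @pytest.mark.parametrize, so each parameter becomes its own collected test item and further files need no new fixtures. A minimal self-contained sketch of the two styles, with a hypothetical in-memory string standing in for sun.txt:

import pytest

TEXT = u"The sun is the star at the center of the Solar System."  # stand-in for sun.txt


# Old style, as in the deleted module: one dedicated fixture per input.
@pytest.fixture
def sun_txt():
    return TEXT


def test_tokenize_fixture_style(sun_txt):
    assert len(sun_txt) != 0


# New style, as in the merged test: the input is a parameter, so adding
# another text just means adding another entry to the list.
@pytest.mark.parametrize('text', [TEXT])
def test_tokenize_parametrized_style(text):
    assert len(text) != 0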