Mirror of https://github.com/explosion/spaCy.git
Modernize and merge tokenizer tests for text from file

commit 8b284fc6f1
parent 2c2e878653
Changed file: the main tokenizer test module picks up the merged text-from-file test.

@@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
+from os import path
 
 import pytest
 import io
@@ -9,6 +10,7 @@ import tempfile
 
 from ... import util
 from ...language_data import TOKENIZER_PREFIXES
+from spacy.util import utf8open
 
 en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 
@@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
+    loc = path.join(path.dirname(__file__), file_name)
+    text = utf8open(loc).read()
+    assert len(text) != 0
     tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
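The merged test reads the file through utf8open, newly imported from spacy.util. For readers without the source at hand, an equivalent helper looks roughly like the sketch below; the exact signature in spacy.util is an assumption, not something shown in this diff.

import io


def utf8open(loc, mode='r'):
    # Open `loc` as UTF-8 text so that utf8open(loc).read() returns a
    # unicode string on both Python 2 and Python 3 (assumed equivalent
    # of the spacy.util helper the test imports).
    return io.open(loc, mode, encoding='utf8')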
Deleted file: the old standalone test for tokenizing the text of sun.txt, now covered by the parametrized test above.

@@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, '..', 'sun.txt')
-    return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
-    assert len(sun_txt) != 0
-    tokens = en_tokenizer(sun_txt)
-    assert len(tokens) > 100
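For the "modernize" part of the commit message: the deleted module wires its input in through a dedicated sun_txt fixture, while the merged test passes the file name via @pytest.mark.parametrize, so each parameter becomes its own collected test item and further files need no new fixtures. A minimal self-contained sketch of the two styles, with a hypothetical in-memory string standing in for sun.txt:

import pytest

TEXT = u"The sun is the star at the center of the Solar System."  # stand-in for sun.txt


# Old style, as in the deleted module: one dedicated fixture per input.
@pytest.fixture
def sun_txt():
    return TEXT


def test_tokenize_fixture_style(sun_txt):
    assert len(sun_txt) != 0


# New style, as in the merged test: the input is a parameter, so adding
# another text just means adding another entry to the list.
@pytest.mark.parametrize('text', [TEXT])
def test_tokenize_parametrized_style(text):
    assert len(text) != 0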