Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-03 02:48:04 +03:00
Modernize and merge tokenizer tests for text from file
commit 8b284fc6f1
parent 2c2e878653
@@ -1,5 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
from os import path

import pytest
import io
@@ -9,6 +10,7 @@ import tempfile

from ... import util
from ...language_data import TOKENIZER_PREFIXES
from spacy.util import utf8open

en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
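Both the merged test below and the deleted module at the end of this diff read sun.txt through spacy.util.utf8open. Its definition is not part of this diff; a minimal sketch of an equivalent helper, assuming it is just a thin wrapper around io.open with UTF-8 encoding, would be:

import io


def utf8open(loc, mode='r'):
    # Open a file as text, decoded as UTF-8, the same way on Python 2 and 3.
    return io.open(loc, mode, encoding='utf8')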
@@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

def test_cnts1(en_tokenizer):
    text = u"""The U.S. Army likes Shock and Awe."""
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
    loc = path.join(path.dirname(__file__), file_name)
    text = utf8open(loc).read()
    assert len(text) != 0
    tokens = en_tokenizer(text)
    assert len(tokens) == 8
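The @pytest.mark.parametrize decorator above turns every entry in the file_name list into a separate test case, so more text files could be covered just by extending the list. A self-contained sketch of the same pattern, using pytest's tmpdir fixture and a throwaway whitespace tokenizer instead of the repository's en_tokenizer fixture and sun.txt (the file names and sample sentence here are made up):

# Self-contained sketch of the parametrized file-reading test pattern.
import io

import pytest


@pytest.fixture
def whitespace_tokenizer():
    # Stand-in for the repository's en_tokenizer fixture: split on whitespace.
    return lambda text: text.split()


@pytest.mark.parametrize('file_name', ["sun_copy.txt", "moon.txt"])
def test_tokenizer_handles_text_from_file(tmpdir, whitespace_tokenizer, file_name):
    # Write a small UTF-8 file, read it back, and tokenize it.
    loc = str(tmpdir.join(file_name))
    with io.open(loc, 'w', encoding='utf8') as file_:
        file_.write(u"The sun is a mass of incandescent gas .")
    text = io.open(loc, encoding='utf8').read()
    assert len(text) != 0
    tokens = whitespace_tokenizer(text)
    assert len(tokens) > 0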
@@ -1,21 +0,0 @@
from __future__ import unicode_literals

from spacy.util import utf8open

import pytest
from os import path


HERE = path.dirname(__file__)


@pytest.fixture
def sun_txt():
    loc = path.join(HERE, '..', 'sun.txt')
    return utf8open(loc).read()


def test_tokenize(sun_txt, en_tokenizer):
    assert len(sun_txt) != 0
    tokens = en_tokenizer(sun_txt)
    assert len(tokens) > 100
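One detail worth noting about the merge: the deleted module resolved sun.txt one directory above itself (path.join(HERE, '..', 'sun.txt')), while the merged test looks for file_name next to the test module, so the fixture file is presumably expected to sit alongside the tokenizer tests after this change. A small sketch of the two lookups (the surrounding directory layout is an assumption):

from os import path

HERE = path.dirname(__file__)

# Deleted test: sun.txt expected in the parent directory of the test module.
old_loc = path.normpath(path.join(HERE, '..', 'sun.txt'))

# Merged test: sun.txt expected alongside the test module itself.
new_loc = path.normpath(path.join(HERE, 'sun.txt'))

print(old_loc)
print(new_loc)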