From 8b284fc6f192a0c832e1edaf4e0d860f0d4706cd Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 5 Jan 2017 13:15:52 +0100
Subject: [PATCH] Modernize and merge tokenizer tests for text from file

---
 spacy/tests/{ => tokenizer}/sun.txt     |  0
 spacy/tests/tokenizer/test_tokenizer.py |  7 +++++++
 spacy/tests/tokenizer/test_wiki_sun.py  | 21 ---------------------
 3 files changed, 7 insertions(+), 21 deletions(-)
 rename spacy/tests/{ => tokenizer}/sun.txt (100%)
 delete mode 100644 spacy/tests/tokenizer/test_wiki_sun.py

diff --git a/spacy/tests/sun.txt b/spacy/tests/tokenizer/sun.txt
similarity index 100%
rename from spacy/tests/sun.txt
rename to spacy/tests/tokenizer/sun.txt
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index fab7d49d8..92e610fe0 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
+from os import path
 
 import pytest
 import io
@@ -9,6 +10,7 @@ import tempfile
 
 from ... import util
 from ...language_data import TOKENIZER_PREFIXES
+from spacy.util import utf8open
 
 en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 
@@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
     tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
+    loc = path.join(path.dirname(__file__), file_name)
+    text = utf8open(loc).read()
+    assert len(text) != 0
diff --git a/spacy/tests/tokenizer/test_wiki_sun.py b/spacy/tests/tokenizer/test_wiki_sun.py
deleted file mode 100644
index 8d2a6682e..000000000
--- a/spacy/tests/tokenizer/test_wiki_sun.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, '..', 'sun.txt')
-    return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
-    assert len(sun_txt) != 0
-    tokens = en_tokenizer(sun_txt)
-    assert len(tokens) > 100