From 138c53ff2e59776ef4e040ff14eaf694ab456aa3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 13 Jan 2017 01:34:14 +0100
Subject: [PATCH] Merge tokenizer tests

---
 spacy/tests/tokenizer/test_tokenizer.py | 30 +++++++++++++--
 spacy/tests/unit/test_tokenizer.py      | 50 -------------------------
 2 files changed, 27 insertions(+), 53 deletions(-)
 delete mode 100644 spacy/tests/unit/test_tokenizer.py

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index cd0043a10..a82284b34 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,11 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from os import path
-
-import pytest
+from ...vocab import Vocab
+from ...tokenizer import Tokenizer
 from ...util import utf8open
 
+from os import path
+import pytest
+
 
 def test_tokenizer_handles_no_word(tokenizer):
     tokens = tokenizer("")
@@ -81,3 +83,25 @@ def test_tokenizer_suspected_freeing_strings(tokenizer):
     tokens2 = tokenizer(text2)
     assert tokens1[0].text == "Lorem"
     assert tokens2[0].text == "Lorem"
+
+
+@pytest.mark.parametrize('text,tokens', [
+    ("lorem", [{'orth': 'lo'}, {'orth': 'rem'}])])
+def test_tokenizer_add_special_case(tokenizer, text, tokens):
+    tokenizer.add_special_case(text, tokens)
+    doc = tokenizer(text)
+    assert doc[0].text == tokens[0]['orth']
+    assert doc[1].text == tokens[1]['orth']
+
+
+@pytest.mark.parametrize('text,tokens', [
+    ("lorem", [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])])
+def test_tokenizer_add_special_case_tag(text, tokens):
+    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+    tokenizer = Tokenizer(vocab, {}, None, None, None)
+    tokenizer.add_special_case(text, tokens)
+    doc = tokenizer(text)
+    assert doc[0].text == tokens[0]['orth']
+    assert doc[0].tag_ == tokens[0]['tag']
+    assert doc[0].pos_ == 'NOUN'
+    assert doc[1].text == tokens[1]['orth']
diff --git a/spacy/tests/unit/test_tokenizer.py b/spacy/tests/unit/test_tokenizer.py
deleted file mode 100644
index f062f94e3..000000000
--- a/spacy/tests/unit/test_tokenizer.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...vocab import Vocab
-from ...tokenizer import Tokenizer
-
-import re
-import pytest
-
-
-@pytest.fixture
-def vocab():
-    return Vocab(tag_map={'NN': {'pos': 'NOUN'}})
-
-@pytest.fixture
-def rules():
-    return {}
-
-@pytest.fixture
-def prefix_search():
-    return None
-
-@pytest.fixture
-def suffix_search():
-    return None
-
-@pytest.fixture
-def infix_finditer():
-    return None
-
-
-@pytest.fixture
-def tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer):
-    return Tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer)
-
-
-def test_add_special_case(tokenizer):
-    tokenizer.add_special_case('dog', [{'orth': 'd'}, {'orth': 'og'}])
-    doc = tokenizer('dog')
-    assert doc[0].text == 'd'
-    assert doc[1].text == 'og'
-
-
-def test_special_case_tag(tokenizer):
-    tokenizer.add_special_case('dog', [{'orth': 'd', 'tag': 'NN'}, {'orth': 'og'}])
-    doc = tokenizer('dog')
-    assert doc[0].text == 'd'
-    assert doc[0].tag_ == 'NN'
-    assert doc[0].pos_ == 'NOUN'
-    assert doc[1].text == 'og'
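
Not part of the patch: a minimal standalone sketch of the Tokenizer.add_special_case API that the merged tests exercise, assuming a spaCy checkout from around this commit; the 'lorem' example text simply mirrors the parametrized test case, and the absolute import paths (spacy.vocab, spacy.tokenizer) correspond to the relative imports used in the tests.

    # Sketch: build a bare Tokenizer with no affix rules and register a
    # special-case rule, mirroring test_tokenizer_add_special_case_tag above.
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
    tokenizer = Tokenizer(vocab, {}, None, None, None)

    # "lorem" is now always split into "lo" + "rem", with "lo" tagged NN (pos NOUN).
    tokenizer.add_special_case('lorem', [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])
    doc = tokenizer('lorem')
    assert [t.text for t in doc] == ['lo', 'rem']
    assert doc[0].tag_ == 'NN' and doc[0].pos_ == 'NOUN'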