Merge tokenizer tests

Ines Montani 2017-01-13 01:34:14 +01:00
parent 01f36ca3ff
commit 138c53ff2e
2 changed files with 27 additions and 53 deletions


@@ -1,11 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from os import path
-import pytest
-
+from ...vocab import Vocab
+from ...tokenizer import Tokenizer
 from ...util import utf8open
+
+from os import path
+import pytest
 
 
 def test_tokenizer_handles_no_word(tokenizer):
     tokens = tokenizer("")
@@ -81,3 +83,25 @@ def test_tokenizer_suspected_freeing_strings(tokenizer):
     tokens2 = tokenizer(text2)
     assert tokens1[0].text == "Lorem"
     assert tokens2[0].text == "Lorem"
+
+
+@pytest.mark.parametrize('text,tokens', [
+    ("lorem", [{'orth': 'lo'}, {'orth': 'rem'}])])
+def test_tokenizer_add_special_case(tokenizer, text, tokens):
+    tokenizer.add_special_case(text, tokens)
+    doc = tokenizer(text)
+    assert doc[0].text == tokens[0]['orth']
+    assert doc[1].text == tokens[1]['orth']
+
+
+@pytest.mark.parametrize('text,tokens', [
+    ("lorem", [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])])
+def test_tokenizer_add_special_case_tag(text, tokens):
+    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+    tokenizer = Tokenizer(vocab, {}, None, None, None)
+    tokenizer.add_special_case(text, tokens)
+    doc = tokenizer(text)
+    assert doc[0].text == tokens[0]['orth']
+    assert doc[0].tag_ == tokens[0]['tag']
+    assert doc[0].pos_ == 'NOUN'
+    assert doc[1].text == tokens[1]['orth']


@@ -1,50 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...vocab import Vocab
-from ...tokenizer import Tokenizer
-
-import re
-import pytest
-
-
-@pytest.fixture
-def vocab():
-    return Vocab(tag_map={'NN': {'pos': 'NOUN'}})
-
-@pytest.fixture
-def rules():
-    return {}
-
-@pytest.fixture
-def prefix_search():
-    return None
-
-@pytest.fixture
-def suffix_search():
-    return None
-
-@pytest.fixture
-def infix_finditer():
-    return None
-
-
-@pytest.fixture
-def tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer):
-    return Tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer)
-
-
-def test_add_special_case(tokenizer):
-    tokenizer.add_special_case('dog', [{'orth': 'd'}, {'orth': 'og'}])
-    doc = tokenizer('dog')
-    assert doc[0].text == 'd'
-    assert doc[1].text == 'og'
-
-
-def test_special_case_tag(tokenizer):
-    tokenizer.add_special_case('dog', [{'orth': 'd', 'tag': 'NN'}, {'orth': 'og'}])
-    doc = tokenizer('dog')
-    assert doc[0].text == 'd'
-    assert doc[0].tag_ == 'NN'
-    assert doc[0].pos_ == 'NOUN'
-    assert doc[1].text == 'og'