Modernize and merge tokenizer tests for punctuation

Ines Montani 2017-01-05 13:14:16 +01:00
parent 8a74129cdf
commit 2c2e878653
2 changed files with 19 additions and 16 deletions

View File

@@ -6,13 +6,19 @@ from __future__ import unicode_literals
 import pytest
+from ... import util
+from ...language_data import TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]

 @pytest.mark.parametrize('text', ["(", "((", "<"])
-def test_tokenizer_only_punct(en_tokenizer, text):
+def test_tokenizer_handles_only_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == len(text)
@@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add):
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
     assert tokens[4].text == punct_close_add
+
+
+@pytest.mark.parametrize('text,punct', [("(can't", "(")])
+def test_tokenizer_splits_pre_punct_regex(text, punct):
+    match = en_search_prefixes(text)
+    assert match.group() == punct
+
+
+def test_tokenizer_splits_bracket_period(en_tokenizer):
+    text = "(And a 6a.m. run through Washington Park)."
+    tokens = en_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
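
For reference, en_search_prefixes is the bound .search method of the regex that util.compile_prefix_regex builds from the prefix list, so the new test_tokenizer_splits_pre_punct_regex asserts that this pattern matches the leading bracket of "(can't". A minimal standalone sketch of that behaviour, using plain re and a hypothetical subset of prefixes rather than spaCy's full TOKENIZER_PREFIXES (not the library's actual implementation):

import re

# Hypothetical subset of prefix punctuation; the real TOKENIZER_PREFIXES list is much longer.
PREFIXES = ['(', '[', '{', '"', "'", '`']

# OR together the escaped prefixes, each anchored at the start of the string,
# roughly what util.compile_prefix_regex(TOKENIZER_PREFIXES).search provides.
search_prefixes = re.compile('|'.join('^' + re.escape(p) for p in PREFIXES)).search

match = search_prefixes("(can't")
assert match is not None
assert match.group() == '('  # only the leading bracket matches, not the contraction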

View File

@@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 # loaded = pickle.load(file_)
 # assert loaded is not None
-
-
-def test_pre_punct_regex():
-    string = "(can't"
-    match = en_search_prefixes(string)
-    assert match.group() == "("
-

 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
     assert len(tokens) == 0
@@ -65,14 +58,6 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth
-
-
-def test_contraction_punct(en_tokenizer):
-    tokens = [w.text for w in en_tokenizer("(can't")]
-    assert tokens == ['(', 'ca', "n't"]
-    tokens = en_tokenizer("`ain't")
-    assert len(tokens) == 3
-    tokens = en_tokenizer('''"isn't''')
-    assert len(tokens) == 3
-    tokens = en_tokenizer("can't!")
-    assert len(tokens) == 3
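
Per the commit title, these deleted checks are merged into the parametrized suite in the first file. A rough sketch of the same assertions in that parametrized style (the test names here are hypothetical and the actual merged tests are not shown in this diff):

import pytest

# Hypothetical parametrized versions of the removed checks; en_tokenizer is the
# same pytest fixture used throughout these tests, and the names are illustrative only.
@pytest.mark.parametrize('text,expected', [("(can't", ['(', 'ca', "n't"])])
def test_tokenizer_splits_pre_punct_contraction(en_tokenizer, text, expected):
    tokens = en_tokenizer(text)
    assert [t.text for t in tokens] == expected


@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_punct_contraction(en_tokenizer, text):
    # The original assertions only checked the token count.
    tokens = en_tokenizer(text)
    assert len(tokens) == 3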