mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Modernize and merge tokenizer tests for punctuation
This commit is contained in:
parent
8a74129cdf
commit
2c2e878653
|
@ -6,13 +6,19 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
|
||||
from ... import util
|
||||
from ...language_data import TOKENIZER_PREFIXES
|
||||
|
||||
# Bound ``search`` method of the object that util.compile_prefix_regex()
# returns for TOKENIZER_PREFIXES, so tests can probe prefix matching directly.
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
|
||||
|
||||
# Opening and closing punctuation characters; entries that share an index
# belong together, and PUNCT_PAIRED holds them as (open, close) tuples.
PUNCT_OPEN = list('([{*')
PUNCT_CLOSE = list(')]}*')
PUNCT_PAIRED = list(zip(PUNCT_OPEN, PUNCT_CLOSE))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
|
||||
def test_tokenizer_only_punct(en_tokenizer, text):
|
||||
def test_tokenizer_handles_only_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == len(text)
|
||||
|
||||
|
@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, pu
|
|||
assert tokens[2].text == text
|
||||
assert tokens[3].text == punct_close
|
||||
assert tokens[4].text == punct_close_add
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
    """Check that the prefix search on ``text`` matches exactly ``punct``."""
    prefix_match = en_search_prefixes(text)
    assert prefix_match.group() == punct
|
||||
|
||||
|
||||
def test_tokenizer_splits_bracket_period(en_tokenizer):
    """Check that a period following a closing bracket is the final token."""
    text = "(And a 6a.m. run through Washington Park)."
    tokens = en_tokenizer(text)
    # Negative indexing reads the last token without computing the length.
    assert tokens[-1].text == "."
|
||||
|
|
|
@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
|
|||
# loaded = pickle.load(file_)
|
||||
# assert loaded is not None
|
||||
|
||||
def test_pre_punct_regex():
    """Check that the prefix search on "(can't" matches the opening bracket."""
    prefix_match = en_search_prefixes("(can't")
    assert prefix_match.group() == "("
|
||||
|
||||
def test_no_word(en_tokenizer):
    """Check that tokenizing the empty string produces zero tokens."""
    empty_result = en_tokenizer(u'')
    assert len(empty_result) == 0
|
||||
|
||||
|
||||
|
@ -65,14 +58,6 @@ def test_contraction(en_tokenizer):
|
|||
assert len(tokens) == 5
|
||||
assert tokens[4].orth == en_tokenizer.vocab['!'].orth
|
||||
|
||||
def test_contraction_punct(en_tokenizer):
    """Check contractions that sit next to punctuation.

    "(can't" must split into the bracket and the two contraction pieces;
    the remaining inputs are only checked for their token count.
    """
    words = [token.text for token in en_tokenizer("(can't")]
    assert words == ['(', 'ca', "n't"]
    # Each of these inputs is expected to yield exactly three tokens.
    for text in ("`ain't", '''"isn't''', "can't!"):
        assert len(en_tokenizer(text)) == 3
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user