Mirror of https://github.com/explosion/spaCy.git, synced 2025-04-26 20:03:40 +03:00
Modernize and merge tokenizer tests for punctuation

This commit is contained in:
parent 8a74129cdf
commit 2c2e878653
@@ -6,13 +6,19 @@ from __future__ import unicode_literals
 
 import pytest
 
+from ... import util
+from ...language_data import TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
+
+
 
 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
 @pytest.mark.parametrize('text', ["(", "((", "<"])
-def test_tokenizer_only_punct(en_tokenizer, text):
+def test_tokenizer_handles_only_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == len(text)
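The helper added at module level reuses spaCy's own prefix machinery: util.compile_prefix_regex joins all prefix patterns into one compiled regex anchored at the start of the string, so its .search method returns the leading punctuation prefix of a raw text. A minimal standalone sketch of the same idea, assuming the spaCy version of this commit, where the test module's relative imports resolve to spacy.util and spacy.language_data:

    from spacy import util
    from spacy.language_data import TOKENIZER_PREFIXES

    # Combine every prefix pattern into a single compiled regex; .search
    # returns a match for the leading prefix, or None if the string does
    # not start with one.
    en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

    match = en_search_prefixes("(can't")
    print(match.group())  # "(" -- the opening bracket is a tokenizer prefix

This is exactly what the parametrized test in the next hunk asserts.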
@@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add):
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
     assert tokens[4].text == punct_close_add
+
+
+@pytest.mark.parametrize('text,punct', [("(can't", "(")])
+def test_tokenizer_splits_pre_punct_regex(text, punct):
+    match = en_search_prefixes(text)
+    assert match.group() == punct
+
+
+def test_tokenizer_splits_bracket_period(en_tokenizer):
+    text = "(And a 6a.m. run through Washington Park)."
+    tokens = en_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
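The one-off prefix-regex test from the old module (removed in the next hunk) is now expressed as a parametrized test, so further cases are data rather than new functions. A hypothetical extension of the case list, relying on the module-level en_search_prefixes shown above; the extra rows are illustrative only and assume that "[" and '"' also appear in TOKENIZER_PREFIXES:

    @pytest.mark.parametrize('text,punct', [
        ("(can't", "("),   # the case added by this commit
        ("[the", "["),     # assumed extra prefix case, not in the commit
        ('"Hello', '"'),   # assumed extra prefix case, not in the commit
    ])
    def test_tokenizer_splits_pre_punct_regex(text, punct):
        match = en_search_prefixes(text)
        assert match.group() == punct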
@@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 # loaded = pickle.load(file_)
 # assert loaded is not None
 
-
-def test_pre_punct_regex():
-    string = "(can't"
-    match = en_search_prefixes(string)
-    assert match.group() == "("
-
-
 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
     assert len(tokens) == 0
@@ -65,14 +58,6 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(en_tokenizer):
-    tokens = [w.text for w in en_tokenizer("(can't")]
-    assert tokens == ['(', 'ca', "n't"]
-    tokens = en_tokenizer("`ain't")
-    assert len(tokens) == 3
-    tokens = en_tokenizer('''"isn't''')
-    assert len(tokens) == 3
-    tokens = en_tokenizer("can't!")
-    assert len(tokens) == 3
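The deleted test_contraction_punct hard-coded several punctuation-plus-contraction strings in a single function. In the parametrized style used by the new punctuation tests, a merged equivalent might look like the sketch below; the function name and case list are hypothetical, since these hunks only show the deletion, not the replacement added elsewhere in the commit:

    # Assumes the standard en_tokenizer fixture from the test suite.
    @pytest.mark.parametrize('text,length', [("`ain't", 3),
                                             ('''"isn't''', 3),
                                             ("can't!", 3)])
    def test_tokenizer_splits_punct_before_contraction(en_tokenizer, text, length):
        tokens = en_tokenizer(text)
        assert len(tokens) == length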