Mirror of https://github.com/explosion/spaCy.git, synced 2025-04-26 20:03:40 +03:00
Modernize and merge tokenizer tests for punctuation

This commit is contained in:
parent 8a74129cdf
commit 2c2e878653
@@ -6,13 +6,19 @@ from __future__ import unicode_literals
 
 import pytest
 
+from ... import util
+from ...language_data import TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
+
+
 
 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
 @pytest.mark.parametrize('text', ["(", "((", "<"])
-def test_tokenizer_only_punct(en_tokenizer, text):
+def test_tokenizer_handles_only_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == len(text)
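The helper added at module level reuses spaCy's own prefix machinery: util.compile_prefix_regex joins all prefix patterns into one compiled regex anchored at the start of the string, so its .search method returns the leading punctuation prefix of a raw text. A minimal standalone sketch of the same idea, assuming the spaCy version of this commit, where the test module's relative imports resolve to spacy.util and spacy.language_data:

    from spacy import util
    from spacy.language_data import TOKENIZER_PREFIXES

    # Combine every prefix pattern into a single compiled regex; .search
    # returns a match for the leading prefix, or None if the string does
    # not start with one.
    en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

    match = en_search_prefixes("(can't")
    print(match.group())  # "(" -- the opening bracket is a tokenizer prefix

This is exactly what the parametrized test in the next hunk asserts.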
@@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add):
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
     assert tokens[4].text == punct_close_add
+
+
+@pytest.mark.parametrize('text,punct', [("(can't", "(")])
+def test_tokenizer_splits_pre_punct_regex(text, punct):
+    match = en_search_prefixes(text)
+    assert match.group() == punct
+
+
+def test_tokenizer_splits_bracket_period(en_tokenizer):
+    text = "(And a 6a.m. run through Washington Park)."
+    tokens = en_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
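The one-off prefix-regex test from the old module (removed in the next hunk) is now expressed as a parametrized test, so further cases are data rather than new functions. A hypothetical extension of the case list, relying on the module-level en_search_prefixes shown above; the extra rows are illustrative only and assume that "[" and '"' also appear in TOKENIZER_PREFIXES:

    @pytest.mark.parametrize('text,punct', [
        ("(can't", "("),   # the case added by this commit
        ("[the", "["),     # assumed extra prefix case, not in the commit
        ('"Hello', '"'),   # assumed extra prefix case, not in the commit
    ])
    def test_tokenizer_splits_pre_punct_regex(text, punct):
        match = en_search_prefixes(text)
        assert match.group() == punct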
@@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 # loaded = pickle.load(file_)
 # assert loaded is not None
 
-
-def test_pre_punct_regex():
-    string = "(can't"
-    match = en_search_prefixes(string)
-    assert match.group() == "("
-
-
 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
     assert len(tokens) == 0
@@ -65,14 +58,6 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(en_tokenizer):
-    tokens = [w.text for w in en_tokenizer("(can't")]
-    assert tokens == ['(', 'ca', "n't"]
-    tokens = en_tokenizer("`ain't")
-    assert len(tokens) == 3
-    tokens = en_tokenizer('''"isn't''')
-    assert len(tokens) == 3
-    tokens = en_tokenizer("can't!")
-    assert len(tokens) == 3
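The deleted test_contraction_punct hard-coded several punctuation-plus-contraction strings in a single function. In the parametrized style used by the new punctuation tests, a merged equivalent might look like the sketch below; the function name and case list are hypothetical, since these hunks only show the deletion, not the replacement added elsewhere in the commit:

    # Assumes the standard en_tokenizer fixture from the test suite.
    @pytest.mark.parametrize('text,length', [("`ain't", 3),
                                             ('''"isn't''', 3),
                                             ("can't!", 3)])
    def test_tokenizer_splits_punct_before_contraction(en_tokenizer, text, length):
        tokens = en_tokenizer(text)
        assert len(tokens) == length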