Modernize and merge tokenizer tests for punctuation

Ines Montani 2017-01-05 13:14:16 +01:00
parent 8a74129cdf
commit 2c2e878653
2 changed files with 19 additions and 16 deletions

View File

@@ -6,13 +6,19 @@ from __future__ import unicode_literals
 import pytest
+from ... import util
+from ...language_data import TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]

 @pytest.mark.parametrize('text', ["(", "((", "<"])
-def test_tokenizer_only_punct(en_tokenizer, text):
+def test_tokenizer_handles_only_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == len(text)
@@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add):
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
     assert tokens[4].text == punct_close_add
+
+
+@pytest.mark.parametrize('text,punct', [("(can't", "(")])
+def test_tokenizer_splits_pre_punct_regex(text, punct):
+    match = en_search_prefixes(text)
+    assert match.group() == punct
+
+
+def test_tokenizer_splits_bracket_period(en_tokenizer):
+    text = "(And a 6a.m. run through Washington Park)."
+    tokens = en_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
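
For reference, en_search_prefixes is the bound .search method of the regex that util.compile_prefix_regex builds from the prefix list, so the new test_tokenizer_splits_pre_punct_regex asserts that this pattern matches the leading bracket of "(can't". A minimal standalone sketch of that behaviour, using plain re and a hypothetical subset of prefixes rather than spaCy's full TOKENIZER_PREFIXES (not the library's actual implementation):

import re

# Hypothetical subset of prefix punctuation; the real TOKENIZER_PREFIXES list is much longer.
PREFIXES = ['(', '[', '{', '"', "'", '`']

# OR together the escaped prefixes, each anchored at the start of the string,
# roughly what util.compile_prefix_regex(TOKENIZER_PREFIXES).search provides.
search_prefixes = re.compile('|'.join('^' + re.escape(p) for p in PREFIXES)).search

match = search_prefixes("(can't")
assert match is not None
assert match.group() == '('  # only the leading bracket matches, not the contraction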

View File

@@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
 # loaded = pickle.load(file_)
 # assert loaded is not None
-
-
-def test_pre_punct_regex():
-    string = "(can't"
-    match = en_search_prefixes(string)
-    assert match.group() == "("
-

 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
     assert len(tokens) == 0
@@ -65,14 +58,6 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth
-
-
-def test_contraction_punct(en_tokenizer):
-    tokens = [w.text for w in en_tokenizer("(can't")]
-    assert tokens == ['(', 'ca', "n't"]
-    tokens = en_tokenizer("`ain't")
-    assert len(tokens) == 3
-    tokens = en_tokenizer('''"isn't''')
-    assert len(tokens) == 3
-    tokens = en_tokenizer("can't!")
-    assert len(tokens) == 3
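
Per the commit title, these deleted checks are merged into the parametrized suite in the first file. A rough sketch of the same assertions in that parametrized style (the test names here are hypothetical and the actual merged tests are not shown in this diff):

import pytest

# Hypothetical parametrized versions of the removed checks; en_tokenizer is the
# same pytest fixture used throughout these tests, and the names are illustrative only.
@pytest.mark.parametrize('text,expected', [("(can't", ['(', 'ca', "n't"])])
def test_tokenizer_splits_pre_punct_contraction(en_tokenizer, text, expected):
    tokens = en_tokenizer(text)
    assert [t.text for t in tokens] == expected


@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_punct_contraction(en_tokenizer, text):
    # The original assertions only checked the token count.
    tokens = en_tokenizer(text)
    assert len(tokens) == 3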