diff --git a/spacy/tests/tokenizer/test_only_punct.py b/spacy/tests/tokenizer/test_only_punct.py
deleted file mode 100644
index 12c958088..000000000
--- a/spacy/tests/tokenizer/test_only_punct.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from __future__ import unicode_literals
-
-
-def test_only_pre1(en_tokenizer):
-    assert len(en_tokenizer("(")) == 1
-
-
-def test_only_pre2(en_tokenizer):
-    assert len(en_tokenizer("((")) == 2
diff --git a/spacy/tests/tokenizer/test_post_punct.py b/spacy/tests/tokenizer/test_post_punct.py
deleted file mode 100644
index ff1120c63..000000000
--- a/spacy/tests/tokenizer/test_post_punct.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-@pytest.fixture
-def close_puncts():
-    return [')', ']', '}', '*']
-
-
-def test_close(close_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in close_puncts:
-        string = word_str + p
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 2
-        assert tokens[1].string == p
-        assert tokens[0].string == word_str
-
-
-def test_two_different_close(close_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in close_puncts:
-        string = word_str + p + "'"
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 3
-        assert tokens[0].string == word_str
-        assert tokens[1].string == p
-        assert tokens[2].string == "'"
-
-
-def test_three_same_close(close_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in close_puncts:
-        string = word_str + p + p + p
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 4
-        assert tokens[0].string == word_str
-        assert tokens[1].string == p
-
-
-def test_double_end_quote(en_tokenizer):
-    assert len(en_tokenizer("Hello''")) == 2
-    assert len(en_tokenizer("''")) == 1
diff --git a/spacy/tests/tokenizer/test_pre_punct.py b/spacy/tests/tokenizer/test_pre_punct.py
deleted file mode 100644
index 9aec1dc7b..000000000
--- a/spacy/tests/tokenizer/test_pre_punct.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-import pytest
-
-
-@pytest.fixture
-def open_puncts():
-    return ['(', '[', '{', '*']
-
-
-def test_open(open_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in open_puncts:
-        string = p + word_str
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 2
-        assert tokens[0].orth_ == p
-        assert tokens[1].orth_ == word_str
-
-
-def test_two_different_open(open_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in open_puncts:
-        string = p + "`" + word_str
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 3
-        assert tokens[0].orth_ == p
-        assert tokens[1].orth_ == "`"
-        assert tokens[2].orth_ == word_str
-
-
-def test_three_same_open(open_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for p in open_puncts:
-        string = p + p + p + word_str
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 4
-        assert tokens[0].orth_ == p
-        assert tokens[3].orth_ == word_str
-
-
-def test_open_appostrophe(en_tokenizer):
-    string = "'The"
-    tokens = en_tokenizer(string)
-    assert len(tokens) == 2
-    assert tokens[0].orth_ == "'"
diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/tokenizer/test_punct.py
new file mode 100644
index 000000000..d238e593e
--- /dev/null
+++ b/spacy/tests/tokenizer/test_punct.py
@@ -0,0 +1,109 @@
+from __future__ import unicode_literals
+
+import pytest
+
+PUNCT_OPEN = ['(', '[', '{', '*']
+PUNCT_CLOSE = [')', ']', '}', '*']
+PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
+
+
+@pytest.mark.parametrize('text', ["(", "((", "<"])
+def test_tokenizer_only_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == len(text)
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
+    tokens = en_tokenizer(punct + text)
+    assert len(tokens) == 2
+    assert tokens[0].text == punct
+    assert tokens[1].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
+    tokens = en_tokenizer(text + punct)
+    assert len(tokens) == 2
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('punct_add', ["`"])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
+    tokens = en_tokenizer(punct + punct_add + text)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct
+    assert tokens[1].text == punct_add
+    assert tokens[2].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('punct_add', ["'"])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
+    tokens = en_tokenizer(text + punct + punct_add)
+    assert len(tokens) == 3
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+    assert tokens[2].text == punct_add
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
+    tokens = en_tokenizer(punct + punct + punct + text)
+    assert len(tokens) == 4
+    assert tokens[0].text == punct
+    assert tokens[3].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
+    tokens = en_tokenizer(text + punct + punct + punct)
+    assert len(tokens) == 4
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('text', ["'The"])
+def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == "'"
+
+
+@pytest.mark.parametrize('text', ["Hello''"])
+def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    tokens_punct = en_tokenizer("''")
+    assert len(tokens_punct) == 1
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
+    tokens = en_tokenizer(punct_open + text + punct_close)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct_open
+    assert tokens[1].text == text
+    assert tokens[2].text == punct_close
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
+    tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
+    assert len(tokens) == 5
+    assert tokens[0].text == punct_open_add
+    assert tokens[1].text == punct_open
+    assert tokens[2].text == text
+    assert tokens[3].text == punct_close
+    assert tokens[4].text == punct_close_add
diff --git a/spacy/tests/tokenizer/test_surround_punct.py b/spacy/tests/tokenizer/test_surround_punct.py
deleted file mode 100644
index 7c7a50904..000000000
--- a/spacy/tests/tokenizer/test_surround_punct.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-@pytest.fixture
-def paired_puncts():
-    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
-
-
-def test_token(paired_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for open_, close_ in paired_puncts:
-        string = open_ + word_str + close_
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 3
-        assert tokens[0].orth_ == open_
-        assert tokens[1].orth_ == word_str
-        assert tokens[2].orth_ == close_
-
-
-def test_two_different(paired_puncts, en_tokenizer):
-    word_str = 'Hello'
-    for open_, close_ in paired_puncts:
-        string = "`" + open_ + word_str + close_ + "'"
-        tokens = en_tokenizer(string)
-        assert len(tokens) == 5
-        assert tokens[0].orth_ == "`"
-        assert tokens[1].orth_ == open_
-        assert tokens[2].orth_ == word_str
-        assert tokens[2].orth_ == word_str
-        assert tokens[3].orth_ == close_
-        assert tokens[4].orth_ == "'"
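
The new test_punct.py leans on stacking several pytest.mark.parametrize decorators on one test function, which makes pytest generate a separate test case for every combination of the stacked parameter sets (replacing the old for-loops over the fixture lists). A minimal, self-contained sketch of that pattern, not part of the patch and with hypothetical names (test_cross_product), assuming only pytest is installed:

import pytest

# Two stacked parametrize decorators: pytest runs the test once per
# combination of the parameter sets, here 2 puncts x 1 text = 2 cases.
@pytest.mark.parametrize('punct', ['(', '['])
@pytest.mark.parametrize('text', ['Hello'])
def test_cross_product(punct, text):
    assert (punct + text).startswith(punct)

Because each combination becomes its own collected test with its own id, a failure for a single punctuation character is reported in isolation instead of aborting a loop over all of them.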