From bbe7cab3a145fb9c3e849fbbeede5957a16589d6 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 5 Jan 2017 18:09:29 +0100
Subject: [PATCH] Move non-English-specific tests back to general tokenizer tests

---
 spacy/tests/en/tokenizer/test_exceptions.py  | 36 +------------
 .../en/tokenizer/test_prefix_suffix_infix.py | 12 -----
 spacy/tests/en/tokenizer/test_whitespace.py  | 51 -------------------
 spacy/tests/{en => }/tokenizer/sun.txt       |  0
 spacy/tests/tokenizer/test_exceptions.py     | 41 +++++++++++++++
 spacy/tests/tokenizer/test_tokenizer.py      | 45 ++++++++--------
 spacy/tests/tokenizer/test_whitespace.py     | 51 +++++++++++++++++++
 7 files changed, 117 insertions(+), 119 deletions(-)
 delete mode 100644 spacy/tests/en/tokenizer/test_whitespace.py
 rename spacy/tests/{en => }/tokenizer/sun.txt (100%)
 create mode 100644 spacy/tests/tokenizer/test_exceptions.py
 create mode 100644 spacy/tests/tokenizer/test_whitespace.py

diff --git a/spacy/tests/en/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py
index c194dce21..ac7ed452f 100644
--- a/spacy/tests/en/tokenizer/test_exceptions.py
+++ b/spacy/tests/en/tokenizer/test_exceptions.py
@@ -1,5 +1,5 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handles correctly."""
+"""Test that tokenizer exceptions are handled correctly."""
 
 
 from __future__ import unicode_literals
@@ -18,37 +18,3 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
     tokens = en_tokenizer(text)
     assert len(tokens) == 6
     assert tokens[3].text == "i.e."
-
-
-def test_tokenizer_handles_emoticons(en_tokenizer):
-    # Tweebo challenge (CMU)
-    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = en_tokenizer(text)
-    assert tokens[0].text == ":o"
-    assert tokens[1].text == ":/"
-    assert tokens[2].text == ":'("
-    assert tokens[3].text == ">:o"
-    assert tokens[4].text == "(:"
-    assert tokens[5].text == ":)"
-    assert tokens[6].text == ">.<"
-    assert tokens[7].text == "XD"
-    assert tokens[8].text == "-__-"
-    assert tokens[9].text == "o.O"
-    assert tokens[10].text == ";D"
-    assert tokens[11].text == ":-)"
-    assert tokens[12].text == "@_@"
-    assert tokens[13].text == ":P"
-    assert tokens[14].text == "8D"
-    assert tokens[15].text == ":1"
-    assert tokens[16].text == ">:("
-    assert tokens[17].text == ":D"
-    assert tokens[18].text == "=|"
-    assert tokens[19].text == '")'
-    assert tokens[20].text == ':>'
-    assert tokens[21].text == '....'
-
-
-@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
-def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == length
diff --git a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
index d6963ada1..042934d4e 100644
--- a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
+++ b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
@@ -100,18 +100,6 @@ def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
     assert len(tokens) == 3
 
 
-@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
-def test_tokenizer_keep_urls(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 1
-
-
-@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
-def test_tokenizer_keeps_email(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 1
-
-
 def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
     tokens = en_tokenizer("No decent--let alone well-bred--people.")
     assert tokens[0].text == "No"
diff --git a/spacy/tests/en/tokenizer/test_whitespace.py b/spacy/tests/en/tokenizer/test_whitespace.py
deleted file mode 100644
index 90dc80615..000000000
--- a/spacy/tests/en/tokenizer/test_whitespace.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# coding: utf-8
-"""Test that tokens are created correctly for whitespace."""
-
-
-from __future__ import unicode_literals
-
-import pytest
-
-
-@pytest.mark.parametrize('text', ["hello possums"])
-def test_tokenizer_splits_single_space(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 2
-
-
-@pytest.mark.parametrize('text', ["hello  possums"])
-def test_tokenizer_splits_double_space(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
-    assert tokens[1].text == " "
-
-
-@pytest.mark.parametrize('text', ["two spaces after this  "])
-def test_tokenizer_handles_double_trainling_ws(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert repr(tokens.text_with_ws) == repr(text)
-
-
-@pytest.mark.parametrize('text', ["hello\npossums"])
-def test_tokenizer_splits_newline(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
-    assert tokens[1].text == "\n"
-
-
-@pytest.mark.parametrize('text', ["hello \npossums"])
-def test_tokenizer_splits_newline_space(en_tokenizer, text):
-    tokens = en_tokenizer('hello \npossums')
-    assert len(tokens) == 3
-
-
-@pytest.mark.parametrize('text', ["hello  \npossums"])
-def test_tokenizer_splits_newline_double_space(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
-
-
-@pytest.mark.parametrize('text', ["hello \n possums"])
-def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
diff --git a/spacy/tests/en/tokenizer/sun.txt b/spacy/tests/tokenizer/sun.txt
similarity index 100%
rename from spacy/tests/en/tokenizer/sun.txt
rename to spacy/tests/tokenizer/sun.txt
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..aab27714e
--- /dev/null
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_emoticons(tokenizer):
+    # Tweebo challenge (CMU)
+    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
+    tokens = tokenizer(text)
+    assert tokens[0].text == ":o"
+    assert tokens[1].text == ":/"
+    assert tokens[2].text == ":'("
+    assert tokens[3].text == ">:o"
+    assert tokens[4].text == "(:"
+    assert tokens[5].text == ":)"
+    assert tokens[6].text == ">.<"
+    assert tokens[7].text == "XD"
+    assert tokens[8].text == "-__-"
+    assert tokens[9].text == "o.O"
+    assert tokens[10].text == ";D"
+    assert tokens[11].text == ":-)"
+    assert tokens[12].text == "@_@"
+    assert tokens[13].text == ":P"
+    assert tokens[14].text == "8D"
+    assert tokens[15].text == ":1"
+    assert tokens[16].text == ">:("
+    assert tokens[17].text == ":D"
+    assert tokens[18].text == "=|"
+    assert tokens[19].text == '")'
+    assert tokens[20].text == ':>'
+    assert tokens[21].text == '....'
+
+
+@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
+def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
+    tokens = tokenizer(text)
+    assert len(tokens) == length
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 49bfdcb26..cd0043a10 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+from os import path
 import pytest
 
+from ...util import utf8open
+
 
 def test_tokenizer_handles_no_word(tokenizer):
     tokens = tokenizer("")
@@ -15,27 +18,6 @@ def test_tokenizer_handles_single_word(tokenizer, text):
     assert tokens[0].text == text
 
 
-@pytest.mark.parametrize('text', ["lorem ipsum"])
-def test_tokenizer_handles_two_words(tokenizer, text):
-    tokens = tokenizer(text)
-    assert len(tokens) == 2
-    assert tokens[0].text != tokens[1].text
-
-
-@pytest.mark.parametrize('text', ["lorem  ipsum"])
-def test_tokenizer_splits_double_space(tokenizer, text):
-    tokens = tokenizer(text)
-    assert len(tokens) == 3
-    assert tokens[1].text == " "
-
-
-@pytest.mark.parametrize('text', ["lorem\nipsum"])
-def test_tokenizer_splits_newline(tokenizer, text):
-    tokens = tokenizer(text)
-    assert len(tokens) == 3
-    assert tokens[1].text == "\n"
-
-
 def test_tokenizer_handles_punct(tokenizer):
     text = "Lorem, ipsum."
     tokens = tokenizer(text)
@@ -57,6 +39,18 @@ def test_tokenizer_handles_digits(tokenizer):
     assert tokens[3].text == "1984"
 
 
+@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
+def test_tokenizer_keep_urls(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
+def test_tokenizer_keeps_email(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 1
+
+
 def test_tokenizer_handles_long_text(tokenizer):
     text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
 
@@ -71,6 +65,15 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
     assert len(tokens) > 5
 
 
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(tokenizer, file_name):
+    loc = path.join(path.dirname(__file__), file_name)
+    text = utf8open(loc).read()
+    assert len(text) != 0
+    tokens = tokenizer(text)
+    assert len(tokens) > 100
+
+
 def test_tokenizer_suspected_freeing_strings(tokenizer):
     text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
     text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py
new file mode 100644
index 000000000..7ff3106a8
--- /dev/null
+++ b/spacy/tests/tokenizer/test_whitespace.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+"""Test that tokens are created correctly for whitespace."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["lorem ipsum"])
+def test_tokenizer_splits_single_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["lorem  ipsum"])
+def test_tokenizer_splits_double_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == " "
+
+
+@pytest.mark.parametrize('text', ["lorem ipsum  "])
+def test_tokenizer_handles_double_trailing_ws(tokenizer, text):
+    tokens = tokenizer(text)
+    assert repr(tokens.text_with_ws) == repr(text)
+
+
+@pytest.mark.parametrize('text', ["lorem\nipsum"])
+def test_tokenizer_splits_newline(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == "\n"
+
+
+@pytest.mark.parametrize('text', ["lorem \nipsum"])
+def test_tokenizer_splits_newline_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["lorem  \nipsum"])
+def test_tokenizer_splits_newline_double_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["lorem \n ipsum"])
+def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
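
The relocated tests request a language-independent `tokenizer` fixture rather than the English-specific `en_tokenizer`. That fixture is provided by the shared tests conftest.py, which this patch does not touch. As a rough illustration only, such a fixture could be wired up as in the sketch below; the `English.Defaults.create_tokenizer()` call, the import path and the fixture scope are assumptions, not part of this diff.

    # Hypothetical sketch of the shared fixture the moved tests rely on.
    # Not part of this patch -- the real conftest.py may construct it differently.
    import pytest

    from ..en import English  # assumption: English defaults stand in for the generic tokenizer


    @pytest.fixture(scope="module")
    def tokenizer():
        # Build a bare tokenizer without loading any models or pipeline components.
        return English.Defaults.create_tokenizer()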