Move non-English-specific tests back to general tokenizer tests

Ines Montani 2017-01-05 18:09:29 +01:00
parent 038002d616
commit bbe7cab3a1
7 changed files with 117 additions and 119 deletions
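The moved tests swap the English-specific en_tokenizer fixture for the shared, language-agnostic tokenizer fixture. As a minimal sketch of that distinction, the fixtures might be declared in the suite's conftest.py roughly as below; the actual conftest is not part of this diff, so the fixture bodies and the English.Defaults.create_tokenizer() call are assumptions for illustration only.

# conftest.py -- illustrative sketch, not the actual spaCy test conftest
import pytest

from spacy.en import English


@pytest.fixture
def tokenizer():
    # Shared, language-agnostic fixture used by the general tokenizer tests.
    # Any language's tokenizer could back it; English is used here for brevity.
    return English.Defaults.create_tokenizer()


@pytest.fixture
def en_tokenizer():
    # English-specific fixture, kept for tests that depend on English
    # tokenizer exceptions such as "i.e.".
    return English.Defaults.create_tokenizer()

With that split, tests that only exercise generic whitespace, punctuation and emoticon handling can request tokenizer, while tests tied to English exceptions keep en_tokenizer.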

View File

@@ -1,5 +1,5 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handles correctly."""
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
@@ -18,37 +18,3 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
tokens = en_tokenizer(text)
assert len(tokens) == 6
assert tokens[3].text == "i.e."
def test_tokenizer_handles_emoticons(en_tokenizer):
# Tweebo challenge (CMU)
text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
tokens = en_tokenizer(text)
assert tokens[0].text == ":o"
assert tokens[1].text == ":/"
assert tokens[2].text == ":'("
assert tokens[3].text == ">:o"
assert tokens[4].text == "(:"
assert tokens[5].text == ":)"
assert tokens[6].text == ">.<"
assert tokens[7].text == "XD"
assert tokens[8].text == "-__-"
assert tokens[9].text == "o.O"
assert tokens[10].text == ";D"
assert tokens[11].text == ":-)"
assert tokens[12].text == "@_@"
assert tokens[13].text == ":P"
assert tokens[14].text == "8D"
assert tokens[15].text == ":1"
assert tokens[16].text == ">:("
assert tokens[17].text == ":D"
assert tokens[18].text == "=|"
assert tokens[19].text == '")'
assert tokens[20].text == ':>'
assert tokens[21].text == '....'
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length

View File

@@ -100,18 +100,6 @@ def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
def test_tokenizer_keep_urls(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
def test_tokenizer_keeps_email(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
tokens = en_tokenizer("No decent--let alone well-bred--people.")
assert tokens[0].text == "No"

View File

@@ -1,51 +0,0 @@
# coding: utf-8
"""Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["hello possums"])
def test_tokenizer_splits_single_space(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["hello possums"])
def test_tokenizer_splits_double_space(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == " "
@pytest.mark.parametrize('text', ["two spaces after this "])
def test_tokenizer_handles_double_trailing_ws(en_tokenizer, text):
tokens = en_tokenizer(text)
assert repr(tokens.text_with_ws) == repr(text)
@pytest.mark.parametrize('text', ["hello\npossums"])
def test_tokenizer_splits_newline(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "\n"
@pytest.mark.parametrize('text', ["hello \npossums"])
def test_tokenizer_splits_newline_space(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["hello \npossums"])
def test_tokenizer_splits_newline_double_space(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["hello \n possums"])
def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3

View File

@@ -0,0 +1,41 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_emoticons(tokenizer):
# Tweebo challenge (CMU)
text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
tokens = tokenizer(text)
assert tokens[0].text == ":o"
assert tokens[1].text == ":/"
assert tokens[2].text == ":'("
assert tokens[3].text == ">:o"
assert tokens[4].text == "(:"
assert tokens[5].text == ":)"
assert tokens[6].text == ">.<"
assert tokens[7].text == "XD"
assert tokens[8].text == "-__-"
assert tokens[9].text == "o.O"
assert tokens[10].text == ";D"
assert tokens[11].text == ":-)"
assert tokens[12].text == "@_@"
assert tokens[13].text == ":P"
assert tokens[14].text == "8D"
assert tokens[15].text == ":1"
assert tokens[16].text == ">:("
assert tokens[17].text == ":D"
assert tokens[18].text == "=|"
assert tokens[19].text == '")'
assert tokens[20].text == ':>'
assert tokens[21].text == '....'
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length

View File

@@ -1,8 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from os import path
import pytest
from ...util import utf8open
def test_tokenizer_handles_no_word(tokenizer):
tokens = tokenizer("")
@@ -15,27 +18,6 @@ def test_tokenizer_handles_single_word(tokenizer, text):
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["lorem ipsum"])
def test_tokenizer_handles_two_words(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text != tokens[1].text
@pytest.mark.parametrize('text', ["lorem ipsum"])
def test_tokenizer_splits_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == " "
@pytest.mark.parametrize('text', ["lorem\nipsum"])
def test_tokenizer_splits_newline(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "\n"
def test_tokenizer_handles_punct(tokenizer):
text = "Lorem, ipsum."
tokens = tokenizer(text)
@@ -57,6 +39,18 @@ def test_tokenizer_handles_digits(tokenizer):
assert tokens[3].text == "1984"
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
def test_tokenizer_keep_urls(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
def test_tokenizer_keeps_email(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_long_text(tokenizer):
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
@@ -71,6 +65,15 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
assert len(tokens) > 5
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
loc = path.join(path.dirname(__file__), file_name)
text = utf8open(loc).read()
assert len(text) != 0
tokens = tokenizer(text)
assert len(tokens) > 100
def test_tokenizer_suspected_freeing_strings(tokenizer):
text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

View File

@@ -0,0 +1,51 @@
# coding: utf-8
"""Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["lorem ipsum"])
def test_tokenizer_splits_single_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["lorem ipsum"])
def test_tokenizer_splits_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == " "
@pytest.mark.parametrize('text', ["lorem ipsum "])
def test_tokenizer_handles_double_trailing_ws(tokenizer, text):
tokens = tokenizer(text)
assert repr(tokens.text_with_ws) == repr(text)
@pytest.mark.parametrize('text', ["lorem\nipsum"])
def test_tokenizer_splits_newline(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "\n"
@pytest.mark.parametrize('text', ["lorem \nipsum"])
def test_tokenizer_splits_newline_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["lorem \nipsum"])
def test_tokenizer_splits_newline_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["lorem \n ipsum"])
def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3