mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Move non-English-specific tests back to general tokenizer tests
This commit is contained in:
parent
038002d616
commit
bbe7cab3a1
|
@ -1,5 +1,5 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handles correctly."""
|
||||
"""Test that tokenizer exceptions are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
@ -18,37 +18,3 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
|
|||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[3].text == "i.e."
|
||||
|
||||
|
||||
def test_tokenizer_handles_emoticons(en_tokenizer):
|
||||
# Tweebo challenge (CMU)
|
||||
text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].text == ":o"
|
||||
assert tokens[1].text == ":/"
|
||||
assert tokens[2].text == ":'("
|
||||
assert tokens[3].text == ">:o"
|
||||
assert tokens[4].text == "(:"
|
||||
assert tokens[5].text == ":)"
|
||||
assert tokens[6].text == ">.<"
|
||||
assert tokens[7].text == "XD"
|
||||
assert tokens[8].text == "-__-"
|
||||
assert tokens[9].text == "o.O"
|
||||
assert tokens[10].text == ";D"
|
||||
assert tokens[11].text == ":-)"
|
||||
assert tokens[12].text == "@_@"
|
||||
assert tokens[13].text == ":P"
|
||||
assert tokens[14].text == "8D"
|
||||
assert tokens[15].text == ":1"
|
||||
assert tokens[16].text == ">:("
|
||||
assert tokens[17].text == ":D"
|
||||
assert tokens[18].text == "=|"
|
||||
assert tokens[19].text == '")'
|
||||
assert tokens[20].text == ':>'
|
||||
assert tokens[21].text == '....'
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
|
||||
def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
|
@ -100,18 +100,6 @@ def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
|
|||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
|
||||
def test_tokenizer_keep_urls(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
||||
def test_tokenizer_keeps_email(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
|
||||
tokens = en_tokenizer("No decent--let alone well-bred--people.")
|
||||
assert tokens[0].text == "No"
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for whitespace."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello possums"])
|
||||
def test_tokenizer_splits_single_space(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello possums"])
|
||||
def test_tokenizer_splits_double_space(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == " "
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["two spaces after this "])
|
||||
def test_tokenizer_handles_double_trainling_ws(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert repr(tokens.text_with_ws) == repr(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello\npossums"])
|
||||
def test_tokenizer_splits_newline(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "\n"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello \npossums"])
|
||||
def test_tokenizer_splits_newline_space(en_tokenizer, text):
|
||||
tokens = en_tokenizer('hello \npossums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello \npossums"])
|
||||
def test_tokenizer_splits_newline_double_space(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello \n possums"])
|
||||
def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
41
spacy/tests/tokenizer/test_exceptions.py
Normal file
41
spacy/tests/tokenizer/test_exceptions.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_emoticons(tokenizer):
|
||||
# Tweebo challenge (CMU)
|
||||
text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
|
||||
tokens = tokenizer(text)
|
||||
assert tokens[0].text == ":o"
|
||||
assert tokens[1].text == ":/"
|
||||
assert tokens[2].text == ":'("
|
||||
assert tokens[3].text == ">:o"
|
||||
assert tokens[4].text == "(:"
|
||||
assert tokens[5].text == ":)"
|
||||
assert tokens[6].text == ">.<"
|
||||
assert tokens[7].text == "XD"
|
||||
assert tokens[8].text == "-__-"
|
||||
assert tokens[9].text == "o.O"
|
||||
assert tokens[10].text == ";D"
|
||||
assert tokens[11].text == ":-)"
|
||||
assert tokens[12].text == "@_@"
|
||||
assert tokens[13].text == ":P"
|
||||
assert tokens[14].text == "8D"
|
||||
assert tokens[15].text == ":1"
|
||||
assert tokens[16].text == ">:("
|
||||
assert tokens[17].text == ":D"
|
||||
assert tokens[18].text == "=|"
|
||||
assert tokens[19].text == '")'
|
||||
assert tokens[20].text == ':>'
|
||||
assert tokens[21].text == '....'
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
|
||||
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == length
|
|
@ -1,8 +1,11 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
from os import path
|
||||
|
||||
import pytest
|
||||
|
||||
from ...util import utf8open
|
||||
|
||||
|
||||
def test_tokenizer_handles_no_word(tokenizer):
|
||||
tokens = tokenizer("")
|
||||
|
@ -15,27 +18,6 @@ def test_tokenizer_handles_single_word(tokenizer, text):
|
|||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum"])
|
||||
def test_tokenizer_handles_two_words(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text != tokens[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum"])
|
||||
def test_tokenizer_splits_double_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == " "
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem\nipsum"])
|
||||
def test_tokenizer_splits_newline(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "\n"
|
||||
|
||||
|
||||
def test_tokenizer_handles_punct(tokenizer):
|
||||
text = "Lorem, ipsum."
|
||||
tokens = tokenizer(text)
|
||||
|
@ -57,6 +39,18 @@ def test_tokenizer_handles_digits(tokenizer):
|
|||
assert tokens[3].text == "1984"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
|
||||
def test_tokenizer_keep_urls(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
||||
def test_tokenizer_keeps_email(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_long_text(tokenizer):
|
||||
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
|
||||
|
||||
|
@ -71,6 +65,15 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
|
|||
assert len(tokens) > 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize('file_name', ["sun.txt"])
|
||||
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
|
||||
loc = path.join(path.dirname(__file__), file_name)
|
||||
text = utf8open(loc).read()
|
||||
assert len(text) != 0
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) > 100
|
||||
|
||||
|
||||
def test_tokenizer_suspected_freeing_strings(tokenizer):
|
||||
text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
|
||||
text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
|
||||
|
|
51
spacy/tests/tokenizer/test_whitespace.py
Normal file
51
spacy/tests/tokenizer/test_whitespace.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for whitespace."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum"])
|
||||
def test_tokenizer_splits_single_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum"])
|
||||
def test_tokenizer_splits_double_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == " "
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum "])
|
||||
def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert repr(tokens.text_with_ws) == repr(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem\nipsum"])
|
||||
def test_tokenizer_splits_newline(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "\n"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem \nipsum"])
|
||||
def test_tokenizer_splits_newline_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem \nipsum"])
|
||||
def test_tokenizer_splits_newline_double_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem \n ipsum"])
|
||||
def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
Loading…
Reference in New Issue
Block a user