diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py
deleted file mode 100644
index 3f5c4bc04..000000000
--- a/spacy/tests/tokenizer/test_emoticons.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from __future__ import unicode_literals
-
-import pytest
-
-
-def test_tokenizer_handles_emoticons(en_tokenizer):
-    # Tweebo challenge (CMU)
-    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = en_tokenizer(text)
-    assert tokens[0].orth_ == ":o"
-    assert tokens[1].orth_ == ":/"
-    assert tokens[2].orth_ == ":'("
-    assert tokens[3].orth_ == ">:o"
-    assert tokens[4].orth_ == "(:"
-    assert tokens[5].orth_ == ":)"
-    assert tokens[6].orth_ == ">.<"
-    assert tokens[7].orth_ == "XD"
-    assert tokens[8].orth_ == "-__-"
-    assert tokens[9].orth_ == "o.O"
-    assert tokens[10].orth_ == ";D"
-    assert tokens[11].orth_ == ":-)"
-    assert tokens[12].orth_ == "@_@"
-    assert tokens[13].orth_ == ":P"
-    assert tokens[14].orth_ == "8D"
-    assert tokens[15].orth_ == ":1"
-    assert tokens[16].orth_ == ">:("
-    assert tokens[17].orth_ == ":D"
-    assert tokens[18].orth_ == "=|"
-    assert tokens[19].orth_ == '")'
-    assert tokens[20].orth_ == ':>'
-    assert tokens[21].orth_ == '....'
-
-
-@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
-def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == length
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..c194dce21
--- /dev/null
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
+def test_tokenizer_handles_abbr(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(en_tokenizer):
+    text = "It's mediocre i.e. bad."
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 6
+    assert tokens[3].text == "i.e."
+
+
+def test_tokenizer_handles_emoticons(en_tokenizer):
+    # Tweebo challenge (CMU)
+    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
+    tokens = en_tokenizer(text)
+    assert tokens[0].text == ":o"
+    assert tokens[1].text == ":/"
+    assert tokens[2].text == ":'("
+    assert tokens[3].text == ">:o"
+    assert tokens[4].text == "(:"
+    assert tokens[5].text == ":)"
+    assert tokens[6].text == ">.<"
+    assert tokens[7].text == "XD"
+    assert tokens[8].text == "-__-"
+    assert tokens[9].text == "o.O"
+    assert tokens[10].text == ";D"
+    assert tokens[11].text == ":-)"
+    assert tokens[12].text == "@_@"
+    assert tokens[13].text == ":P"
+    assert tokens[14].text == "8D"
+    assert tokens[15].text == ":1"
+    assert tokens[16].text == ">:("
+    assert tokens[17].text == ":D"
+    assert tokens[18].text == "=|"
+    assert tokens[19].text == '")'
+    assert tokens[20].text == ':>'
+    assert tokens[21].text == '....'
+
+
+@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
+def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == length
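Note: both test modules depend on a shared `en_tokenizer` pytest fixture rather than constructing a tokenizer inline. That fixture is not part of this diff; a minimal sketch of how it could be defined in `spacy/tests/conftest.py` follows (the import path and construction are assumptions based on the spaCy 1.x-era code style visible above, and the repository's actual fixture may differ):

    # conftest.py (sketch, not part of this diff)
    import pytest

    from spacy.en import English  # spaCy 1.x-era import path; assumption


    @pytest.fixture
    def en_tokenizer():
        # Build a bare tokenizer from the English language defaults, so the
        # tests exercise tokenization without loading statistical models.
        return English.Defaults.create_tokenizer()

Defining the fixture once in conftest.py lets every tokenizer test reuse the same cheap tokenizer instance instead of each module setting one up itself.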