# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""


from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1


def test_tokenizer_handles_exc_in_text(en_tokenizer):
    text = "It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].text == "i.e."


def test_tokenizer_handles_emoticons(en_tokenizer):
    # Tweebo challenge (CMU)
    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    tokens = en_tokenizer(text)
    assert tokens[0].text == ":o"
    assert tokens[1].text == ":/"
    assert tokens[2].text == ":'("
    assert tokens[3].text == ">:o"
    assert tokens[4].text == "(:"
    assert tokens[5].text == ":)"
    assert tokens[6].text == ">.<"
    assert tokens[7].text == "XD"
    assert tokens[8].text == "-__-"
    assert tokens[9].text == "o.O"
    assert tokens[10].text == ";D"
    assert tokens[11].text == ":-)"
    assert tokens[12].text == "@_@"
    assert tokens[13].text == ":P"
    assert tokens[14].text == "8D"
    assert tokens[15].text == ":1"
    assert tokens[16].text == ">:("
    assert tokens[17].text == ":D"
    assert tokens[18].text == "=|"
    assert tokens[19].text == '")'
    assert tokens[20].text == ':>'
    assert tokens[21].text == '....'


@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
    tokens = en_tokenizer(text)
    assert len(tokens) == length
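

# NOTE: the `en_tokenizer` fixture used above is not defined in this module; in
# spaCy's test suite it is provided by a shared conftest.py. The sketch below is
# a minimal, hypothetical equivalent (assuming spaCy v2+ is installed), shown
# only to make the file easier to read in isolation; the real fixture may be
# constructed differently.
#
#     from spacy.lang.en import English
#
#     @pytest.fixture
#     def en_tokenizer():
#         # A blank English pipeline exposes the language's rule-based tokenizer,
#         # including its tokenizer exceptions (e.g. "i.e.") and emoticon rules,
#         # without requiring a trained model.
#         return English().tokenizer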