From ee6b49b293279d14466744debc3392081f6da4ec Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 4 Jan 2017 00:47:59 +0100
Subject: [PATCH] Modernize tokenizer tests for emoticons

---
 spacy/tests/tokenizer/test_emoticons.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py
index e0022dbbd..3f5c4bc04 100644
--- a/spacy/tests/tokenizer/test_emoticons.py
+++ b/spacy/tests/tokenizer/test_emoticons.py
@@ -1,8 +1,10 @@
 from __future__ import unicode_literals
+
 import pytest
 
 
-def test_tweebo_challenge(en_tokenizer):
+def test_tokenizer_handles_emoticons(en_tokenizer):
+    # Tweebo challenge (CMU)
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
     tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
@@ -29,7 +31,7 @@ def test_tweebo_challenge(en_tokenizer):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(en_tokenizer):
-    text = "example:)"
+@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
+def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
-    assert len(tokens) == 3
+    assert len(tokens) == length