mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
Modernize tokenizer tests for emoticons
This commit is contained in:
parent
f09b5a5dfd
commit
ee6b49b293
|
@ -1,8 +1,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_tweebo_challenge(en_tokenizer):
|
def test_tokenizer_handles_emoticons(en_tokenizer):
|
||||||
|
# Tweebo challenge (CMU)
|
||||||
text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
|
text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert tokens[0].orth_ == ":o"
|
assert tokens[0].orth_ == ":o"
|
||||||
|
@ -29,7 +31,7 @@ def test_tweebo_challenge(en_tokenizer):
|
||||||
assert tokens[21].orth_ == '....'
|
assert tokens[21].orth_ == '....'
|
||||||
|
|
||||||
|
|
||||||
def test_false_positive(en_tokenizer):
|
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
|
||||||
text = "example:)"
|
def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == length
|
||||||
|
|
Loading…
Reference in New Issue
Block a user