2016-09-24 16:42:01 +03:00
|
|
|
from spacy.deprecated import detokenize
|
2014-10-18 11:02:05 +04:00
|
|
|
|
|
|
|
def test_punct():
    """Check that ',' and '.' are grouped with the token before them.

    The expected result is a list of index tuples over the whitespace-split
    sentence: the comma (index 2) shares a tuple with index 1 and the final
    period (index 6) shares a tuple with index 5; every other token stands
    alone.
    """
    # NOTE(review): '<SEP>' presumably marks the join point inside a rule --
    # confirm against the detokenize implementation.
    rules = ('<SEP>,', '<SEP>.')
    words = 'Pierre Vinken , 61 years old .'.split()
    expected = [(0,), (1, 2), (3,), (4,), (5, 6)]
    assert detokenize(rules, words) == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_contractions():
    """Check that the split contraction "ca" + "n't" is grouped together.

    With the single rule "ca<SEP>n't", the tokens at indices 1 and 2 are
    expected to share one tuple while "I" and "even" remain singletons.
    """
    rules = ("ca<SEP>n't",)
    words = "I ca n't even".split()
    expected = [(0,), (1, 2), (3,)]
    assert detokenize(rules, words) == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_contractions_punct():
    """Check that a contraction rule and a punctuation rule chain together.

    Both rules apply to adjacent tokens here, so "ca", "n't" and "!" are
    expected to end up in a single three-index tuple after the lone "I".
    """
    rules = ("ca<SEP>n't", '<SEP>!')
    words = "I ca n't !".split()
    expected = [(0,), (1, 2, 3)]
    assert detokenize(rules, words) == expected
|