# coding: utf8
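# Tokenizer tests for Bengali (bn). The `bn_tokenizer` argument used below is a
# pytest fixture supplied by the test suite's shared conftest.py. A minimal
# sketch of such a fixture (an assumption for illustration, not the actual
# conftest definition) could look like:
#
#     from spacy.util import get_lang_class
#
#     @pytest.fixture
#     def bn_tokenizer():
#         return get_lang_class('bn').Defaults.create_tokenizer()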
from __future__ import unicode_literals

import pytest

TESTCASES = []
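
# Punctuation: the Bengali danda (āĨ¤), question and exclamation marks, quotes,
# hyphens and arithmetic symbols should each be split off as a separate token.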
PUNCTUATION_TESTS = [
    (u'āĻāĻŽāĻŋ āĻŦāĻžāĻāϞāĻžāϝāĻŧ āĻāĻžāύ āĻāĻžāĻ!', [u'āĻāĻŽāĻŋ', u'āĻŦāĻžāĻāϞāĻžāϝāĻŧ', u'āĻāĻžāύ', u'āĻāĻžāĻ', u'!']),
    (u'āĻāĻŽāĻŋ āĻŦāĻžāĻāϞāĻžāϝāĻŧ āĻāĻĨāĻž āĻāĻāĨ¤', [u'āĻāĻŽāĻŋ', u'āĻŦāĻžāĻāϞāĻžāϝāĻŧ', u'āĻāĻĨāĻž', u'āĻāĻ', u'āĨ¤']),
    (u'āĻŦāϏā§āύā§āϧāϰāĻž āĻāύāϏāĻŽā§āĻŽā§āĻā§ āĻĻā§āώ āϏā§āĻŦā§āĻāĻžāϰ āĻāϰāϞ⧠āύāĻž?', [u'āĻŦāϏā§āύā§āϧāϰāĻž', u'āĻāύāϏāĻŽā§āĻŽā§āĻā§', u'āĻĻā§āώ', u'āϏā§āĻŦā§āĻāĻžāϰ', u'āĻāϰāϞā§', u'āύāĻž', u'?']),
    (u'āĻāĻžāĻāĻž āĻĨāĻžāĻāϞ⧠āĻāĻŋ āύāĻž āĻšāϝāĻŧ!', [u'āĻāĻžāĻāĻž', u'āĻĨāĻžāĻāϞā§', u'āĻāĻŋ', u'āύāĻž', u'āĻšāϝāĻŧ', u'!']),
    (u'āϏāϰāĻāĻžāϰāĻŋ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞā§-āĻāϰ āĻāĻžāϤā§āϰ āύāĻ āĻŦāϞā§āĻ āĻāĻŋ āĻāĻŽāύ āĻāĻāϰāĻŖ?',
     [u'āϏāϰāĻāĻžāϰāĻŋ', u'āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞā§', u'-', u'āĻāϰ', u'āĻāĻžāϤā§āϰ', u'āύāĻ', u'āĻŦāϞā§āĻ', u'āĻāĻŋ', u'āĻāĻŽāύ', u'āĻāĻāϰāĻŖ', u'?']),
    (u'āϤāĻžāϰāĻž āĻŦāϞā§, "āĻāϰāĻž āĻāĻžāĻŽāĻžāϰā§āϰ āĻŽā§āϰāĻāĻŋāĨ¤"', [u'āϤāĻžāϰāĻž', u'āĻŦāϞā§', ',', '"', u'āĻāϰāĻž', u'āĻāĻžāĻŽāĻžāϰā§āϰ', u'āĻŽā§āϰāĻāĻŋ', u'āĨ¤', '"']),
    (u'ā§Š*ā§Š=ā§Ŧ?', [u'ā§Š', u'*', u'ā§Š', '=', u'ā§Ŧ', '?']),
    (u'āĻāĻžāĻāĻ āĻžāϞ-āĻāϰ āĻāύā§āϧāĻ āĻ
āύā§āϝāϰāĻāĻŽ', [u'āĻāĻžāĻāĻ āĻžāϞ', '-', u'āĻāϰ', u'āĻāύā§āϧāĻ', u'āĻ
āύā§āϝāϰāĻāĻŽ']),
]
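
# Abbreviations: āĻĄāĻ ("Dr") should stay one token, and the degree abbreviation
# āϏā§. should keep its trailing period as part of the token.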
ABBREVIATIONS = [
    (u'āĻĄāĻ āĻāĻžāϞā§āĻĻ āĻŦāϞāϞā§āύ āĻĸāĻžāĻāĻžāϝāĻŧ ā§Šā§Ģ āĻĄāĻŋāĻā§āϰāĻŋ āϏā§.āĨ¤', [u'āĻĄāĻ', u'āĻāĻžāϞā§āĻĻ', u'āĻŦāϞāϞā§āύ', u'āĻĸāĻžāĻāĻžāϝāĻŧ', u'ā§Šā§Ģ', u'āĻĄāĻŋāĻā§āϰāĻŋ', u'āϏā§.', u'āĨ¤'])
]

TESTCASES.extend(PUNCTUATION_TESTS)
TESTCASES.extend(ABBREVIATIONS)
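

# Each (text, expected_tokens) pair collected above is run through the
# bn_tokenizer fixture; whitespace tokens are dropped before comparing
# against the expected token list.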
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    tokens = bn_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
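

# Smoke test on a longer paragraph: the tokenizer should handle the full text
# and produce a stable token count (unlike the parametrized test above, no
# tokens are filtered out here).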
def test_tokenizer_handles_long_text(bn_tokenizer):
    text = u"""āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞāϝāĻŧā§ āϏāĻžāϰāĻžāĻŦāĻāϰ āĻā§āύ āύāĻž āĻā§āύ āĻŦāĻŋāώāϝāĻŧā§ āĻāĻŦā§āώāĻŖāĻž āĻāϞāϤā§āĻ āĻĨāĻžāĻā§āĨ¤ \
āĻ
āĻāĻŋāĻā§āĻ āĻĢā§āϝāĻžāĻāĻžāϞā§āĻāĻŋ āĻŽā§āĻŽā§āĻŦāĻžāϰāĻāĻŖ āĻĒā§āϰāĻžāϝāĻŧāĻ āĻļāĻŋāĻā§āώāĻžāϰā§āĻĨā§āĻĻā§āϰ āύāĻŋāϝāĻŧā§ āĻŦāĻŋāĻāĻŋāύā§āύ āĻāĻŦā§āώāĻŖāĻž āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰā§āύ, \
āϝāĻžāϰ āĻŽāϧā§āϝ⧠āϰāϝāĻŧā§āĻā§ āϰā§āĻŦāĻ āĻĨā§āĻā§ āĻŽā§āĻļāĻŋāύ āϞāĻžāϰā§āύāĻŋāĻ āϏāĻŋāϏā§āĻā§āĻŽ āĻ āĻāϰā§āĻāĻŋāĻĢāĻŋāĻļāĻŋāϝāĻŧāĻžāϞ āĻāύā§āĻā§āϞāĻŋāĻā§āύā§āϏāĨ¤ \
āĻāϏāĻāϞ āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰāĻžāϰ āĻŽāĻžāϧā§āϝāĻŽā§ āϏāĻāĻļā§āϞāĻŋāώā§āĻ āĻā§āώā§āϤā§āϰ⧠āϝāĻĨā§āώā§āĻ āĻĒāϰāĻŋāĻŽāĻžāĻŖ āϏā§āĻĒā§āĻļāĻžāϞāĻžāĻāĻāĻĄ āĻšāĻāϝāĻŧāĻž āϏāĻŽā§āĻāĻŦāĨ¤ \
āĻāϰ āĻāĻŦā§āώāĻŖāĻžāϰ āĻāĻžāĻ āϤā§āĻŽāĻžāϰ āĻā§āϝāĻžāϰāĻŋāϝāĻŧāĻžāϰāĻā§ āĻ ā§āϞ⧠āύāĻŋāϝāĻŧā§ āϝāĻžāĻŦā§ āĻ
āύā§āĻāĻāĻžāύāĻŋ! \
āĻāύā§āĻā§āϏā§āĻ āĻĒā§āϰā§āĻā§āϰāĻžāĻŽāĻžāϰ āĻšāĻ, āĻāĻŦā§āώāĻ āĻāĻŋāĻāĻŦāĻž āĻĄā§āĻā§āϞāĻĒāĻžāϰ - āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻāĻāύāĻŋāĻāĻžāϰā§āϏāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāϰ āĻĒā§āϰāϤāĻŋāĻāĻž āĻŦāĻŋāĻāĻžāĻļā§āϰ āϏā§āϝā§āĻ āϰāϝāĻŧā§āĻā§āĻāĨ¤ \
āύāϰā§āĻĨ āϏāĻžāĻāĻĨā§āϰ āĻ
āϏāĻžāϧāĻžāϰāĻŖ āĻāĻŽāĻŋāĻāύāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāĻā§ āϏāĻžāĻĻāϰ āĻāĻŽāύā§āϤā§āϰāĻŖāĨ¤"""

    tokens = bn_tokenizer(text)
    assert len(tokens) == 84