mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-25 03:13:41 +03:00
41 lines
3.0 KiB
Python
41 lines
3.0 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
# Shared pool of (text, expected_tokens) pairs. The case groups defined later
# in this module are appended to it, and the combined list is then passed to
# pytest.mark.parametrize.
TESTCASES = []
|
|
|
|
# Sentences that end in a punctuation mark ('!', '?' or the danda '।').
# Each entry pairs the raw sentence with the expected token texts: every
# whitespace-separated word stays whole and the closing mark is listed as a
# separate final token.
PUNCTUATION_TESTS = [
    (
        u'আমি বাংলায় গান গাই!',
        [u'আমি', u'বাংলায়', u'গান', u'গাই', u'!'],
    ),
    (
        u'আমি বাংলায় কথা কই।',
        [u'আমি', u'বাংলায়', u'কথা', u'কই', u'।'],
    ),
    (
        u'বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?',
        [u'বসুন্ধরা', u'জনসম্মুখে', u'দোষ', u'স্বীকার', u'করলো', u'না', u'?'],
    ),
    (
        u'টাকা থাকলে কি না হয়!',
        [u'টাকা', u'থাকলে', u'কি', u'না', u'হয়', u'!'],
    ),
]
|
|
|
|
# A sentence whose last word is an abbreviation written with a period
# ('সে.'). The expected tokens keep that period attached to the abbreviation
# and list only the closing danda '।' as a separate token.
ABBREVIATIONS = [
    (
        u'ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।',
        [u'ডঃ', u'খালেদ', u'বললেন', u'ঢাকায়', u'৩৫', u'ডিগ্রি', u'সে.', u'।'],
    ),
]
|
|
|
|
# Fold each case group into the shared pool, in place and in declaration
# order, so the parametrized test sees punctuation cases first.
TESTCASES += PUNCTUATION_TESTS
TESTCASES += ABBREVIATIONS
|
|
|
|
|
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    """Tokenize ``text`` and compare the result with ``expected_tokens``.

    Tokens flagged as whitespace (``is_space``) are left out before the
    comparison, so only the remaining token texts have to match.
    """
    doc = bn_tokenizer(text)

    observed = []
    for tok in doc:
        if tok.is_space:
            # Whitespace tokens are not part of the expectation lists.
            continue
        observed.append(tok.text)

    assert expected_tokens == observed
|
|
|
|
|
|
def test_tokenizer_handles_long_text(bn_tokenizer):
    """Check the number of tokens produced for a multi-sentence passage.

    Every physical line of the literal below ends in a backslash, so the
    passage reaches the tokenizer as one line with single spaces between
    sentences and no newline characters.
    """
    text = u"""নর্থ সাউথ বিশ্ববিদ্যালয়ে সারাবছর কোন না কোন বিষয়ে গবেষণা চলতেই থাকে। \
অভিজ্ঞ ফ্যাকাল্টি মেম্বারগণ প্রায়ই শিক্ষার্থীদের নিয়ে বিভিন্ন গবেষণা প্রকল্পে কাজ করেন, \
যার মধ্যে রয়েছে রোবট থেকে মেশিন লার্নিং সিস্টেম ও আর্টিফিশিয়াল ইন্টেলিজেন্স। \
এসকল প্রকল্পে কাজ করার মাধ্যমে সংশ্লিষ্ট ক্ষেত্রে যথেষ্ঠ পরিমাণ স্পেশালাইজড হওয়া সম্ভব। \
আর গবেষণার কাজ তোমার ক্যারিয়ারকে ঠেলে নিয়ে যাবে অনেকখানি! \
কন্টেস্ট প্রোগ্রামার হও, গবেষক কিংবা ডেভেলপার - নর্থ সাউথ ইউনিভার্সিটিতে তোমার প্রতিভা বিকাশের সুযোগ রয়েছেই। \
নর্থ সাউথের অসাধারণ কমিউনিটিতে তোমাকে সাদর আমন্ত্রণ।"""

    tokens = bn_tokenizer(text)
    # The passage holds 75 whitespace-separated words and 9 punctuation marks
    # (five dandas, two commas, one '!' and one free-standing '-'); 84 is the
    # total when each of those is counted as one token.
    assert len(tokens) == 84
|