mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-28 02:46:35 +03:00
b6e991440c
* Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility
41 lines
2.9 KiB
Python
41 lines
2.9 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
|
|
TESTCASES = [
|
|
# punctuation tests
|
|
("āĻāĻŽāĻŋ āĻŦāĻžāĻāĻ˛āĻžāĻ¯āĻŧ āĻāĻžāĻ¨ āĻāĻžāĻ!", ["āĻāĻŽāĻŋ", "āĻŦāĻžāĻāĻ˛āĻžāĻ¯āĻŧ", "āĻāĻžāĻ¨", "āĻāĻžāĻ", "!"]),
|
|
("āĻāĻŽāĻŋ āĻŦāĻžāĻāĻ˛āĻžāĻ¯āĻŧ āĻāĻĨāĻž āĻāĻāĨ¤", ["āĻāĻŽāĻŋ", "āĻŦāĻžāĻāĻ˛āĻžāĻ¯āĻŧ", "āĻāĻĨāĻž", "āĻāĻ", "āĨ¤"]),
|
|
(
|
|
"āĻŦāĻ¸ā§āĻ¨ā§āĻ§āĻ°āĻž āĻāĻ¨āĻ¸āĻŽā§āĻŽā§āĻā§ āĻĻā§āĻˇ āĻ¸ā§āĻŦā§āĻāĻžāĻ° āĻāĻ°āĻ˛ā§ āĻ¨āĻž?",
|
|
["āĻŦāĻ¸ā§āĻ¨ā§āĻ§āĻ°āĻž", "āĻāĻ¨āĻ¸āĻŽā§āĻŽā§āĻā§", "āĻĻā§āĻˇ", "āĻ¸ā§āĻŦā§āĻāĻžāĻ°", "āĻāĻ°āĻ˛ā§", "āĻ¨āĻž", "?"],
|
|
),
|
|
("āĻāĻžāĻāĻž āĻĨāĻžāĻāĻ˛ā§ āĻāĻŋ āĻ¨āĻž āĻšāĻ¯āĻŧ!", ["āĻāĻžāĻāĻž", "āĻĨāĻžāĻāĻ˛ā§", "āĻāĻŋ", "āĻ¨āĻž", "āĻšāĻ¯āĻŧ", "!"]),
|
|
# abbreviations
|
|
(
|
|
"āĻĄāĻ āĻāĻžāĻ˛ā§āĻĻ āĻŦāĻ˛āĻ˛ā§āĻ¨ āĻĸāĻžāĻāĻžāĻ¯āĻŧ ā§Šā§Ģ āĻĄāĻŋāĻā§āĻ°āĻŋ āĻ¸ā§.āĨ¤",
|
|
["āĻĄāĻ", "āĻāĻžāĻ˛ā§āĻĻ", "āĻŦāĻ˛āĻ˛ā§āĻ¨", "āĻĸāĻžāĻāĻžāĻ¯āĻŧ", "ā§Šā§Ģ", "āĻĄāĻŋāĻā§āĻ°āĻŋ", "āĻ¸ā§.", "āĨ¤"],
|
|
),
|
|
]
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
|
|
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
|
|
tokens = bn_tokenizer(text)
|
|
token_list = [token.text for token in tokens if not token.is_space]
|
|
assert expected_tokens == token_list
|
|
|
|
|
|
def test_bn_tokenizer_handles_long_text(bn_tokenizer):
|
|
text = """āĻ¨āĻ°ā§āĻĨ āĻ¸āĻžāĻāĻĨ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āĻ¯āĻžāĻ˛āĻ¯āĻŧā§ āĻ¸āĻžāĻ°āĻžāĻŦāĻāĻ° āĻā§āĻ¨ āĻ¨āĻž āĻā§āĻ¨ āĻŦāĻŋāĻˇāĻ¯āĻŧā§ āĻāĻŦā§āĻˇāĻŖāĻž āĻāĻ˛āĻ¤ā§āĻ āĻĨāĻžāĻā§āĨ¤ \
|
|
āĻ
āĻāĻŋāĻā§āĻ āĻĢā§āĻ¯āĻžāĻāĻžāĻ˛ā§āĻāĻŋ āĻŽā§āĻŽā§āĻŦāĻžāĻ°āĻāĻŖ āĻĒā§āĻ°āĻžāĻ¯āĻŧāĻ āĻļāĻŋāĻā§āĻˇāĻžāĻ°ā§āĻĨā§āĻĻā§āĻ° āĻ¨āĻŋāĻ¯āĻŧā§ āĻŦāĻŋāĻāĻŋāĻ¨ā§āĻ¨ āĻāĻŦā§āĻˇāĻŖāĻž āĻĒā§āĻ°āĻāĻ˛ā§āĻĒā§ āĻāĻžāĻ āĻāĻ°ā§āĻ¨, \
|
|
āĻ¯āĻžāĻ° āĻŽāĻ§ā§āĻ¯ā§ āĻ°āĻ¯āĻŧā§āĻā§ āĻ°ā§āĻŦāĻ āĻĨā§āĻā§ āĻŽā§āĻļāĻŋāĻ¨ āĻ˛āĻžāĻ°ā§āĻ¨āĻŋāĻ āĻ¸āĻŋāĻ¸ā§āĻā§āĻŽ āĻ āĻāĻ°ā§āĻāĻŋāĻĢāĻŋāĻļāĻŋāĻ¯āĻŧāĻžāĻ˛ āĻāĻ¨ā§āĻā§āĻ˛āĻŋāĻā§āĻ¨ā§āĻ¸āĨ¤ \
|
|
āĻāĻ¸āĻāĻ˛ āĻĒā§āĻ°āĻāĻ˛ā§āĻĒā§ āĻāĻžāĻ āĻāĻ°āĻžāĻ° āĻŽāĻžāĻ§ā§āĻ¯āĻŽā§ āĻ¸āĻāĻļā§āĻ˛āĻŋāĻˇā§āĻ āĻā§āĻˇā§āĻ¤ā§āĻ°ā§ āĻ¯āĻĨā§āĻˇā§āĻ āĻĒāĻ°āĻŋāĻŽāĻžāĻŖ āĻ¸ā§āĻĒā§āĻļāĻžāĻ˛āĻžāĻāĻāĻĄ āĻšāĻāĻ¯āĻŧāĻž āĻ¸āĻŽā§āĻāĻŦāĨ¤ \
|
|
āĻāĻ° āĻāĻŦā§āĻˇāĻŖāĻžāĻ° āĻāĻžāĻ āĻ¤ā§āĻŽāĻžāĻ° āĻā§āĻ¯āĻžāĻ°āĻŋāĻ¯āĻŧāĻžāĻ°āĻā§ āĻ ā§āĻ˛ā§ āĻ¨āĻŋāĻ¯āĻŧā§ āĻ¯āĻžāĻŦā§ āĻ
āĻ¨ā§āĻāĻāĻžāĻ¨āĻŋ! \
|
|
āĻāĻ¨ā§āĻā§āĻ¸ā§āĻ āĻĒā§āĻ°ā§āĻā§āĻ°āĻžāĻŽāĻžāĻ° āĻšāĻ, āĻāĻŦā§āĻˇāĻ āĻāĻŋāĻāĻŦāĻž āĻĄā§āĻā§āĻ˛āĻĒāĻžāĻ° - āĻ¨āĻ°ā§āĻĨ āĻ¸āĻžāĻāĻĨ āĻāĻāĻ¨āĻŋāĻāĻžāĻ°ā§āĻ¸āĻŋāĻāĻŋāĻ¤ā§ āĻ¤ā§āĻŽāĻžāĻ° āĻĒā§āĻ°āĻ¤āĻŋāĻāĻž āĻŦāĻŋāĻāĻžāĻļā§āĻ° āĻ¸ā§āĻ¯ā§āĻ āĻ°āĻ¯āĻŧā§āĻā§āĻāĨ¤ \
|
|
āĻ¨āĻ°ā§āĻĨ āĻ¸āĻžāĻāĻĨā§āĻ° āĻ
āĻ¸āĻžāĻ§āĻžāĻ°āĻŖ āĻāĻŽāĻŋāĻāĻ¨āĻŋāĻāĻŋāĻ¤ā§ āĻ¤ā§āĻŽāĻžāĻā§ āĻ¸āĻžāĻĻāĻ° āĻāĻŽāĻ¨ā§āĻ¤ā§āĻ°āĻŖāĨ¤"""
|
|
tokens = bn_tokenizer(text)
|
|
assert len(tokens) == 84
|