mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-24 19:11:58 +03:00
* Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo
37 lines
3.5 KiB
Python
import pytest
|
|
|
|
# fmt: off
|
|
TESTCASES = [
|
|
# Punctuation tests
|
|
("āĻāĻŽāĻŋ āĻŦāĻžāĻāϞāĻžāϝāĻŧ āĻāĻžāύ āĻāĻžāĻ!", ["āĻāĻŽāĻŋ", "āĻŦāĻžāĻāϞāĻžāϝāĻŧ", "āĻāĻžāύ", "āĻāĻžāĻ", "!"]),
|
|
("āĻāĻŽāĻŋ āĻŦāĻžāĻāϞāĻžāϝāĻŧ āĻāĻĨāĻž āĻāĻāĨ¤", ["āĻāĻŽāĻŋ", "āĻŦāĻžāĻāϞāĻžāϝāĻŧ", "āĻāĻĨāĻž", "āĻāĻ", "āĨ¤"]),
|
|
("āĻŦāϏā§āύā§āϧāϰāĻž āĻāύāϏāĻŽā§āĻŽā§āĻā§ āĻĻā§āώ āϏā§āĻŦā§āĻāĻžāϰ āĻāϰāϞ⧠āύāĻž?", ["āĻŦāϏā§āύā§āϧāϰāĻž", "āĻāύāϏāĻŽā§āĻŽā§āĻā§", "āĻĻā§āώ", "āϏā§āĻŦā§āĻāĻžāϰ", "āĻāϰāϞā§", "āύāĻž", "?"]),
|
|
("āĻāĻžāĻāĻž āĻĨāĻžāĻāϞ⧠āĻāĻŋ āύāĻž āĻšāϝāĻŧ!", ["āĻāĻžāĻāĻž", "āĻĨāĻžāĻāϞā§", "āĻāĻŋ", "āύāĻž", "āĻšāϝāĻŧ", "!"]),
|
|
("āϏāϰāĻāĻžāϰāĻŋ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞā§-āĻāϰ āĻāĻžāϤā§āϰ āύāĻ āĻŦāϞā§āĻ āĻāĻŋ āĻāĻŽāύ āĻāĻāϰāĻŖ?", ["āϏāϰāĻāĻžāϰāĻŋ", "āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞā§", "-", "āĻāϰ", "āĻāĻžāϤā§āϰ", "āύāĻ", "āĻŦāϞā§āĻ", "āĻāĻŋ", "āĻāĻŽāύ", "āĻāĻāϰāĻŖ", "?"]),
|
|
('āϤāĻžāϰāĻž āĻŦāϞā§, "āĻāϰāĻž āĻāĻžāĻŽāĻžāϰā§āϰ āĻŽā§āϰāĻāĻŋāĨ¤"', ["āϤāĻžāϰāĻž", "āĻŦāϞā§", ",", '"', "āĻāϰāĻž", "āĻāĻžāĻŽāĻžāϰā§āϰ", "āĻŽā§āϰāĻāĻŋ", "āĨ¤", '"']),
|
|
("ā§Š*ā§Š=ā§Ŧ?", ["ā§Š", "*", "ā§Š", "=", "ā§Ŧ", "?"]),
|
|
("āĻāĻžāĻāĻ āĻžāϞ-āĻāϰ āĻāύā§āϧāĻ āĻ
āύā§āϝāϰāĻāĻŽ", ["āĻāĻžāĻāĻ āĻžāϞ", "-", "āĻāϰ", "āĻāύā§āϧāĻ", "āĻ
āύā§āϝāϰāĻāĻŽ"]),
|
|
# Abbreviations
|
|
("āĻĄāĻ āĻāĻžāϞā§āĻĻ āĻŦāϞāϞā§āύ āĻĸāĻžāĻāĻžāϝāĻŧ ā§Šā§Ģ āĻĄāĻŋāĻā§āϰāĻŋ āϏā§.āĨ¤", ["āĻĄāĻ", "āĻāĻžāϞā§āĻĻ", "āĻŦāϞāϞā§āύ", "āĻĸāĻžāĻāĻžāϝāĻŧ", "ā§Šā§Ģ", "āĻĄāĻŋāĻā§āϰāĻŋ", "āϏā§.", "āĨ¤"]),
|
|
]
|
|
# fmt: on
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    """The non-space token texts of each case must match the expected list.

    Whitespace tokens are filtered out before comparison, so only the
    lexical tokens (words and punctuation) are checked.
    """
    doc = bn_tokenizer(text)
    observed = [token.text for token in doc if not token.is_space]
    assert observed == expected_tokens
|
|
|
|
|
|
def test_bn_tokenizer_handles_long_text(bn_tokenizer):
|
|
text = """āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞāϝāĻŧā§ āϏāĻžāϰāĻžāĻŦāĻāϰ āĻā§āύ āύāĻž āĻā§āύ āĻŦāĻŋāώāϝāĻŧā§ āĻāĻŦā§āώāĻŖāĻž āĻāϞāϤā§āĻ āĻĨāĻžāĻā§āĨ¤ \
|
|
āĻ
āĻāĻŋāĻā§āĻ āĻĢā§āϝāĻžāĻāĻžāϞā§āĻāĻŋ āĻŽā§āĻŽā§āĻŦāĻžāϰāĻāĻŖ āĻĒā§āϰāĻžāϝāĻŧāĻ āĻļāĻŋāĻā§āώāĻžāϰā§āĻĨā§āĻĻā§āϰ āύāĻŋāϝāĻŧā§ āĻŦāĻŋāĻāĻŋāύā§āύ āĻāĻŦā§āώāĻŖāĻž āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰā§āύ, \
|
|
āϝāĻžāϰ āĻŽāϧā§āϝ⧠āϰāϝāĻŧā§āĻā§ āϰā§āĻŦāĻ āĻĨā§āĻā§ āĻŽā§āĻļāĻŋāύ āϞāĻžāϰā§āύāĻŋāĻ āϏāĻŋāϏā§āĻā§āĻŽ āĻ āĻāϰā§āĻāĻŋāĻĢāĻŋāĻļāĻŋāϝāĻŧāĻžāϞ āĻāύā§āĻā§āϞāĻŋāĻā§āύā§āϏāĨ¤ \
|
|
āĻāϏāĻāϞ āĻĒā§āϰāĻāϞā§āĻĒā§ āĻāĻžāĻ āĻāϰāĻžāϰ āĻŽāĻžāϧā§āϝāĻŽā§ āϏāĻāĻļā§āϞāĻŋāώā§āĻ āĻā§āώā§āϤā§āϰ⧠āϝāĻĨā§āώā§āĻ āĻĒāϰāĻŋāĻŽāĻžāĻŖ āϏā§āĻĒā§āĻļāĻžāϞāĻžāĻāĻāĻĄ āĻšāĻāϝāĻŧāĻž āϏāĻŽā§āĻāĻŦāĨ¤ \
|
|
āĻāϰ āĻāĻŦā§āώāĻŖāĻžāϰ āĻāĻžāĻ āϤā§āĻŽāĻžāϰ āĻā§āϝāĻžāϰāĻŋāϝāĻŧāĻžāϰāĻā§ āĻ ā§āϞ⧠āύāĻŋāϝāĻŧā§ āϝāĻžāĻŦā§ āĻ
āύā§āĻāĻāĻžāύāĻŋ! \
|
|
āĻāύā§āĻā§āϏā§āĻ āĻĒā§āϰā§āĻā§āϰāĻžāĻŽāĻžāϰ āĻšāĻ, āĻāĻŦā§āώāĻ āĻāĻŋāĻāĻŦāĻž āĻĄā§āĻā§āϞāĻĒāĻžāϰ - āύāϰā§āĻĨ āϏāĻžāĻāĻĨ āĻāĻāύāĻŋāĻāĻžāϰā§āϏāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāϰ āĻĒā§āϰāϤāĻŋāĻāĻž āĻŦāĻŋāĻāĻžāĻļā§āϰ āϏā§āϝā§āĻ āϰāϝāĻŧā§āĻā§āĻāĨ¤ \
|
|
āύāϰā§āĻĨ āϏāĻžāĻāĻĨā§āϰ āĻ
āϏāĻžāϧāĻžāϰāĻŖ āĻāĻŽāĻŋāĻāύāĻŋāĻāĻŋāϤ⧠āϤā§āĻŽāĻžāĻā§ āϏāĻžāĻĻāϰ āĻāĻŽāύā§āϤā§āϰāĻŖāĨ¤"""
|
|
tokens = bn_tokenizer(text)
|
|
assert len(tokens) == 84
|