spaCy/spacy/tests/lang/bn/test_tokenizer.py

import pytest

# fmt: off
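# Each case pairs an input text with the list of token strings the tokenizer is expected to produce.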
TESTCASES = [
# Punctuation tests
("āφāĻŽāĻŋ āĻŦāĻžāĻ‚āϞāĻžāϝāĻŧ āĻ—āĻžāύ āĻ—āĻžāχ!", ["āφāĻŽāĻŋ", "āĻŦāĻžāĻ‚āϞāĻžāϝāĻŧ", "āĻ—āĻžāύ", "āĻ—āĻžāχ", "!"]),
("āφāĻŽāĻŋ āĻŦāĻžāĻ‚āϞāĻžāϝāĻŧ āĻ•āĻĨāĻž āĻ•āχāĨ¤", ["āφāĻŽāĻŋ", "āĻŦāĻžāĻ‚āϞāĻžāϝāĻŧ", "āĻ•āĻĨāĻž", "āĻ•āχ", "āĨ¤"]),
("āĻŦāϏ⧁āĻ¨ā§āϧāϰāĻž āϜāύāϏāĻŽā§āĻŽā§āϖ⧇ āĻĻā§‹āώ āĻ¸ā§āĻŦā§€āĻ•āĻžāϰ āĻ•āϰāϞ⧋ āύāĻž?", ["āĻŦāϏ⧁āĻ¨ā§āϧāϰāĻž", "āϜāύāϏāĻŽā§āĻŽā§āϖ⧇", "āĻĻā§‹āώ", "āĻ¸ā§āĻŦā§€āĻ•āĻžāϰ", "āĻ•āϰāϞ⧋", "āύāĻž", "?"]),
("āϟāĻžāĻ•āĻž āĻĨāĻžāĻ•āϞ⧇ āĻ•āĻŋ āύāĻž āĻšāϝāĻŧ!", ["āϟāĻžāĻ•āĻž", "āĻĨāĻžāĻ•āϞ⧇", "āĻ•āĻŋ", "āύāĻž", "āĻšāϝāĻŧ", "!"]),
("āϏāϰāĻ•āĻžāϰāĻŋ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞ⧟-āĻāϰ āĻ›āĻžāĻ¤ā§āϰ āύāχ āĻŦāϞ⧇āχ āĻ•āĻŋ āĻāĻŽāύ āφāϚāϰāĻŖ?", ["āϏāϰāĻ•āĻžāϰāĻŋ", "āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞ⧟", "-", "āĻāϰ", "āĻ›āĻžāĻ¤ā§āϰ", "āύāχ", "āĻŦāϞ⧇āχ", "āĻ•āĻŋ", "āĻāĻŽāύ", "āφāϚāϰāĻŖ", "?"]),
('āϤāĻžāϰāĻž āĻŦāϞ⧇, "āĻ“āϰāĻž āĻ–āĻžāĻŽāĻžāϰ⧇āϰ āĻŽā§āϰāĻ—āĻŋāĨ¤"', ["āϤāĻžāϰāĻž", "āĻŦāϞ⧇", ",", '"', "āĻ“āϰāĻž", "āĻ–āĻžāĻŽāĻžāϰ⧇āϰ", "āĻŽā§āϰāĻ—āĻŋ", "āĨ¤", '"']),
("ā§Š*ā§Š=ā§Ŧ?", ["ā§Š", "*", "ā§Š", "=", "ā§Ŧ", "?"]),
("āĻ•āĻžāρāĻ āĻžāϞ-āĻāϰ āĻ—āĻ¨ā§āϧāχ āĻ…āĻ¨ā§āϝāϰāĻ•āĻŽ", ["āĻ•āĻžāρāĻ āĻžāϞ", "-", "āĻāϰ", "āĻ—āĻ¨ā§āϧāχ", "āĻ…āĻ¨ā§āϝāϰāĻ•āĻŽ"]),
# Abbreviations
("āĻĄāσ āĻ–āĻžāϞ⧇āĻĻ āĻŦāϞāϞ⧇āύ āĻĸāĻžāĻ•āĻžāϝāĻŧ ā§Šā§Ģ āĻĄāĻŋāĻ—ā§āϰāĻŋ āϏ⧇.āĨ¤", ["āĻĄāσ", "āĻ–āĻžāϞ⧇āĻĻ", "āĻŦāϞāϞ⧇āύ", "āĻĸāĻžāĻ•āĻžāϝāĻŧ", "ā§Šā§Ģ", "āĻĄāĻŋāĻ—ā§āϰāĻŋ", "āϏ⧇.", "āĨ¤"]),
]
# fmt: on


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
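    # Tokenize the input and compare against the expected tokens,
    # dropping any whitespace tokens the tokenizer may emit.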
    tokens = bn_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list


def test_bn_tokenizer_handles_long_text(bn_tokenizer):
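    # A longer multi-sentence paragraph; the tokenizer is expected to
    # produce exactly 84 tokens for this text.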
text = """āύāĻ°ā§āĻĨ āϏāĻžāωāĻĨ āĻŦāĻŋāĻļā§āĻŦāĻŦāĻŋāĻĻā§āϝāĻžāϞāϝāĻŧ⧇ āϏāĻžāϰāĻžāĻŦāĻ›āϰ āϕ⧋āύ āύāĻž āϕ⧋āύ āĻŦāĻŋāώāϝāĻŧ⧇ āĻ—āĻŦ⧇āώāĻŖāĻž āϚāϞāϤ⧇āχ āĻĨāĻžāϕ⧇āĨ¤ \
āĻ…āĻ­āĻŋāĻœā§āĻž āĻĢā§āϝāĻžāĻ•āĻžāĻ˛ā§āϟāĻŋ āĻŽā§‡āĻŽā§āĻŦāĻžāϰāĻ—āĻŖ āĻĒā§āϰāĻžāϝāĻŧāχ āĻļāĻŋāĻ•ā§āώāĻžāĻ°ā§āĻĨā§€āĻĻ⧇āϰ āύāĻŋāϝāĻŧ⧇ āĻŦāĻŋāĻ­āĻŋāĻ¨ā§āύ āĻ—āĻŦ⧇āώāĻŖāĻž āĻĒā§āϰāĻ•āĻ˛ā§āĻĒ⧇ āĻ•āĻžāϜ āĻ•āϰ⧇āύ, \
āϝāĻžāϰ āĻŽāĻ§ā§āϝ⧇ āϰāϝāĻŧ⧇āϛ⧇ āϰ⧋āĻŦāϟ āĻĨ⧇āϕ⧇ āĻŽā§‡āĻļāĻŋāύ āϞāĻžāĻ°ā§āύāĻŋāĻ‚ āϏāĻŋāĻ¸ā§āĻŸā§‡āĻŽ āĻ“ āφāĻ°ā§āϟāĻŋāĻĢāĻŋāĻļāĻŋāϝāĻŧāĻžāϞ āχāĻ¨ā§āĻŸā§‡āϞāĻŋāĻœā§‡āĻ¨ā§āϏāĨ¤ \
āĻāϏāĻ•āϞ āĻĒā§āϰāĻ•āĻ˛ā§āĻĒ⧇ āĻ•āĻžāϜ āĻ•āϰāĻžāϰ āĻŽāĻžāĻ§ā§āϝāĻŽā§‡ āϏāĻ‚āĻļā§āϞāĻŋāĻˇā§āϟ āĻ•ā§āώ⧇āĻ¤ā§āϰ⧇ āϝāĻĨ⧇āĻˇā§āĻ  āĻĒāϰāĻŋāĻŽāĻžāĻŖ āĻ¸ā§āĻĒ⧇āĻļāĻžāϞāĻžāχāϜāĻĄ āĻšāĻ“āϝāĻŧāĻž āϏāĻŽā§āĻ­āĻŦāĨ¤ \
āφāϰ āĻ—āĻŦ⧇āώāĻŖāĻžāϰ āĻ•āĻžāϜ āϤ⧋āĻŽāĻžāϰ āĻ•ā§āϝāĻžāϰāĻŋāϝāĻŧāĻžāϰāϕ⧇ āϠ⧇āϞ⧇ āύāĻŋāϝāĻŧ⧇ āϝāĻžāĻŦ⧇ āĻ…āύ⧇āĻ•āĻ–āĻžāύāĻŋ! \
āĻ•āĻ¨ā§āĻŸā§‡āĻ¸ā§āϟ āĻĒā§āϰ⧋āĻ—ā§āϰāĻžāĻŽāĻžāϰ āĻšāĻ“, āĻ—āĻŦ⧇āώāĻ• āĻ•āĻŋāĻ‚āĻŦāĻž āĻĄā§‡āϭ⧇āϞāĻĒāĻžāϰ - āύāĻ°ā§āĻĨ āϏāĻžāωāĻĨ āχāωāύāĻŋāĻ­āĻžāĻ°ā§āϏāĻŋāϟāĻŋāϤ⧇ āϤ⧋āĻŽāĻžāϰ āĻĒā§āϰāϤāĻŋāĻ­āĻž āĻŦāĻŋāĻ•āĻžāĻļ⧇āϰ āϏ⧁āϝ⧋āĻ— āϰāϝāĻŧ⧇āϛ⧇āχāĨ¤ \
āύāĻ°ā§āĻĨ āϏāĻžāωāĻĨ⧇āϰ āĻ…āϏāĻžāϧāĻžāϰāĻŖ āĻ•āĻŽāĻŋāωāύāĻŋāϟāĻŋāϤ⧇ āϤ⧋āĻŽāĻžāϕ⧇ āϏāĻžāĻĻāϰ āφāĻŽāĻ¨ā§āĻ¤ā§āϰāĻŖāĨ¤"""
    tokens = bn_tokenizer(text)
    assert len(tokens) == 84