spaCy/spacy/tests/lang/xx/test_tokenizer.py

import pytest

# Raw input sentence for the basic tokenization case. It contains words with
# an apostrophe-like modifier character inside them, a space-separated number
# ("10 000") and a sentence-final period attached to the preceding word.
_BASIC_SAMPLE_TEXT = "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel"

# Expected non-whitespace tokens, in order: the number is split on its space
# and the period is a token of its own.
_BASIC_SAMPLE_TOKENS = [
    "Lääʹddjânnmest",
    "lie",
    "nuʹtt",
    "10",
    "000",
    "säʹmmliʹžžed",
    ".",
    "Seeʹst",
    "pâʹjjel",
]

# (text, expected_tokens) pairs consumed by the parametrized test below.
XX_BASIC_TOKENIZATION_TESTS = [(_BASIC_SAMPLE_TEXT, _BASIC_SAMPLE_TOKENS)]


@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
    """Check that ``xx_tokenizer`` splits ``text`` into ``expected_tokens``.

    The tokenizer fixture is called on the raw text; tokens flagged as
    whitespace (``is_space``) are ignored, and the remaining token texts must
    equal the expected list exactly, in order.
    """
    doc = xx_tokenizer(text)
    observed = []
    for tok in doc:
        # Whitespace tokens are not part of the expected output.
        if tok.is_space:
            continue
        observed.append(tok.text)
    assert expected_tokens == observed
-												New tests for a number of alpha languages (#9703)

* Added Slovak

* Added Slovenian tests

* Added Estonian tests

* Added Croatian tests

* Added Latvian tests

* Added Icelandic tests

* Added Afrikaans tests

* Added language-independent tests

* Added Kannada tests

* Tidied up

* Added Albanian tests

* Formatted with black

* Added failing tests for anomalies

* Update spacy/tests/lang/af/test_text.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Estonian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Croatian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Icelandic tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Latvian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovak tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovenian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-11-28 23:59:23 +03:00
+								import pytest
 								XX_BASIC_TOKENIZATION_TESTS = [
 								    (
 								        "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
 								        [
 								            "Lääʹddjânnmest",
 								            "lie",
 								            "nuʹtt",
 								            "10",
 								            "000",
 								            "säʹmmliʹžžed",
 								            ".",
 								            "Seeʹst",
 								            "pâʹjjel",
 								        ],
 								    ),
 								]
 								@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
 								def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
 								    tokens = xx_tokenizer(text)
 								    token_list = [token.text for token in tokens if not token.is_space]
 								    assert expected_tokens == token_list