spaCy/spacy/tests/lang/mul/test_tokenizer.py

import pytest

MUL_BASIC_TOKENIZATION_TESTS = [
    (
        "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
        [
            "Lääʹddjânnmest",
            "lie",
            "nuʹtt",
            "10",
            "000",
            "säʹmmliʹžžed",
            ".",
            "Seeʹst",
            "pâʹjjel",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
    tokens = mul_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
-												New tests for a number of alpha languages (#9703)

* Added Slovak

* Added Slovenian tests

* Added Estonian tests

* Added Croatian tests

* Added Latvian tests

* Added Icelandic tests

* Added Afrikaans tests

* Added language-independent tests

* Added Kannada tests

* Tidied up

* Added Albanian tests

* Formatted with black

* Added failing tests for anomalies

* Update spacy/tests/lang/af/test_text.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Estonian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Croatian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Icelandic tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Latvian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovak tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovenian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-11-28 23:59:23 +03:00
+								import pytest
-												Rename language codes (Icelandic, multi-language) (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2023-01-31 19:30:43 +03:00
+								MUL_BASIC_TOKENIZATION_TESTS = [
-												New tests for a number of alpha languages (#9703)

* Added Slovak

* Added Slovenian tests

* Added Estonian tests

* Added Croatian tests

* Added Latvian tests

* Added Icelandic tests

* Added Afrikaans tests

* Added language-independent tests

* Added Kannada tests

* Tidied up

* Added Albanian tests

* Formatted with black

* Added failing tests for anomalies

* Update spacy/tests/lang/af/test_text.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Estonian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Croatian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Icelandic tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Latvian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovak tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovenian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-11-28 23:59:23 +03:00
+								    (
 								        "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
 								        [
 								            "Lääʹddjânnmest",
 								            "lie",
 								            "nuʹtt",
 								            "10",
 								            "000",
 								            "säʹmmliʹžžed",
 								            ".",
 								            "Seeʹst",
 								            "pâʹjjel",
 								        ],
 								    ),
 								]
-												Rename language codes (Icelandic, multi-language) (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2023-01-31 19:30:43 +03:00
+								@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
 								def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
 								    tokens = mul_tokenizer(text)
-												New tests for a number of alpha languages (#9703)

* Added Slovak

* Added Slovenian tests

* Added Estonian tests

* Added Croatian tests

* Added Latvian tests

* Added Icelandic tests

* Added Afrikaans tests

* Added language-independent tests

* Added Kannada tests

* Tidied up

* Added Albanian tests

* Formatted with black

* Added failing tests for anomalies

* Update spacy/tests/lang/af/test_text.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Estonian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Croatian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Icelandic tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Latvian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovak tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovenian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-11-28 23:59:23 +03:00
+								    token_list = [token.text for token in tokens if not token.is_space]
 								    assert expected_tokens == token_list