spaCy/spacy/tests/lang/lt/test_text.py

import pytest


def test_lt_tokenizer_handles_long_text(lt_tokenizer):
    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
    tokens = lt_tokenizer(text)
    assert len(tokens) == 42


@pytest.mark.parametrize(
    "text,length",
    [
        (
            "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
            17,
        ),
        (
            "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
            18,
        ),
    ],
)
def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
    tokens = lt_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize("text", ["km.", "pvz.", "biol."])
def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
    tokens = lt_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("999.0", True),
        ("vienas", True),
        ("du", True),
        ("milijardas", True),
        ("šuo", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
    tokens = lt_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
-												Lithuanian language support (#3895)

* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments form lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black

											
										
										
											2019-07-08 11:25:22 +03:00
+								import pytest
 								def test_lt_tokenizer_handles_long_text(lt_tokenizer):
-												Fix test

											
										
										
											2019-07-11 13:16:43 +03:00
+								    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
 								    tokens = lt_tokenizer(text)
-												Lithuanian language support (#3895)

* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments form lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black

											
										
										
											2019-07-08 11:25:22 +03:00
+								    assert len(tokens) == 42
-												Tidy up and auto-format

											
										
										
											2019-07-11 13:02:25 +03:00
+								@pytest.mark.parametrize(
 								    "text,length",
 								    [
 								        (
 								            "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
-												Improve Lithuanian tokenization (#5205)

* Improve Lithuanian tokenization

Modify Lithuanian tokenization to improve performance for
UD_Lithuanian-ALKSNIS.

* Update Lithuanian tokenizer tests
											
										
										
											2020-03-25 13:28:12 +03:00
+,
-												Tidy up and auto-format

											
										
										
											2019-07-11 13:02:25 +03:00
+								        ),
 								        (
 								            "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
-												Improve Lithuanian tokenization (#5205)

* Improve Lithuanian tokenization

Modify Lithuanian tokenization to improve performance for
UD_Lithuanian-ALKSNIS.

* Update Lithuanian tokenizer tests
											
										
										
											2020-03-25 13:28:12 +03:00
+,
-												Tidy up and auto-format

											
										
										
											2019-07-11 13:02:25 +03:00
+								        ),
 								    ],
 								)
-												Lithuanian language support (#3895)

* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments form lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black

											
										
										
											2019-07-08 11:25:22 +03:00
+								def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
 								    tokens = lt_tokenizer(text)
 								    assert len(tokens) == length
 								@pytest.mark.parametrize("text", ["km.", "pvz.", "biol."])
 								def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
 								    tokens = lt_tokenizer(text)
-												Improve Lithuanian tokenization (#5205)

* Improve Lithuanian tokenization

Modify Lithuanian tokenization to improve performance for
UD_Lithuanian-ALKSNIS.

* Update Lithuanian tokenizer tests
											
										
										
											2020-03-25 13:28:12 +03:00
+								    assert len(tokens) == 2
-												Lithuanian language support (#3895)

* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments form lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black

											
										
										
											2019-07-08 11:25:22 +03:00
-												Tidy up and auto-format

											
										
										
											2019-07-11 13:02:25 +03:00
+								@pytest.mark.parametrize(
 								    "text,match",
 								    [
 								        ("10", True),
 								        ("1", True),
 								        ("10,000", True),
 								        ("10,00", True),
 								        ("999.0", True),
 								        ("vienas", True),
 								        ("du", True),
 								        ("milijardas", True),
 								        ("šuo", False),
 								        (",", False),
 								        ("1/2", True),
 								    ],
 								)
-												Lithuanian language support (#3895)

* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments form lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black

											
										
										
											2019-07-08 11:25:22 +03:00
+								def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
 								    tokens = lt_tokenizer(text)
 								    assert len(tokens) == 1
 								    assert tokens[0].like_num == match