mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
61ce126d4c
* initial LT lang support * Added more stopwords. Started setting up some basic test environment (not complete) * Initial morph rules for LT lang * Closes #1 Adds tokenizer exceptions for Lithuanian * Closes #5 Punctuation rules. Closes #6 Lexical Attributes * test: add native examples to basic tests * feat: add tag map for lt lang * fix: remove undefined tag attribute 'Definite' * feat: add lemmatizer for lt lang * refactor: add new instances to lt lang morph rules; use tags from tag map * refactor: add morph rules to lt lang defaults * refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup * refactor: add capitalized words to lt lang lemmatizer * refactor: add more num words to lt lang lex attrs * refactor: update lt lang stop word set * refactor: add new instances to lt lang tokenizer exceptions * refactor: remove comments from lt lang init file * refactor: use function instead of lambda in lt lex lang getter * refactor: remove conversion to dict in lt init when dict is already provided * chore: rename lt 'test_basic' to 'test_text' * feat: add more lt text tests * feat: add lemmatizer tests * refactor: remove unused imports, add newline to end of file * chore: add contributor agreement * chore: change 'en' to 'lt' in lt example description * fix: add missing encoding info * style: add newline to end of file * refactor: use python2 compatible syntax * style: reformat code using black
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
# coding: utf-8
|
||
from __future__ import unicode_literals
|
||
|
||
import pytest
|
||
|
||
|
||
def test_lt_tokenizer_handles_long_text(lt_tokenizer):
    """Check the token count of a multi-sentence Lithuanian passage.

    The passage is assembled from adjacent string literals, each of the first
    two ending in an explicit space. The previous form was a triple-quoted
    string whose newlines were stripped with ``replace("\\n", "")``; that only
    kept the words at the line joins apart if every line ended in trailing
    whitespace, which editors and formatters routinely remove.

    :param lt_tokenizer: callable that takes the text and returns a sized
        collection of tokens (supplied as a pytest fixture).
    """
    text = (
        "Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią "
        "vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis "
        "yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."
    )
    # No newline stripping needed: the literal above contains no line breaks.
    tokens = lt_tokenizer(text)
    assert len(tokens) == 42
|
||
|
||
|
||
@pytest.mark.parametrize(
    'text,length',
    [
        (
            "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
            15,
        ),
        (
            "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
            16,
        ),
    ],
)
def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
    """Tokenize a sentence with punctuation and abbreviations and check its length.

    Each parametrized case pairs a sentence with the number of tokens the
    tokenizer is expected to return for it.
    """
    doc = lt_tokenizer(text)
    token_count = len(doc)
    assert token_count == length
|
||
|
||
|
||
@pytest.mark.parametrize(
    "text",
    [
        "km.",
        "pvz.",
        "biol.",
    ],
)
def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
    """An abbreviation with its trailing period must come back as one token."""
    doc = lt_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||
|
||
|
||
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("999.0", True),
        ("vienas", True),
        ("du", True),
        ("milijardas", True),
        ("šuo", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
    """Check the ``like_num`` flag of a single-token input.

    The input must tokenize to exactly one token, and that token's
    ``like_num`` attribute must equal the expected ``match`` value.
    """
    doc = lt_tokenizer(text)
    assert len(doc) == 1
    sole_token = doc[0]
    assert sole_token.like_num == match
|