spaCy/spacy/tests/lang/hi/test_lex_attrs.py

# coding: utf-8
from __future__ import unicode_literals
import pytest

from spacy.lang.hi.lex_attrs import norm, like_num
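

# Tokenizing this Hindi passage with the hi_tokenizer fixture should yield
# exactly 86 tokens.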
def test_hi_tokenizer_handles_long_text(hi_tokenizer):
    text = """
कह 1900 दशक शल (ि जयकर) पत चलत ि उसक
, वद (हर ) पस घर रह वद 10 पहल
पढ करन ि गय उसक टन शल अपन पड
रहन ि (िरण ) बत इस खबर
"""
    tokens = hi_tokenizer(text)
    assert len(tokens) == 86
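

# norm() should reduce inflected Hindi forms to a base form, e.g.
# चलता ("walks") -> चल.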
@pytest.mark.parametrize(
    "word,word_norm",
    [
        ("चलता", "चल"),
        ("पढ़ाई", "पढ़"),
        ("देती", "दे"),
        ("जाती", ""),
        ("मुस्कुराकर", "मुस्कुर"),
    ],
)
def test_hi_norm(word, word_norm):
    assert norm(word) == word_norm
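

# like_num() should flag Devanagari and Western digits, digit strings with
# separators, fraction-like strings, and Hindi number words.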
@pytest.mark.parametrize(
    "word", ["१९८७", "1987", "१२,२६७", "उन्नीस", "पाँच", "नवासी", "५/१०"]
)
def test_hi_like_num(word):
    assert like_num(word)
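

# Hindi ordinal words (e.g. पहला "first", तृतीय "third") should also be
# recognized as number-like.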
@pytest.mark.parametrize(
    "word", ["पहला", "तृतीय", "निन्यानवेवाँ", "उन्नीस", "तिहत्तरवाँ", "छत्तीसवाँ"]
)
def test_hi_like_num_ordinal_words(word):
    assert like_num(word)