spaCy/spacy/tests/lang/am/test_text.py
Yosi cf52510631
Add Amharic አማርኛ Language support (#6583)
* Add Amharic to space

* clean up

* Add some PRON_LEMMA

* add Tigrinya support

* remove text_noun_chunks

* Tigrinya Support

* added some more details for ti

* fix unit test

* add amharic char range

* changes from review

* amharic and tigrinya share same unicode block

* get rid of _amharic/_tigrinya in char_classes

Co-authored-by: Josiah Solomon <jsolomon@meteorcomm.com>
2020-12-22 16:50:34 +01:00

55 lines
1.9 KiB
Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.am.lex_attrs import like_num
def test_am_tokenizer_handles_long_text(am_tokenizer):
    """Check the token count produced for a multi-line Amharic passage.

    ``am_tokenizer`` is not defined in this module; it is presumably a pytest
    fixture provided by the test suite's conftest. The test only relies on it
    being callable on a string and returning something with a length.
    """
    # The continuation lines of the literal start at column 0 on purpose:
    # any indentation added there would become part of the tokenized text.
    text = """ሆሴ ሙጂካ በበጋ ወቅት በኦክስፎርድ ንግግር አንድያቀርቡ ሲጋበዙ ጭንቅላታቸው "ፈነዳ"
“እጅግ ጥንታዊ” የእንግሊዝኛ ተናጋሪ ዩኒቨርስቲ፣ በአስር ሺዎች የሚቆጠሩ ዩሮዎችን ለተማሪዎች በማስተማር የሚያስከፍለው
እና ከማርጋሬት ታቸር እስከ ስቲቨን ሆኪንግ በአዳራሾቻቸው ውስጥ ንግግር ያደረጉበት የትምህርት ማዕከል፣ በሞንቴቪዴኦ
በሚገኘው የመንግስት ትምህርት ቤት የሰለጠኑትን የ81 ዓመቱ አዛውንት አገልግሎት ጠየቁ።"""
    tokens = am_tokenizer(text)
    assert len(tokens) == 56
@pytest.mark.parametrize(
    "text,length",
    [
        ("ሆሴ ሙጂካ ለምን ተመረጠ?", 5),
        ("“በፍፁም?”", 4),
        ("""አዎ! ሆዜ አርካዲዮ ቡንዲያ “እንሂድ” ሲል መለሰ።""", 11),
        ("እነሱ በግምት 10ኪ.ሜ. ሮጡ።", 7),
        ("እና ከዚያ ለምን...", 4),
    ],
)
def test_am_tokenizer_handles_cnts(am_tokenizer, text, length):
    """Check that each sample sentence is split into the expected token count.

    ``text`` and ``length`` come from the parametrize table above;
    ``am_tokenizer`` is presumably a fixture supplied by the test suite.
    """
    tokens = am_tokenizer(text)
    assert len(tokens) == length
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10.000", True),
        ("1000", True),
        ("999,0", True),
        ("አንድ", True),
        ("ሁለት", True),
        ("ትሪሊዮን", True),
        ("ውሻ", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(am_tokenizer, text, match):
    """Check the ``like_num`` flag of the token produced for ``text``.

    ``text`` and ``match`` come from the parametrize table above;
    ``am_tokenizer`` is presumably a fixture supplied by the test suite.
    """
    tokens = am_tokenizer(text)
    # Every sample must stay a single token; otherwise checking only
    # tokens[0] would not cover the whole input.
    assert len(tokens) == 1
    assert tokens[0].like_num == match