spaCy/spacy/lang/am/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA


# Special-case table for Amharic: every key is an exact surface string and
# every value is a one-element list holding the attribute dict for that string.

# Slash abbreviations that carry extra attributes besides their ORTH.
_annotated_abbreviations = [
    {ORTH: "ት/ቤት", LEMMA: "ትምህርት ቤት"},
    # NOTE(review): PRON_LEMMA is attached to an entry whose NORM looks like
    # a title rather than a pronoun -- confirm this lemma is intended.
    {ORTH: "ወ/ሮ", LEMMA: PRON_LEMMA, NORM: "ወይዘሮ"},
]

# Dotted abbreviations that only need their own text recorded as ORTH.
_plain_abbreviations = [
    "ዓ.ም.",
    "ኪ.ሜ.",
]

# Annotated entries go in first, then the plain ones, so a plain entry would
# replace an annotated entry that had the same surface string.
_exc = {attrs[ORTH]: [attrs] for attrs in _annotated_abbreviations}
_exc.update({text: [{ORTH: text}] for text in _plain_abbreviations})


TOKENIZER_EXCEPTIONS = _exc
Add Amharic አማርኛ Language support (#6583)

* Add Amharic to space
* clean up
* Add some PRON_LEMMA
* add Tigrinya support
* remove text_noun_chunks
* Tigrinya Support
* added some more details for ti
* fix unit test
* add amharic char range
* changes from review
* amharic and tigrinya share same unicode block
* get rid of _amharic/_tigrinya in char_classes

Co-authored-by: Josiah Solomon <jsolomon@meteorcomm.com>
2020-12-22 18:50:34 +03:00
			`# coding: utf8`
			`from __future__ import unicode_literals`

			`from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA`


			`_exc = {}`


			`for exc_data in [`
			`{ORTH: "ት/ቤት", LEMMA: "ትምህርት ቤት"},`
			`{ORTH: "ወ/ሮ", LEMMA: PRON_LEMMA, NORM: "ወይዘሮ"},`

			`]:`
			`_exc[exc_data[ORTH]] = [exc_data]`


			`for orth in [`
			`"ዓ.ም.",`
			`"ኪ.ሜ.",`
			`]:`
			`_exc[orth] = [{ORTH: orth}]`


			`TOKENIZER_EXCEPTIONS = _exc`