mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
b6e022381d
* add language extensions for norwegian nynorsk and faroese * update docstring for nn/examples.py * use relative imports * add fo and nn tokenizers to pytest fixtures * add unittests for fo and nn and fix bug in nn * remove module docstring from fo/__init__.py * add comments about example sentences' origin * add license information to faroese data credit * format unittests using black * add __init__ files to test/lang/nn and tests/lang/fo * fix import order and use relative imports in fo/__init__.py and nn/__init__.py * Make the tests a bit more compact * Add fo and nn to website languages * Add note about jul. * Add "jul." as exception --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
91 lines
1.3 KiB
Python
91 lines
1.3 KiB
Python
from ...symbols import ORTH
|
|
from ...util import update_exc
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
|
|
# Period-bearing abbreviations (apparently Faroese) that each get their own
# exception entry, listed in the order they are registered.
_ABBREVIATIONS = (
    "apr.",
    "aug.",
    "avgr.",
    "árg.",
    "ávís.",
    "beinl.",
    "blkv.",
    "blaðkv.",
    "blm.",
    "blaðm.",
    "bls.",
    "blstj.",
    "blaðstj.",
    "des.",
    "eint.",
    "febr.",
    "fyrrv.",
    "góðk.",
    "h.m.",
    "innt.",
    "jan.",
    "kl.",
    "m.a.",
    "mðr.",
    "mió.",
    "nr.",
    "nto.",
    "nov.",
    "nút.",
    "o.a.",
    "o.a.m.",
    "o.a.tíl.",
    "o.fl.",
    "ff.",
    "o.m.a.",
    "o.o.",
    "o.s.fr.",
    "o.tíl.",
    "o.ø.",
    "okt.",
    "omf.",
    "pst.",
    "ritstj.",
    "sbr.",
    "sms.",
    "smst.",
    "smb.",
    "sb.",
    "sbrt.",
    "sp.",
    "sept.",
    "spf.",
    "spsk.",
    "t.e.",
    "t.s.",
    "t.s.s.",
    "tlf.",
    "tel.",
    "tsk.",
    "t.o.v.",
    "t.d.",
    "uml.",
    "ums.",
    "uppl.",
    "upprfr.",
    "uppr.",
    "útg.",
    "útl.",
    "útr.",
    "vanl.",
    "v.",
    "v.h.",
    "v.ø.o.",
    "viðm.",
    "viðv.",
    "vm.",
    "v.m.",
)

# Map every abbreviation, both as written and in its str.capitalize() form,
# to a one-element list holding a single {ORTH: <string>} entry.
_exc = {
    form: [{ORTH: form}]
    for abbr in _ABBREVIATIONS
    for form in (abbr, abbr.capitalize())
}
|
|
|
|
# Module-level exception table: the shared BASE_EXCEPTIONS combined with the
# abbreviation entries collected in _exc.
# NOTE(review): update_exc comes from ...util and is not visible here —
# presumably it merges the dicts into a new one; confirm against its definition.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|