mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-15 06:09:01 +03:00
5ca0dbae76
* Add basic support for Lower Sorbian. * Add some tests for dsb. * Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
78 lines
1.7 KiB
Python
78 lines
1.7 KiB
Python
from ...attrs import LIKE_NUM
|
|
|
|
# Lower Sorbian cardinal number words. like_num() lower-cases a token and
# checks it for membership in this list, so every entry must be lower-case.
_num_words = [
    # 0-10 (including gender / personal forms)
    "nul",
    "jaden",
    "jadna",
    "jadno",
    "dwa",
    "dwě",
    "tśi",
    "tśo",
    "styri",
    "styrjo",
    "pěś",
    "pěśo",
    "šesć",
    "šesćo",
    "sedym",
    "sedymjo",
    "wósym",
    "wósymjo",
    "źewjeś",
    "źewjeśo",
    "źaseś",
    "źaseśo",
    # 11-19: all formed with the suffix "-nasćo"
    "jadnasćo",
    "dwanasćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    # Tens
    "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    # Hundred and larger magnitudes
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
    # Legacy spellings with a doubled "s" that earlier versions of this list
    # contained in place of "jadnasćo" / "dwanasćo". They look like typos but
    # are kept so that tokens which matched before continue to match.
    "jadnassćo",
    "dwanassćo",
]
|
|
|
|
# Lower Sorbian ordinal number words (first through twelfth), each listed in
# its three nominative singular gender forms (-y/-i, -a, -e). like_num()
# lower-cases a token and checks it for membership in this list.
_ordinal_words = [
    "prědny",
    "prědna",
    "prědne",
    "drugi",
    "druga",
    "druge",
    "tśeśi",
    "tśeśa",
    "tśeśe",
    "stwórty",
    "stwórta",
    "stwórte",
    "pěty",
    "pěta",
    "pěte",
    "šesty",
    "šesta",
    "šeste",
    "sedymy",
    "sedyma",
    "sedyme",
    "wósymy",
    "wósyma",
    "wósyme",
    "źewjety",
    "źewjeta",
    "źewjete",
    "źasety",
    "źaseta",
    "źasete",
    "jadnasty",
    "jadnasta",
    "jadnaste",
    "dwanasty",
    "dwanasta",
    "dwanaste",
    # Legacy spellings using "ê" (circumflex) instead of "ě" (caron) that
    # earlier versions of this list contained in place of "pěty" / "pěte".
    # Kept so that tokens which matched before continue to match.
    "pêty",
    "pête",
]
|
|
|
|
|
|
def like_num(text):
    """Return True if *text* looks like a number, otherwise False.

    A token counts as number-like when, after removing one leading
    "+", "-", "±" or "~" and every "," and ".", it is either

    * made up of digits only,
    * a fraction of the form ``digits/digits``, or
    * (case-insensitively) one of the entries in ``_num_words`` or
      ``_ordinal_words``.
    """
    sign_prefixes = ("+", "-", "±", "~")
    candidate = text[1:] if text.startswith(sign_prefixes) else text
    # Separators are dropped entirely, so "10,000" and "3.5" become digits.
    candidate = candidate.replace(",", "").replace(".", "")
    if candidate.isdigit():
        return True

    # With zero slashes the denominator is empty, and with two or more it
    # still contains a "/", so only a single-slash fraction passes isdigit().
    numerator, _, denominator = candidate.partition("/")
    if numerator.isdigit() and denominator.isdigit():
        return True

    # Fall back to the cardinal and ordinal word lists.
    lowered = candidate.lower()
    return lowered in _num_words or lowered in _ordinal_words
|
|
|
|
|
|
# Maps the LIKE_NUM attribute ID to the like_num() function defined in this
# module.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
|