spaCy/spacy/lang/tr/lex_attrs.py

54 lines
919 B
Python
Raw Normal View History

2018-03-08 17:25:25 +03:00
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Thirteen, fifteen etc. are written as separate words ("on üç"), so the
# teens and other compound numbers need no entries of their own.
_num_words = [
    "bir",
    "iki",
    "üç",
    "dört",
    "beş",
    "altı",
    "yedi",
    "sekiz",
    "dokuz",
    "on",
    "yirmi",
    "otuz",
    "kırk",
    "elli",
    "altmış",
    "yetmiş",
    "seksen",
    "doksan",
    "yüz",
    "bin",
    "milyon",
    "milyar",
    "trilyon",
    "katrilyon",
    "kentilyon",
]


def _turkish_lower(text):
    """Lower-case ``text`` using the Turkish dotted/dotless "i" pairs."""
    # Turkish pairs U+0130 (capital dotted I) with "i" and "I" with U+0131
    # (dotless i). Plain ``lower()`` turns "I" into "i" instead, so these two
    # letters are mapped by hand before the generic lower-casing runs.
    return text.replace("\u0130", "i").replace("I", "\u0131").lower()


def like_num(text):
    """Return True if ``text`` looks like a number.

    A token is number-like when, after dropping one leading sign character
    ("+", "-", "±" or "~") and every "," and ".", it is:

    * made up only of digits,
    * a fraction with exactly one "/" and digits on both sides, or
    * one of the number words in ``_num_words``, compared case-insensitively.

    text: the token text to check.
    RETURNS: True if the text resembles a number, otherwise False.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # The plain lower() lookup is kept so ASCII-typed capitals ("BIR") still
    # match as before; the Turkish-aware lookup adds correctly capitalised
    # forms such as "ALTI" or "KIRK" that plain lower() maps to "alti"/"kirk".
    if text.lower() in _num_words or _turkish_lower(text) in _num_words:
        return True
    return False
# Lexical attribute functions exported by this module: the LIKE_NUM key is
# served by the like_num predicate defined in this file.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}