mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-30 20:06:30 +03:00
70 lines
1.2 KiB
Python
70 lines
1.2 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...attrs import LIKE_NUM
|
|
|
|
|
|
# Polish cardinal number words recognised by like_num(). Entries are stored
# lowercase because like_num() lowercases its input before the lookup.
_num_words = [
    "zero",
    "jeden",
    "dwa",
    "trzy",
    "cztery",
    "pięć",
    "sześć",
    "siedem",
    "osiem",
    "dziewięć",
    "dziesięć",
    "jedenaście",
    "dwanaście",
    "trzynaście",
    "czternaście",
    "piętnaście",  # was misspelled "pietnaście" (missing ogonek on "ę")
    "szesnaście",
    "siedemnaście",
    "osiemnaście",
    "dziewiętnaście",
    "dwadzieścia",
    "trzydzieści",
    "czterdzieści",
    "pięćdziesiąt",
    "sześćdziesiąt",  # was misspelled "szcześćdziesiąt"
    "siedemdziesiąt",
    "osiemdziesiąt",
    "dziewięćdziesiąt",
    "sto",
    "dwieście",
    "trzysta",
    "czterysta",
    "pięćset",
    "sześćset",
    "siedemset",
    "osiemset",
    "dziewięćset",
    "tysiąc",
    "milion",
    "miliard",
    "bilion",
    "biliard",
    "trylion",
    "tryliard",
    "kwadrylion",
]


def like_num(text):
    """Return True if ``text`` looks like a number.

    A string counts as number-like when, after removing every "," and ".",
    it is one of:

    * a string made up only of digits (e.g. "10", "1,000.50");
    * a simple fraction: exactly one "/" with digits on both sides ("3/4");
    * a Polish number word listed in ``_num_words`` (case-insensitive).

    text: The string to check.
    RETURNS (bool): Whether the string resembles a number.
    """
    # Drop thousands/decimal separators so "1,000.50" is tested as "100050".
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Require exactly one slash so the two-way unpacking below cannot fail.
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words
|
|
|
|
|
|
# Lexical attribute overrides exported by this module: the LIKE_NUM attribute
# (imported from ...attrs) is computed by the like_num function defined above.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
|