mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
a341b4ef09
* Create lex_attrs.py Hello, I am missing a CZECH language in SpaCy. So I would like to help to push it a little. This file is base on others lex_attrs.py files just with translation to Czech. * Update __init__.py Updated for use with new Czech Lex_attrs file * Update stop_words.py * Create test_text.py Co-authored-by: Vladimír Holubec <vholubec@arcdata.cz>
65 lines
1.1 KiB
Python
65 lines
1.1 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...attrs import LIKE_NUM
|
|
|
|
_num_words = [
|
|
"nula",
|
|
"jedna",
|
|
"dva",
|
|
"tři",
|
|
"čtyři",
|
|
"pět",
|
|
"šest",
|
|
"sedm",
|
|
"osm",
|
|
"devět",
|
|
"deset",
|
|
"jedenáct",
|
|
"dvanáct",
|
|
"třináct",
|
|
"čtrnáct",
|
|
"patnáct",
|
|
"šestnáct",
|
|
"sedmnáct",
|
|
"osmnáct",
|
|
"devatenáct",
|
|
"dvacet",
|
|
"třicet",
|
|
"čtyřicet",
|
|
"padesát",
|
|
"šedesát",
|
|
"sedmdesát",
|
|
"osmdesát",
|
|
"devadesát",
|
|
"sto",
|
|
"tisíc",
|
|
"milion",
|
|
"miliarda",
|
|
"bilion",
|
|
"biliarda",
|
|
"trilion",
|
|
"triliarda",
|
|
"kvadrilion",
|
|
"kvadriliarda",
|
|
"kvintilion",
|
|
]
|
|
|
|
|
|
def like_num(text):
    """Return True if ``text`` looks like a number.

    One leading sign character (``+``, ``-``, ``±`` or ``~``) is dropped and
    every ``,`` and ``.`` is removed. The remaining text is number-like when
    it is:

    * made up only of digits, e.g. ``"10"`` or ``"1.000,5"``;
    * a fraction with digits on both sides of a single ``/``, e.g. ``"3/4"``;
    * one of the words in ``_num_words``, compared case-insensitively.

    text (str): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Only the first character is treated as a sign, so "--5" is rejected.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Strip thousands/decimal separators before the digit checks.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Fall back to the spelled-out number words.
    return text.lower() in _num_words
|
|
|
|
|
|
# Maps the LIKE_NUM attribute (imported from ...attrs) to this module's
# like_num function.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
|