spaCy/spacy/lang/vi/lex_attrs.py
Viet Trung Tran ea2af94cd9 Add support for Vietnamese in spaCy by leveraging Pyvi, an external Vietnamese tokenizer (#2155)
* support for Vietnamese

* Contributor Agreement for adding Vietnamese support on spaCy
2018-03-29 12:19:51 +02:00

27 lines
591 B
Python

# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bẩy',
'tám', 'chín', 'mười', 'trăm', 'tỷ']
def like_num(text):
    """Return True if ``text`` looks like a number.

    A token is number-like when, after dropping one optional leading sign
    marker (``+``, ``-``, ``±`` or ``~``) and all ``,`` / ``.`` separators,
    it is either:

    * made up entirely of digits (e.g. ``"1.000"``, ``"-42"``),
    * a simple fraction of two digit strings (e.g. ``"3/4"``), or
    * one of the number words in ``_num_words`` (case-insensitive).

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # A leading sign or approximation marker does not make a token any less
    # numeric; strip a single one so "-10" and "~3" are recognised.
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    # Thousands and decimal separators are irrelevant to the digit check.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
        return True
    return False
# Lexical attribute overrides for Vietnamese: maps the LIKE_NUM attribute ID
# to the function that computes it for a token's text.
LEX_ATTRS = {LIKE_NUM: like_num}