spaCy/spacy/lang/zh/lex_attrs.py

108 lines
1.6 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import unicode_literals
import re
from ...attrs import LIKE_NUM
_single_num_words = [
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"十一",
"十二",
"十三",
"十四",
"十五",
"十六",
"十七",
"十八",
"十九",
"廿",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"拾壹",
"拾贰",
"拾叁",
"拾肆",
"拾伍",
"拾陆",
"拾柒",
"拾捌",
"拾玖"
]
_count_num_words = [
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
""
]
_base_num_words = [
"",
"",
"",
"",
"亿",
"",
"",
"",
""
]
def like_num(text):
    """Return True if the token text looks like a number.

    A single leading sign character ("+", "-", "±" or "~") is dropped and
    ASCII / full-width separators are removed. The remainder is accepted
    when it is:

    * made of digits only,
    * a simple fraction ``digits/digits``,
    * exactly one of the entries in ``_single_num_words``, or
    * one or more (digit word + unit word) pairs taken from
      ``_count_num_words`` and ``_base_num_words``, optionally followed by
      one trailing digit word.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Strip thousands/decimal separators, both ASCII and full-width.
    for separator in (",", ".", ",", "。"):
        text = text.replace(separator, "")
    # Nothing left (empty token or a bare sign/separator): not a number.
    if not text:
        return False
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _single_num_words:
        return True
    # Compound numerals: (digit + unit) repeated, then an optional final
    # digit, anchored so the whole token has to match.
    count_alt = "|".join(_count_num_words)
    base_alt = "|".join(_base_num_words)
    pattern = "^((" + count_alt + ")(" + base_alt + "))+(" + count_alt + ")?$"
    return re.match(pattern, text) is not None
# Maps the LIKE_NUM attribute to the function that computes it.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}