# spaCy/spacy/lang/am/lex_attrs.py
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = [
"į‹œįˆ®",
"įŠ įŠ•į‹µ",
"įˆįˆˆį‰µ",
"įˆ¶įˆµį‰µ",
"įŠ įˆ«į‰µ",
"įŠ įˆįˆµį‰µ",
"įˆµį‹µįˆµį‰µ",
"įˆ°į‰£į‰µ",
"įˆµįˆį‰µ",
"į‹˜įŒ įŠ",
"įŠ įˆµįˆ­",
"įŠ įˆµįˆ« įŠ įŠ•į‹µ",
"įŠ įˆµįˆ« įˆįˆˆį‰µ",
"įŠ įˆµįˆ« įˆ¶įˆµį‰µ",
"įŠ įˆµįˆ« įŠ įˆ«į‰µ",
"įŠ įˆµįˆ« įŠ įˆįˆµį‰µ",
"įŠ įˆµįˆ« įˆµį‹µįˆµį‰µ",
"įŠ įˆµįˆ« įˆ°į‰£į‰µ",
"įŠ įˆµįˆ« įˆµįˆįŠ•į‰µ",
"įŠ įˆµįˆ« į‹˜įŒ įŠ",
"įˆƒį‹«",
"įˆ°įˆ‹įˆ³",
"įŠ įˆ­į‰£",
"įˆƒįˆįˆ³",
"įˆµįˆįˆ³",
"įˆ°į‰£",
"įˆ°įˆ›įŠ•į‹«",
"į‹˜įŒ įŠ“",
"įˆ˜į‰¶",
"įˆŗįˆ…",
"įˆšįˆŠį‹®įŠ•",
"į‰¢įˆŠį‹®įŠ•",
"į‰µįˆŖįˆŠį‹®įŠ•",
"įŠ³į‹µįˆŖįˆŠį‹®įŠ•",
"įŒˆįŒ…įˆŠį‹®įŠ•",
"į‰£į‹įˆŠį‹®įŠ•"
]
_ordinal_words = [
"įŠ įŠ•į‹°įŠ›",
"įˆįˆˆį‰°įŠ›",
"įˆ¶įˆµį‰°įŠ›",
"įŠ įˆ«į‰°įŠ›",
"įŠ įˆįˆµį‰°įŠ›",
"įˆµį‹µįˆµį‰°įŠ›",
"įˆ°į‰£į‰°įŠ›",
"įˆµįˆįŠ•į‰°įŠ›",
"į‹˜įŒ įŠįŠ›",
"įŠ įˆµįˆØįŠ›",
"įŠ įˆµįˆ« įŠ įŠ•į‹°įŠ›",
"įŠ įˆµįˆ« įˆįˆˆį‰°įŠ›",
"įŠ įˆµįˆ« įˆ¶įˆµį‰°įŠ›",
"įŠ įˆµįˆ« įŠ įˆ«į‰°įŠ›",
"įŠ įˆµįˆ« įŠ įˆįˆµį‰°įŠ›",
"įŠ įˆµįˆ« įˆµį‹µįˆµį‰°įŠ›",
"įŠ įˆµįˆ« įˆ°į‰£į‰°įŠ›",
"įŠ įˆµįˆ« įˆµįˆįŠ•į‰°įŠ›",
"įŠ įˆµįˆ« į‹˜įŒ įŠįŠ›",
"įˆƒį‹«įŠ›",
"įˆ°įˆ‹įˆ³įŠ›"
"įŠ įˆ­į‰£įŠ›",
"įŠ įˆįˆ³įŠ›",
"įˆµį‹µįˆ³įŠ›",
"įˆ°į‰£įŠ›",
"įˆ°įˆ›įŠ•į‹«įŠ›",
"į‹˜įŒ įŠ“įŠ›",
"įˆ˜į‰¶įŠ›",
"įˆŗįŠ›",
"įˆšįˆŠį‹®įŠ•įŠ›",
"į‰¢įˆŠį‹®įŠ•įŠ›",
"į‰µįˆŖįˆŠį‹®įŠ•įŠ›"
]
def like_num(text):
    """Return True if ``text`` looks like a number, otherwise False.

    One leading sign character is dropped and every ``,`` and ``.`` is
    removed before testing. The remaining text counts as a number when it is:

    * made up only of digits;
    * a fraction of the form ``<digits>/<digits>``;
    * digits followed by the ordinal suffix;
    * listed (after lowercasing) in ``_num_words`` or ``_ordinal_words``.
    """
    ordinal_suffix = "įŠ›"
    if text.startswith(("+", "-", "Ā±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Numeric ordinal: digits + suffix. The suffix is matched on the text as
    # written (lowercasing does not affect digits), and exactly the suffix is
    # stripped -- a hard-coded slice length would cut into the digits or leave
    # part of the suffix behind.
    if text.endswith(ordinal_suffix):
        if text[: -len(ordinal_suffix)].isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False
# Registers like_num as the getter stored under the LIKE_NUM attribute key.
LEX_ATTRS = dict([(LIKE_NUM, like_num)])