Add English lex_attrs overrides

This commit is contained in:
ines 2017-05-09 01:09:52 +02:00
parent 8f3fbbb147
commit 88adeee548
2 changed files with 27 additions and 16 deletions

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
@ -19,6 +20,7 @@ class English(Language):
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)

View File

@ -2,22 +2,31 @@
from __future__ import unicode_literals
# Number words
# Spelled-out English cardinal number words, all lower case.
NUM_WORDS = {
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
    'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty',
    'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred',
    'thousand', 'million', 'billion', 'trillion', 'quadrillion',
    'gajillion', 'bazillion',
}
from ...attrs import LIKE_NUM
# Ordinal words
# Spelled-out English number words, all lower case, kept as a list in
# ascending order of magnitude.
_num_words = """
zero one two three four five six seven eight nine ten eleven twelve thirteen
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
sixty seventy eighty ninety hundred thousand million billion trillion
quadrillion gajillion bazillion
""".split()
# Spelled-out English ordinal words, all lower case.
# Spelling matters here: membership tests compare exact strings, so a
# misspelled entry silently fails to match the real word.
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh
twelfth thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth
nineteenth twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth
ninetieth hundredth thousandth millionth billionth trillionth quadrillionth
gajillionth bazillionth
""".split())
def like_num(text):
    """Return True if `text` looks like a number.

    A string counts as number-like when, after removing every ',' and '.',
    it is one of:

    * all digits, e.g. "10", "1,000" or "10.9";
    * a simple fraction with digits on both sides of a single '/',
      e.g. "1/2";
    * a spelled-out number word from `_num_words`, in any letter case,
      e.g. "ten", "Ten" or "TEN".

    text: The string to check.
    RETURNS (bool): Whether the string resembles a number.
    """
    # Separators and decimal points carry no information for this check.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # The word list is stored in lower case, so normalise before the lookup;
    # otherwise capitalised forms such as a sentence-initial "Ten" are missed.
    return text.lower() in _num_words
# Maps the LIKE_NUM attribute constant to the function that computes it.
LEX_ATTRS = {LIKE_NUM: like_num}