Add English lex_attrs overrides

This commit is contained in:
ines 2017-05-09 01:09:52 +02:00
parent 8f3fbbb147
commit 88adeee548
2 changed files with 27 additions and 16 deletions

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
@ -19,6 +20,7 @@ class English(Language):
class Defaults(Language.Defaults): class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)

View File

@ -2,22 +2,31 @@
from __future__ import unicode_literals from __future__ import unicode_literals
# Number words from ...attrs import LIKE_NUM
# Spelled-out English number words, including a few informal ones.
NUM_WORDS = {
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
    'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty',
    'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred',
    'thousand', 'million', 'billion', 'trillion', 'quadrillion', 'gajillion',
    'bazillion',
}
# Spelled-out English number words recognised by like_num(). Every entry is
# lowercase; like_num() lowercases its input before looking it up here.
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
              'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
              'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
              'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
              'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
              'gajillion', 'bazillion']


def like_num(text):
    """Check whether a token text resembles a number.

    A text counts as number-like if, once every ``,`` and ``.`` has been
    removed, it is one of:

    * a string of digits (``10,000``, ``3.14``),
    * a simple fraction of two digit strings (``1/2``),
    * a spelled-out number word from ``_num_words``, in any casing
      (``ten``, ``Ten``, ``MILLION``).

    text: The token text to check.
    Returns True if the text looks like a number, False otherwise.
    """
    # Separators are dropped up front so "1,000" and "2.5" reduce to digits.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # The word table is all lowercase, so compare case-insensitively;
    # otherwise sentence-initial or upper-cased words ("Ten") are missed.
    return text.lower() in _num_words
# Language-specific lexical attribute getters, keyed by attribute ID.
LEX_ATTRS = {LIKE_NUM: like_num}