mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Add English lex_attrs overrides
This commit is contained in:
parent
8f3fbbb147
commit
88adeee548
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .morph_rules import MORPH_RULES
|
||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
||||
|
||||
|
@ -19,6 +20,7 @@ class English(Language):
|
|||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
|
|
@ -2,22 +2,31 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Number words
|
||||
|
||||
# English cardinal number words (plus a few informal "big number" words),
# stored as a set for membership tests.
NUM_WORDS = {
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
    'sixteen', 'seventeen', 'eighteen', 'nineteen',
    'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
    'ninety',
    'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
    'gajillion', 'bazillion',
}
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Ordinal words
|
||||
# Spelled-out English number words, kept as an ordered list.
_num_words = """
zero one two three four five six seven
eight nine ten eleven twelve thirteen fourteen
fifteen sixteen seventeen eighteen nineteen twenty
thirty forty fifty sixty seventy eighty ninety
hundred thousand million billion trillion quadrillion
gajillion bazillion
""".split()
|
||||
|
||||
# English ordinal words, stored as a set for membership tests.
# Spelling corrected for "eighth", "twelfth", "seventeenth" and "hundredth";
# the previous misspelled entries meant the real words were never matched.
ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
bazillionth
""".split())
|
||||
|
||||
def like_num(text):
    """Return True if *text* looks like a number.

    A string is considered number-like when, after removing ',' and '.'
    characters, it is one of:

    * all digits (e.g. "10", "1,000", "3.14"),
    * a simple fraction of two digit strings (e.g. "3/4"),
    * a spelled-out number word from ``_num_words``, matched
      case-insensitively (e.g. "ten", "Ten", "MILLION").

    text (unicode): The string to check.
    RETURNS (bool): Whether the string resembles a number.
    """
    # Separators and decimal points do not affect number-likeness.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Exactly one slash: accept only if both sides are plain digit strings.
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # Lowercase before the lookup so capitalised or upper-case number words
    # (e.g. sentence-initial "Two") are recognised as well.
    return text.lower() in _num_words
|
||||
|
||||
|
||||
# Lexical attribute overrides for English: maps an attribute ID to the
# function that computes it from a token's text.
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||
|
|
Loading…
Reference in New Issue
Block a user