mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Add English lex_attrs overrides
This commit is contained in:
parent
8f3fbbb147
commit
88adeee548
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .morph_rules import MORPH_RULES
|
from .morph_rules import MORPH_RULES
|
||||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
||||||
|
|
||||||
|
@ -19,6 +20,7 @@ class English(Language):
|
||||||
class Defaults(Language.Defaults):
|
class Defaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'en'
|
lex_attr_getters[LANG] = lambda text: 'en'
|
||||||
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = dict(TAG_MAP)
|
tag_map = dict(TAG_MAP)
|
||||||
|
|
|
@ -2,22 +2,31 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# Number words
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
NUM_WORDS = set("""
|
|
||||||
zero one two three four five six seven eight nine ten eleven twelve thirteen
|
|
||||||
fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
|
|
||||||
sixty seventy eighty ninety hundred thousand million billion trillion
|
|
||||||
quadrillion gajillion bazillion
|
|
||||||
""".split())
|
|
||||||
|
|
||||||
|
|
||||||
# Ordinal words
|
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
|
||||||
|
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
|
||||||
|
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
|
||||||
|
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
|
||||||
|
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
|
||||||
|
'gajillion', 'bazillion']
|
||||||
|
|
||||||
ORDINAL_WORDS = set("""
|
|
||||||
first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
|
def like_num(text):
|
||||||
thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
|
text = text.replace(',', '')
|
||||||
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
|
text = text.replace('.', '')
|
||||||
hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
|
if text.isdigit():
|
||||||
bazillionth
|
return True
|
||||||
""".split())
|
if text.count('/') == 1:
|
||||||
|
num, denom = text.split('/')
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {
|
||||||
|
LIKE_NUM: like_num
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user