mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Implement like_num getter for Dutch (via #1177)
This commit is contained in:
parent
5ee10379db
commit
adda08fe14
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
class DutchDefaults(Language.Defaults):
|
class DutchDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||||
|
|
||||||
|
|
36
spacy/lang/nl/lex_attrs.py
Normal file
36
spacy/lang/nl/lex_attrs.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
|
# Single-word Dutch cardinals. Compound forms such as "drieëntwintig" are
# not listed, so like_num() does not recognise them.
_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien vijftien zestien zeventien achttien negentien twintig dertig veertig
vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard
biljoen biljard triljoen triljard
""".split())

# Single-word Dutch ordinals, parallel to the cardinals above.
_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende vijftiende zestiende zeventiende achttiende
negentiende twintigste dertigste veertigste vijftigste zestigste zeventigste
tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste
biljardste triljoenste triljardste
""".split())


def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after removing ``,`` and ``.``
    separators, it is

    * made up of digits only ("10", "1.000,50"),
    * a simple fraction of two digit strings ("1/2"), or
    * a known Dutch cardinal or ordinal word, compared case-insensitively
      ("drie", "Drie", "derde").

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Drop thousands/decimal separators so "1.000,50" is judged on its digits.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # Number words may be capitalised (e.g. sentence-initial), so compare
    # in lower case; ordinals are treated as number-like as well.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words
|
||||||
|
|
||||||
|
|
||||||
|
# Lexical attribute getters exported by this module, keyed by attribute ID.
LEX_ATTRS = {}
LEX_ATTRS[LIKE_NUM] = like_num
|
Loading…
Reference in New Issue
Block a user