mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Implement like_num getter for Dutch (via #1177)
This commit is contained in:
parent
5ee10379db
commit
adda08fe14
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
|
@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
|
|||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
|
|
36
spacy/lang/nl/lex_attrs.py
Normal file
36
spacy/lang/nl/lex_attrs.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set("""
|
||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||
miljardste biljoenste biljardste triljoenste triljardste
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
Loading…
Reference in New Issue
Block a user