Added language class and some language data (with some TODOs) for Dutch

This commit is contained in:
dafnevk 2016-11-24 15:56:38 +01:00
parent 83daade0e4
commit 3db8b0d322
3 changed files with 1811 additions and 0 deletions

View File

@ -0,0 +1,26 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
from . import language_data
class Dutch(Language):
    """Language subclass for Dutch, identified by the code 'nl'."""

    lang = 'nl'

    class Defaults(Language.Defaults):
        """Dutch-specific defaults built from the ``language_data`` module."""

        # Work on a copy of the inherited getters so that adding the LANG
        # entry does not modify the mapping on Language.Defaults itself.
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'nl'

        # Tokenizer configuration, copied from the shared language data.
        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
        infixes = tuple(language_data.TOKENIZER_INFIXES)

        # Tag map and stop words, likewise copied from the language data.
        tag_map = dict(language_data.TAG_MAP)
        stop_words = set(language_data.STOP_WORDS)

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ...fr import French
from ...nl import Dutch
def test_load_french():
nlp = French()
@ -10,3 +11,11 @@ def test_load_french():
assert doc[2].text == u'vous'
assert doc[3].text == u'français'
assert doc[4].text == u'?'
def test_load_dutch():
    """Run a short Dutch question through Dutch() and check the first four token texts."""
    tokens = Dutch()(u'Is dit Nederlands?')
    expected_texts = (u'Is', u'dit', u'Nederlands', u'?')
    # Compare position by position; only indices 0-3 are checked.
    for index, expected in enumerate(expected_texts):
        assert tokens[index].text == expected