Reformat dutch language data to match new style

This commit is contained in:
Ines Montani 2016-12-17 13:26:01 +01:00
parent f2c48ef504
commit dd55d085b6
2 changed files with 96 additions and 283 deletions

View File

@ -1,10 +1,28 @@
# encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG
from . import language_data from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
class Dutch(Language): class Dutch(Language):
@ -15,12 +33,9 @@ class Dutch(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[LANG] = lambda text: 'nl'
prefixes = tuple(language_data.TOKENIZER_PREFIXES) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = tuple(language_data.TOKENIZER_SUFFIXES) suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
infixes = tuple(language_data.TOKENIZER_INFIXES) tag_map = TAG_MAP
stop_words = STOP_WORDS
tag_map = dict(language_data.TAG_MAP)
stop_words = set(language_data.STOP_WORDS)

View File

@ -1,285 +1,83 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
# TODO insert TAG_MAP for Dutch
TAG_MAP = {
"ADV": {POS: "ADV"},
"NOUN": {POS: "NOUN"},
"ADP": {POS: "ADP"},
"PRON": {POS: "PRON"},
"SCONJ": {POS: "SCONJ"},
"PROPN": {POS: "PROPN"},
"DET": {POS: "DET"},
"SYM": {POS: "SYM"},
"INTJ": {POS: "INTJ"},
"PUNCT": {POS: "PUNCT"},
"NUM": {POS: "NUM"},
"AUX": {POS: "AUX"},
"X": {POS: "X"},
"CONJ": {POS: "CONJ"},
"ADJ": {POS: "ADJ"},
"VERB": {POS: "VERB"}
}
# Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt # Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt
STOP_WORDS = set(""" STOP_WORDS = set("""
aan aan af al alles als altijd andere
af
al ben bij
alles
als daar dan dat de der deze die dit doch doen door dus
altijd
andere een eens en er
ben
bij ge geen geweest
daar
dan haar had heb hebben heeft hem het hier hij hoe hun
dat
de iemand iets ik in is
der
deze ja je
die
dit kan kon kunnen
doch
doen maar me meer men met mij mijn moet
door
dus na naar niet niets nog nu
een
eens of om omdat ons ook op over
en
er
ge
geen
geweest
haar
had
heb
hebben
heeft
hem
het
hier
hij
hoe
hun
iemand
iets
ik
in
is
ja
je
kan
kon
kunnen
maar
me
meer
men
met
mij
mijn
moet
na
naar
niet
niets
nog
nu
of
om
omdat
ons
ook
op
over
reeds reeds
te
tegen te tegen toch toen tot
toch
toen u uit uw
tot
u van veel voor
uit
uw want waren was wat we wel werd wezen wie wij wil worden
van
veel zal ze zei zelf zich zij zijn zo zonder zou
voor
want
waren
was
wat
we
wel
werd
wezen
wie
wij
wil
worden
zal
ze
zei
zelf
zich
zij
zijn
zo
zonder
zou
""".split()) """.split())
TOKENIZER_PREFIXES = map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§
'''.strip().split('\n'))
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r'''
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
'''.strip().split('\n')
# TODO Make tokenizer exceptions for Dutch # TODO Make tokenizer exceptions for Dutch
TOKENIZER_EXCEPTIONS = {}
#TODO insert TAG_MAP for Dutch TOKENIZER_EXCEPTIONS = {
TAG_MAP = {
"ADV": {
"pos": "ADV"
},
"NOUN": {
"pos": "NOUN"
},
"ADP": {
"pos": "ADP"
},
"PRON": {
"pos": "PRON"
},
"SCONJ": {
"pos": "SCONJ"
},
"PROPN": {
"pos": "PROPN"
},
"DET": {
"pos": "DET"
},
"SYM": {
"pos": "SYM"
},
"INTJ": {
"pos": "INTJ"
},
"PUNCT": {
"pos": "PUNCT"
},
"NUM": {
"pos": "NUM"
},
"AUX": {
"pos": "AUX"
},
"X": {
"pos": "X"
},
"CONJ": {
"pos": "CONJ"
},
"ADJ": {
"pos": "ADJ"
},
"VERB": {
"pos": "VERB"
} }
ORTH_ONLY = {
} }