From d1f703d78d1fa20078787d8655addd4a31c7c6a4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 26 Feb 2020 13:06:52 +0100
Subject: [PATCH] Improve German tokenization

Improve German tokenization with respect to Tiger.
---
 spacy/lang/de/__init__.py             |  3 +++
 spacy/lang/de/punctuation.py          | 27 ++++++++++++++++++++++++++-
 spacy/lang/de/tokenizer_exceptions.py | 11 +++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 1412f033a..dee1841c8 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
@@ -22,6 +23,8 @@ class GermanDefaults(Language.Defaults):
         Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 7dfa61bd4..c376ce597 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,10 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
+from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import _prefixes, _suffixes
 
+_prefixes = ["``",] + list(_prefixes)
+
+_suffixes = (
+    ["''", "/"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
 _quotes = CONCAT_QUOTES.replace("'", "")
 
 
 _infixes = (
@@ -15,6 +37,7 @@ _infixes = (
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[0-9{a}])\/(?=[0-9{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
         r"(?<=[0-9])-(?=[0-9])",
@@ -22,4 +45,6 @@ _infixes = (
 )
 
 
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 5b09a0b89..ebbbfba8c 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -160,6 +160,8 @@ for exc_data in [
 
 
 for orth in [
+    "``",
+    "''",
     "A.C.",
     "a.D.",
     "A.D.",
@@ -175,10 +177,13 @@ for orth in [
     "biol.",
     "Biol.",
     "ca.",
+    "CDU/CSU",
     "Chr.",
     "Cie.",
+    "c/o",
     "co.",
     "Co.",
+    "d'",
     "D.C.",
     "Dipl.-Ing.",
     "Dipl.",
@@ -203,12 +208,18 @@ for orth in [
     "i.G.",
     "i.Tr.",
     "i.V.",
+    "I.",
+    "II.",
+    "III.",
+    "IV.",
+    "Inc.",
     "Ing.",
     "jr.",
     "Jr.",
     "jun.",
     "jur.",
     "K.O.",
+    "L'",
     "L.A.",
     "lat.",
     "M.A.",
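
A minimal sketch of the intended behavior, assuming spaCy v2.x with this
patch applied; the example sentence and the expected token list are
illustrative assumptions, not taken from the patch itself:

    # Sketch (assumption: spaCy v2.x, patch applied): "CDU/CSU" survives as
    # one token via the new tokenizer exception, while the new infix rule
    # splits other slash-joined pairs like "Zucker/Salz".
    import spacy

    nlp = spacy.blank("de")  # tokenizer-only German pipeline
    doc = nlp("Die CDU/CSU lehnte ab, Zucker/Salz wurde getrennt.")
    print([t.text for t in doc])
    # expected (illustrative): ['Die', 'CDU/CSU', 'lehnte', 'ab', ',',
    #                           'Zucker', '/', 'Salz', 'wurde', 'getrennt', '.']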