From d1f703d78d1fa20078787d8655addd4a31c7c6a4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 26 Feb 2020 13:06:52 +0100
Subject: [PATCH] Improve German tokenization

Improve German tokenization with respect to Tiger.
---
 spacy/lang/de/__init__.py             |  3 +++
 spacy/lang/de/punctuation.py          | 27 ++++++++++++++++++++++++++-
 spacy/lang/de/tokenizer_exceptions.py | 11 +++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 1412f033a..dee1841c8 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
@@ -22,6 +23,8 @@ class GermanDefaults(Language.Defaults):
         Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 7dfa61bd4..c376ce597 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,10 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
+from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import _prefixes, _suffixes
 
+_prefixes = ["``",] + list(_prefixes)
+
+_suffixes = (
+    ["''", "/"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
 _quotes = CONCAT_QUOTES.replace("'", "")
 
 
 _infixes = (
@@ -15,6 +37,7 @@ _infixes = (
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[0-9{a}])\/(?=[0-9{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
         r"(?<=[0-9])-(?=[0-9])",
@@ -22,4 +45,6 @@ _infixes = (
 )
 
 
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 5b09a0b89..ebbbfba8c 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -160,6 +160,8 @@ for exc_data in [
 
 
 for orth in [
+    "``",
+    "''",
     "A.C.",
     "a.D.",
     "A.D.",
@@ -175,10 +177,13 @@ for orth in [
     "biol.",
     "Biol.",
     "ca.",
+    "CDU/CSU",
     "Chr.",
     "Cie.",
+    "c/o",
     "co.",
     "Co.",
+    "d'",
     "D.C.",
     "Dipl.-Ing.",
     "Dipl.",
@@ -203,12 +208,18 @@ for orth in [
     "i.G.",
     "i.Tr.",
     "i.V.",
+    "I.",
+    "II.",
+    "III.",
+    "IV.",
+    "Inc.",
     "Ing.",
     "jr.",
     "Jr.",
     "jun.",
     "jur.",
     "K.O.",
+    "L'",
     "L.A.",
     "lat.",
     "M.A.",
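
A minimal sketch of the intended behavior, assuming spaCy v2.x with this
patch applied; the example sentence and the expected token list are
illustrative assumptions, not taken from the patch itself:

    # Sketch (assumption: spaCy v2.x, patch applied): "CDU/CSU" survives as
    # one token via the new tokenizer exception, while the new infix rule
    # splits other slash-joined pairs like "Zucker/Salz".
    import spacy

    nlp = spacy.blank("de")  # tokenizer-only German pipeline
    doc = nlp("Die CDU/CSU lehnte ab, Zucker/Salz wurde getrennt.")
    print([t.text for t in doc])
    # expected (illustrative): ['Die', 'CDU/CSU', 'lehnte', 'ab', ',',
    #                           'Zucker', '/', 'Salz', 'wurde', 'getrennt', '.']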