From e7f95c37eeced064d6e239fa6a699a0bfd256501 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 8 May 2017 15:55:52 +0200
Subject: [PATCH] Merge base tokenizer exceptions

---
 spacy/language_data/abbreviations.py        |  43 ------
 spacy/language_data/emoticons.py            | 148 ------------------
 spacy/language_data/tokenizer_exceptions.py | 161 +++++++++++++++++++-
 3 files changed, 160 insertions(+), 192 deletions(-)
 delete mode 100644 spacy/language_data/abbreviations.py
 delete mode 100644 spacy/language_data/emoticons.py
diff --git a/spacy/language_data/abbreviations.py b/spacy/language_data/abbreviations.py
deleted file mode 100644
index a3e95ce1a..000000000
--- a/spacy/language_data/abbreviations.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-ABBREVIATIONS = [
-    "'",
-    "\\\")",
-    "<space>",
-    "''",
-    "C++",
-    "a.",
-    "b.",
-    "c.",
-    "d.",
-    "e.",
-    "f.",
-    "g.",
-    "h.",
-    "i.",
-    "j.",
-    "k.",
-    "l.",
-    "m.",
-    "n.",
-    "o.",
-    "p.",
-    "q.",
-    "r.",
-    "s.",
-    "t.",
-    "u.",
-    "v.",
-    "w.",
-    "x.",
-    "y.",
-    "z.",
-    "ä.",
-    "ö.",
-    "ü."
-]
-
-
-__all__ = [ "ABBREVIATIONS" ]
diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py
deleted file mode 100644
index 223176c9c..000000000
--- a/spacy/language_data/emoticons.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-EMOTICONS = set("""
-:)
-:-)
-:))
-:-))
-:)))
-:-)))
-(:
-(-:
-=)
-(=
-")
-:]
-:-]
-[:
-[-:
-:o)
-(o:
-:}
-:-}
-8)
-8-)
-(-8
-
-;)
-;-)
-(;
-(-;
-
-:(
-:-(
-:((
-:-((
-:(((
-:-(((
-):
-)-:
-=(
->:(
-
-:')
-:'-)
-:'(
-:'-(
-
-:/
-:-/
-=/
-=|
-:|
-:-|
-:1
-
-:P
-:-P
-:p
-:-p
-
-:O
-:-O
-:o
-:-o
-:0
-:-0
-:()
->:o
-
-:*
-:-*
-:3
-:-3
-=3
-:>
-:->
-
-:X
-:-X
-:x
-:-x
-
-:D
-:-D
-;D
-;-D
-=D
-xD
-XD
-xDD
-XDD
-8D
-8-D
-
-^_^
-^__^
-^___^
->.<
->.>
-<.<
-._.
-;_;
--_-
--__-
-v.v
-V.V
-v_v
-V_V
-o_o
-o_O
-O_o
-O_O
-0_o
-o_0
-0_0
-o.O
-O.o
-O.O
-o.o
-0.0
-o.0
-0.o
-@_@
-
-<3
-<33
-<333
-</3
-
-(^_^)
-(-_-)
-(._.)
-(>_<)
-(*_*)
-(¬_¬)
-
-ಠ_ಠ
-ಠ︵ಠ
-(ಠ_ಠ)
-¯\(ツ)/¯
-(╯°□°）╯︵┻━┻
-><(((*>
-""".split())
-
-
-__all__ = [ "EMOTICONS" ]
diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py
index b84adb2c4..4c6d0fad2 100644
--- a/spacy/language_data/tokenizer_exceptions.py
+++ b/spacy/language_data/tokenizer_exceptions.py
@@ -1,9 +1,13 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 # The use of this module turns out to be important, to avoid pathological
 # back-tracking. See Issue #957
 import regex
 
+from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
+
+
 # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
 # A few minor mods to this regex to account for use cases represented in test_urls
 _URL_PATTERN = (
@@ -51,4 +55,159 @@ _URL_PATTERN = (
 
 TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match
 
-__all__ = ['TOKEN_MATCH']
+
+# Maps an exact string (its ORTH value) to a list of token-attribute dicts.
+BASE_EXCEPTIONS = {}
+
+# Whitespace and dash entries carry a POS tag (and, for some, a LEMMA).
+for exc_data in [
+    {ORTH: " ", POS: SPACE},
+    {ORTH: "\t", POS: SPACE},
+    {ORTH: "\\t", POS: SPACE},  # literal backslash + "t", not a tab
+    {ORTH: "\n", POS: SPACE},
+    {ORTH: "\\n", POS: SPACE},  # literal backslash + "n", not a newline
+    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},  # U+2014 EM DASH
+    {ORTH: "\u00a0", POS: SPACE, LEMMA: "  "}]:  # U+00A0 NO-BREAK SPACE
+    BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]  # store a copy
+
+# Each string below is registered with a single dict carrying only ORTH.
+for orth in [
+    "'", "\\\")", "<space>", "''", "C++", "a.", "b.", "c.", "d.", "e.", "f.",
+    "g.", "h.", "i.", "j.", "k.", "l.", "m.", "n.", "o.", "p.", "q.", "r.",
+    "s.", "t.", "u.", "v.", "w.", "x.", "y.", "z.", "ä.", "ö.", "ü."]:
+    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+# Raw literal: the shrug emoticon holds a backslash that must stay literal.
+emoticons = set(r"""
+:)
+:-)
+:))
+:-))
+:)))
+:-)))
+(:
+(-:
+=)
+(=
+")
+:]
+:-]
+[:
+[-:
+:o)
+(o:
+:}
+:-}
+8)
+8-)
+(-8
+;)
+;-)
+(;
+(-;
+:(
+:-(
+:((
+:-((
+:(((
+:-(((
+):
+)-:
+=(
+>:(
+:')
+:'-)
+:'(
+:'-(
+:/
+:-/
+=/
+=|
+:|
+:-|
+:1
+:P
+:-P
+:p
+:-p
+:O
+:-O
+:o
+:-o
+:0
+:-0
+:()
+>:o
+:*
+:-*
+:3
+:-3
+=3
+:>
+:->
+:X
+:-X
+:x
+:-x
+:D
+:-D
+;D
+;-D
+=D
+xD
+XD
+xDD
+XDD
+8D
+8-D
+
+^_^
+^__^
+^___^
+>.<
+>.>
+<.<
+._.
+;_;
+-_-
+-__-
+v.v
+V.V
+v_v
+V_V
+o_o
+o_O
+O_o
+O_O
+0_o
+o_0
+0_0
+o.O
+O.o
+O.O
+o.o
+0.0
+o.0
+0.o
+@_@
+<3
+<33
+<333
+</3
+(^_^)
+(-_-)
+(._.)
+(>_<)
+(*_*)
+(¬_¬)
+ಠ_ಠ
+ಠ︵ಠ
+(ಠ_ಠ)
+¯\(ツ)/¯
+(╯°□°）╯︵┻━┻
+><(((*>
+""".split())
+
+# Each emoticon is registered with a single dict carrying only ORTH.
+for orth in emoticons:
+    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]