From 0d07d7fc80b6b1a16a164326b1b144962c743dca Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 7 Dec 2016 21:11:59 +0100
Subject: [PATCH] Apply emoticon exceptions to tokenizer

---
 spacy/en/language_data.py       | 8 +++++---
 spacy/language_data/__init__.py | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index f63b88b24..929674e6c 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 import re
 from ..symbols import *
+from ..language_data import EMOTICONS
 
 
 PRON_LEMMA = "-PRON-"
@@ -2017,9 +2018,10 @@ self_map = [
     "z."
 ]
 
-overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(self_map))
-assert not overlap, overlap
-TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in self_map})
+for orths in [self_map, EMOTICONS]:
+    overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
+    assert not overlap, overlap
+    TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
 
 
 TOKENIZER_PREFIXES = r'''
diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index e69de29bb..42bdf1a9a 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -0,0 +1 @@
+from .emoticons import *