From 0d07d7fc80b6b1a16a164326b1b144962c743dca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 7 Dec 2016 21:11:59 +0100 Subject: [PATCH] Apply emoticon exceptions to tokenizer --- spacy/en/language_data.py | 8 +++++--- spacy/language_data/__init__.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index f63b88b24..929674e6c 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re from ..symbols import * +from ..language_data import EMOTICONS PRON_LEMMA = "-PRON-" @@ -2017,9 +2018,10 @@ self_map = [ "z." ] -overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(self_map)) -assert not overlap, overlap -TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in self_map}) +for orths in [self_map, EMOTICONS]: + overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths)) + assert not overlap, overlap + TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths}) TOKENIZER_PREFIXES = r''' diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index e69de29bb..42bdf1a9a 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -0,0 +1 @@ +from .emoticons import *