From 60db497525a88fd44351d91e260c36f7fabe878c Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:42:12 +0200
Subject: [PATCH] Add update_exc and expand_exc to util

Doesn't require separate language data util anymore
---
 spacy/language_data/__init__.py |  1 -
 spacy/language_data/util.py     | 52 ----------------------------------------------------
 spacy/util.py                   | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 54 deletions(-)
 delete mode 100644 spacy/language_data/util.py

diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index 2119c071b..20f9d4a87 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -3,5 +3,4 @@ from .emoticons import *
 from .punctuation import *
 from .tag_map import *
 from .entity_rules import *
-from .util import *
 from .tokenizer_exceptions import *
diff --git a/spacy/language_data/util.py b/spacy/language_data/util.py
deleted file mode 100644
index 10cd161aa..000000000
--- a/spacy/language_data/util.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..symbols import *
-
-try:
-    unicode
-except:
-    unicode = str
-
-
-PRON_LEMMA = "-PRON-"
-DET_LEMMA = "-DET-"
-ENT_ID = "ent_id"
-
-
-def update_exc(exc, additions):
-    for orth, token_attrs in additions.items():
-        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
-            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, token_attrs))
-        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
-        if orth != described_orth:
-            # TODO: Better error
-            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, described_orth))
-    overlap = set(exc.keys()).intersection(set(additions))
-    assert not overlap, overlap
-    exc.update(additions)
-
-
-def strings_to_exc(orths):
-    return {orth: [{ORTH: orth}] for orth in orths}
-
-
-def expand_exc(excs, search, replace):
-    updates = {}
-
-    for token_string, tokens in excs.items():
-        if search in token_string:
-            new_key = token_string.replace(search, replace)
-            new_value = [_fix_token(t, search, replace) for t in tokens]
-
-            updates[new_key] = new_value
-
-    return updates
-
-
-def _fix_token(token, search, replace):
-    fixed = dict(token)
-    fixed[ORTH] = fixed[ORTH].replace(search, replace)
-    return fixed
diff --git a/spacy/util.py b/spacy/util.py
index e7f52fda0..e6aa27680 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -9,7 +9,8 @@ from pathlib import Path
 import sys
 import textwrap
 
-from .compat import path2str, basestring_, input_
+from .symbols import ORTH
+from .compat import path2str, basestring_, input_, unicode_
 
 
 LANGUAGES = {}
@@ -77,6 +78,39 @@ def compile_infix_regex(entries):
     return re.compile(expression)
 
 
+def update_exc(base_exceptions, *addition_dicts):
+    exc = dict(base_exceptions)
+    for additions in addition_dicts:
+        for orth, token_attrs in additions.items():
+            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
+                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, token_attrs))
+            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+            if orth != described_orth:
+                # TODO: Better error
+                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, described_orth))
+        # overlap = set(exc.keys()).intersection(set(additions))
+        # assert not overlap, overlap
+        exc.update(additions)
+    exc.update(expand_exc(exc, "'", "’"))
+    return exc
+
+
+def expand_exc(excs, search, replace):
+    def _fix_token(token, search, replace):
+        fixed = dict(token)
+        fixed[ORTH] = fixed[ORTH].replace(search, replace)
+        return fixed
+    updates = {}
+    for token_string, tokens in excs.items():
+        if search in token_string:
+            new_key = token_string.replace(search, replace)
+            new_value = [_fix_token(t, search, replace) for t in tokens]
+            updates[new_key] = new_value
+    return updates
+
+
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError("Stepped slices not supported in Span objects."
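
Usage sketch (not part of the diff; the exception dicts below are
hypothetical): update_exc() checks that each entry's ORTH values
concatenate back to its key, then merges all addition dicts into a copy
of the base exceptions.

    from spacy.symbols import ORTH
    from spacy.util import update_exc

    # Hypothetical tokenizer exceptions: each key must equal its joined ORTHs.
    base = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
    extra = {"can't": [{ORTH: "ca"}, {ORTH: "n't"}]}

    exc = update_exc(base, extra)   # merged copy; base itself is untouched
    assert "don't" in exc and "can't" in exc

    # An entry whose ORTH values don't join back to its key is rejected:
    update_exc(base, {"won't": [{ORTH: "wo"}, {ORTH: "not"}]})
    # ValueError: Invalid tokenizer exception: key='won't', orths='wonot'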
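expand_exc() returns only the newly generated entries (here, keys with the
typographic apostrophe), which update_exc() folds back in via exc.update().
A sketch of the expansion on its own, using the same hypothetical dict:

    from spacy.symbols import ORTH
    from spacy.util import expand_exc

    expand_exc({"don't": [{ORTH: "do"}, {ORTH: "n't"}]}, "'", "’")
    # → {"don’t": [{ORTH: "do"}, {ORTH: "n’t"}]}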