Mirror of https://github.com/explosion/spaCy.git
Synced 2024-12-25 17:36:30 +03:00
Add update_exc and expand_exc to util

The language data no longer requires a separate util module.
parent 6e5bd4f228
commit 60db497525
spacy/language_data/__init__.py
@@ -3,5 +3,4 @@ from .emoticons import *
 from .punctuation import *
 from .tag_map import *
 from .entity_rules import *
-from .util import *
 from .tokenizer_exceptions import *
spacy/language_data/util.py
@@ -1,52 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..symbols import *
-
-try:
-    unicode
-except:
-    unicode = str
-
-
-PRON_LEMMA = "-PRON-"
-DET_LEMMA = "-DET-"
-ENT_ID = "ent_id"
-
-
-def update_exc(exc, additions):
-    for orth, token_attrs in additions.items():
-        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
-            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, token_attrs))
-        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
-        if orth != described_orth:
-            # TODO: Better error
-            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, described_orth))
-    overlap = set(exc.keys()).intersection(set(additions))
-    assert not overlap, overlap
-    exc.update(additions)
-
-
-def strings_to_exc(orths):
-    return {orth: [{ORTH: orth}] for orth in orths}
-
-
-def expand_exc(excs, search, replace):
-    updates = {}
-
-    for token_string, tokens in excs.items():
-        if search in token_string:
-            new_key = token_string.replace(search, replace)
-            new_value = [_fix_token(t, search, replace) for t in tokens]
-
-            updates[new_key] = new_value
-
-    return updates
-
-
-def _fix_token(token, search, replace):
-    fixed = dict(token)
-    fixed[ORTH] = fixed[ORTH].replace(search, replace)
-    return fixed
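
For reference, a minimal sketch of what expand_exc does (the exception entry below is invented for illustration; the import path reflects the helper's new home in spacy.util after this commit): it returns a dict of variant entries in which search is replaced by replace in both the key and each token's ORTH value.

# Sketch only: illustrates expand_exc semantics with a made-up entry.
from spacy.symbols import ORTH
from spacy.util import expand_exc

exc = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
variants = expand_exc(exc, "'", "’")
# variants == {"don’t": [{ORTH: "do"}, {ORTH: "n’t"}]}
# Only the substituted copies are returned; the input dict is not modified.
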
spacy/util.py
@@ -9,7 +9,8 @@ from pathlib import Path
 import sys
 import textwrap
 
-from .compat import path2str, basestring_, input_
+from .symbols import ORTH
+from .compat import path2str, basestring_, input_, unicode_
 
 
 LANGUAGES = {}
@@ -77,6 +78,39 @@ def compile_infix_regex(entries):
     return re.compile(expression)
 
 
+def update_exc(base_exceptions, *addition_dicts):
+    exc = dict(base_exceptions)
+    for additions in addition_dicts:
+        for orth, token_attrs in additions.items():
+            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
+                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, token_attrs))
+            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+            if orth != described_orth:
+                # TODO: Better error
+                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, described_orth))
+        # overlap = set(exc.keys()).intersection(set(additions))
+        # assert not overlap, overlap
+        exc.update(additions)
+    expand_exc(exc, "'", "’")
+    return exc
+
+
+def expand_exc(excs, search, replace):
+    def _fix_token(token, search, replace):
+        fixed = dict(token)
+        fixed[ORTH] = fixed[ORTH].replace(search, replace)
+        return fixed
+    updates = {}
+    for token_string, tokens in excs.items():
+        if search in token_string:
+            new_key = token_string.replace(search, replace)
+            new_value = [_fix_token(t, search, replace) for t in tokens]
+            updates[new_key] = new_value
+    return updates
+
+
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError("Stepped slices not supported in Span objects."
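
For context, a hedged usage sketch of the new variadic update_exc (the exception data is invented for illustration): each key must equal the concatenation of its tokens' ORTH strings, otherwise a ValueError is raised; valid additions are merged over a copy of the base table.

# Sketch only: exercising the new update_exc signature with invented data.
from spacy.symbols import ORTH
from spacy.util import update_exc

BASE_EXCEPTIONS = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
EXTRA_EXCEPTIONS = {"can't": [{ORTH: "ca"}, {ORTH: "n't"}]}

exc = update_exc(BASE_EXCEPTIONS, EXTRA_EXCEPTIONS)
assert "don't" in exc and "can't" in exc

# A key whose ORTH pieces do not re-join to the key fails validation:
# update_exc({}, {"dont": [{ORTH: "do"}, {ORTH: "n't"}]})  # -> ValueError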