mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
Add update_exc and expand_exc to util
Doesn't require separate language data util anymore
This commit is contained in:
parent
6e5bd4f228
commit
60db497525
|
@ -3,5 +3,4 @@ from .emoticons import *
|
||||||
from .punctuation import *
|
from .punctuation import *
|
||||||
from .tag_map import *
|
from .tag_map import *
|
||||||
from .entity_rules import *
|
from .entity_rules import *
|
||||||
from .util import *
|
|
||||||
from .tokenizer_exceptions import *
|
from .tokenizer_exceptions import *
|
||||||
|
|
|
@ -1,52 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ..symbols import *
|
|
||||||
|
|
||||||
# Py2/Py3 compatibility shim: ``unicode`` exists only on Python 2; alias
# it to ``str`` on Python 3 so the isinstance checks below work on both.
try:
    unicode
except NameError:
    # FIX: was a bare ``except:`` -- only NameError is expected here, and a
    # bare except would also swallow KeyboardInterrupt/SystemExit.
    unicode = str


# Placeholder lemma values used by tokenizer exceptions for pronouns
# and determiners.
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
# Attribute key under which entity-rule IDs are stored.
ENT_ID = "ent_id"
|
|
||||||
|
|
||||||
|
|
||||||
def update_exc(exc, additions):
    """Validate ``additions`` and merge them into ``exc`` in place.

    Every key must spell out exactly the concatenation of its tokens'
    ORTH strings, and each ORTH value must be a unicode string; a
    ``ValueError`` is raised otherwise.  Keys already present in ``exc``
    trigger an assertion failure.  Returns ``None``.
    """
    for key, token_dicts in additions.items():
        if not all(isinstance(t[ORTH], unicode) for t in token_dicts):
            raise ValueError(
                "Invalid value for ORTH in exception: key='%s', orths='%s'"
                % (key, token_dicts))
        spelled = ''.join(t[ORTH] for t in token_dicts)
        if spelled != key:
            # TODO: Better error
            raise ValueError(
                "Invalid tokenizer exception: key='%s', orths='%s'"
                % (key, spelled))
    duplicates = set(exc.keys()) & set(additions)
    assert not duplicates, duplicates
    exc.update(additions)
|
|
||||||
|
|
||||||
|
|
||||||
def strings_to_exc(orths):
    """Build a tokenizer-exception dict that maps each string in
    ``orths`` to a single-token analysis whose ORTH is that string."""
    exceptions = {}
    for orth in orths:
        exceptions[orth] = [{ORTH: orth}]
    return exceptions
|
|
||||||
|
|
||||||
|
|
||||||
def expand_exc(excs, search, replace):
    """Return a new dict of exceptions derived from ``excs`` by
    substituting ``replace`` for ``search`` in both the keys and the
    token ORTH values.

    Entries whose key does not contain ``search`` are skipped, and
    ``excs`` itself is left unmodified.
    """
    expanded = {}
    for key, token_dicts in excs.items():
        if search not in key:
            continue
        expanded[key.replace(search, replace)] = [
            _fix_token(t, search, replace) for t in token_dicts]
    return expanded
|
|
||||||
|
|
||||||
|
|
||||||
def _fix_token(token, search, replace):
    """Return a copy of the token dict whose ORTH string has ``search``
    replaced by ``replace``; the input dict is never mutated."""
    patched = dict(token)
    patched[ORTH] = patched[ORTH].replace(search, replace)
    return patched
|
|
|
@ -9,7 +9,8 @@ from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
from .compat import path2str, basestring_, input_
|
from .symbols import ORTH
|
||||||
|
from .compat import path2str, basestring_, input_, unicode_
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
@ -77,6 +78,39 @@ def compile_infix_regex(entries):
|
||||||
return re.compile(expression)
|
return re.compile(expression)
|
||||||
|
|
||||||
|
|
||||||
|
def update_exc(base_exceptions, *addition_dicts):
    """Combine ``base_exceptions`` with any number of addition dicts and
    return a new merged exception dict (no input dict is modified).

    Each key must spell out exactly the concatenation of its tokens'
    ORTH strings, and every ORTH value must be a unicode string; a
    ``ValueError`` is raised otherwise.  Later addition dicts
    deliberately override earlier entries with the same key, so no
    overlap check is performed.  Finally, curly-apostrophe variants are
    generated for every exception containing a straight apostrophe.
    """
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, token_attrs))
            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                # TODO: Better error
                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, described_orth))
        exc.update(additions)
    # BUG FIX: expand_exc returns the new entries rather than mutating its
    # argument -- the original code computed the expansion and discarded it,
    # so "don't"-style exceptions never gained their "don’t" variants.
    exc.update(expand_exc(exc, "'", "’"))
    return exc
|
||||||
|
|
||||||
|
|
||||||
|
def expand_exc(excs, search, replace):
    """Return new exceptions derived from ``excs`` by substituting
    ``replace`` for ``search`` in both the keys and the token ORTH
    strings.

    Entries whose key does not contain ``search`` are skipped, and the
    input dict is left unmodified.
    """
    def _substitute(token_dict):
        # Copy first so the caller's token dicts are never mutated.
        patched = dict(token_dict)
        patched[ORTH] = patched[ORTH].replace(search, replace)
        return patched

    expanded = {}
    for key, token_dicts in excs.items():
        if search in key:
            expanded[key.replace(search, replace)] = [
                _substitute(t) for t in token_dicts]
    return expanded
|
||||||
|
|
||||||
|
|
||||||
def normalize_slice(length, start, stop, step=None):
|
def normalize_slice(length, start, stop, step=None):
|
||||||
if not (step is None or step == 1):
|
if not (step is None or step == 1):
|
||||||
raise ValueError("Stepped slices not supported in Span objects."
|
raise ValueError("Stepped slices not supported in Span objects."
|
||||||
|
|
Loading…
Reference in New Issue
Block a user