Add update_exc and expand_exc to util

No longer requires a separate language data util module
This commit is contained in:
ines 2017-05-08 15:42:12 +02:00
parent 6e5bd4f228
commit 60db497525
3 changed files with 35 additions and 54 deletions

View File

@@ -3,5 +3,4 @@ from .emoticons import *
from .punctuation import *
from .tag_map import *
from .entity_rules import *
from .util import *
from .tokenizer_exceptions import *

View File

@@ -1,52 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
try:
unicode
except:
unicode = str
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
ENT_ID = "ent_id"
def update_exc(exc, additions):
    """Merge tokenizer exceptions from `additions` into `exc`, in place.

    exc (dict): Existing exceptions, mapping each exception string to a list
        of token attribute dicts.
    additions (dict): New exceptions to merge in, in the same format.
    Raises ValueError: If a token's ORTH value is not a string, if the
        concatenated ORTH values of an entry don't reproduce its key, or if
        a key in `additions` already exists in `exc`.
    """
    for orth, token_attrs in additions.items():
        # Every token's ORTH must be a string (unicode on Python 2).
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        # The token ORTH values, concatenated, must spell out the key exactly.
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            # TODO: Better error
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    # Raise explicitly instead of using `assert`: asserts are stripped when
    # Python runs with -O, which would silently allow overlapping keys.
    overlap = set(exc).intersection(additions)
    if overlap:
        raise ValueError("Overlapping tokenizer exception keys: %s" % overlap)
    exc.update(additions)
def strings_to_exc(orths):
    """Build a tokenizer exception dict from plain strings: each string maps
    to a single token whose ORTH is the string itself.
    """
    exceptions = {}
    for string in orths:
        exceptions[string] = [{ORTH: string}]
    return exceptions
def expand_exc(excs, search, replace):
    """Return a new exception dict with `search` replaced by `replace` in
    both the keys and the per-token ORTH values.

    Entries whose key does not contain `search` are omitted; `excs` itself
    is left untouched (token dicts are copied before modification).
    """
    return {
        key.replace(search, replace): [_fix_token(token, search, replace)
                                       for token in tokens]
        for key, tokens in excs.items()
        if search in key
    }
def _fix_token(token, search, replace):
    """Return a copy of a token attribute dict with `search` replaced by
    `replace` in its ORTH value. The original dict is not modified.
    """
    new_orth = token[ORTH].replace(search, replace)
    patched = dict(token)
    patched[ORTH] = new_orth
    return patched

View File

@@ -9,7 +9,8 @@ from pathlib import Path
import sys
import textwrap
from .compat import path2str, basestring_, input_
from .symbols import ORTH
from .compat import path2str, basestring_, input_, unicode_
LANGUAGES = {}
@@ -77,6 +78,39 @@ def compile_infix_regex(entries):
return re.compile(expression)
def update_exc(base_exceptions, *addition_dicts):
    """Combine base tokenizer exceptions with any number of addition dicts.

    base_exceptions (dict): Base exceptions, mapping each exception string to
        a list of token attribute dicts. Not modified.
    *addition_dicts (dicts): Exception dicts to merge in, in order; later
        dicts overwrite earlier entries with the same key.
    RETURNS (dict): The combined exceptions.
    Raises ValueError: If a token's ORTH value is not a string, or if the
        concatenated ORTH values of an entry don't reproduce its key.
    """
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            # Every token's ORTH must be a string (unicode on Python 2).
            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, token_attrs))
            # The token ORTH values, concatenated, must spell out the key.
            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                # TODO: Better error
                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, described_orth))
        exc.update(additions)
    # Removed dead call `expand_exc(exc, "'", "")`: expand_exc has no side
    # effects (it copies each token dict and returns a new dict), so calling
    # it and discarding the result did nothing but waste work. Also dropped
    # the commented-out overlap assert.
    return exc
def expand_exc(excs, search, replace):
    """Return new exception entries with `search` replaced by `replace` in
    both the keys and the per-token ORTH values.

    Only entries whose key contains `search` are included in the result;
    `excs` itself is left untouched (token dicts are copied before being
    modified).
    """
    def _substitute(token):
        # Copy the attribute dict so the original exception is preserved.
        patched = dict(token)
        patched[ORTH] = patched[ORTH].replace(search, replace)
        return patched

    return {
        key.replace(search, replace): [_substitute(token) for token in tokens]
        for key, tokens in excs.items()
        if search in key
    }
def normalize_slice(length, start, stop, step=None):
if not (step is None or step == 1):
raise ValueError("Stepped slices not supported in Span objects."