# coding: utf8
from __future__ import unicode_literals
from ..symbols import *

# Python 2/3 compatibility: Python 3 has no `unicode` builtin, so alias
# it to `str` there.
try:
    unicode
except NameError:
    unicode = str

# Shared lemma placeholders and attribute keys used by the language data.
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
ENT_ID = "ent_id"


def update_exc(exc, additions):
    """Merge the tokenizer exceptions in `additions` into `exc` in place,
    validating each entry first."""
    for orth, token_attrs in additions.items():
        # Every token in an exception must carry a unicode ORTH value.
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        # The token ORTH values must concatenate back to the exception key.
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            # TODO: Better error
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    # Refuse to silently overwrite exceptions that are already defined.
    overlap = set(exc.keys()).intersection(set(additions))
    assert not overlap, overlap
    exc.update(additions)
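
# A minimal usage sketch (the keys below are illustrative, not part of any
# shipped exception table): each exception maps a surface string to the
# tokens it should split into.
#
#     exc = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
#     update_exc(exc, {"can't": [{ORTH: "ca"}, {ORTH: "n't"}]})
#     # `exc` now holds both entries; a duplicate key would trip the
#     # overlap assertion above.
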

def strings_to_exc(orths):
    """Create a trivial exception for each string: the string maps to a
    single token whose ORTH is the string itself."""
    return {orth: [{ORTH: orth}] for orth in orths}
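
# For example (illustrative input), strings_to_exc(["a.m.", "p.m."]) would
# return {"a.m.": [{ORTH: "a.m."}], "p.m.": [{ORTH: "p.m."}]}, keeping each
# abbreviation as a single token.
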

def expand_exc(excs, search, replace):
    """Return a new dict of exceptions with `search` replaced by `replace`
    in both the keys and the token ORTH values. `excs` is not modified."""
    updates = {}
    for token_string, tokens in excs.items():
        if search in token_string:
            new_key = token_string.replace(search, replace)
            new_value = [_fix_token(t, search, replace) for t in tokens]
            updates[new_key] = new_value
    return updates
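
# A sketch of the intended use (hypothetical data): generate typographic
# variants of existing exceptions, e.g. curly-apostrophe forms from the
# straight-apostrophe originals.
#
#     excs = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
#     excs.update(expand_exc(excs, "'", "’"))
#     # excs now also contains "don’t", split into "do" + "n’t".
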

def _fix_token(token, search, replace):
    # Copy the token dict so the original exception is left untouched.
    fixed = dict(token)
    fixed[ORTH] = fixed[ORTH].replace(search, replace)
    return fixed
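
# For instance (illustrative values), _fix_token({ORTH: "n't"}, "'", "’")
# returns a new dict {ORTH: "n’t"} without mutating the input token.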