mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Add update_exc and expand_exc to util
Doesn't require separate language data util anymore
This commit is contained in:
parent
6e5bd4f228
commit
60db497525
|
@ -3,5 +3,4 @@ from .emoticons import *
|
|||
from .punctuation import *
|
||||
from .tag_map import *
|
||||
from .entity_rules import *
|
||||
from .util import *
|
||||
from .tokenizer_exceptions import *
|
||||
|
|
|
@ -1,52 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..symbols import *
|
||||
|
||||
# Python 2/3 compatibility: Python 3 has no ``unicode`` builtin, so looking
# the name up raises NameError and ``str`` is used as the text type instead.
try:
    unicode
except NameError:
    unicode = str


# Module-level string constants.
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
ENT_ID = "ent_id"
|
||||
|
||||
|
||||
def update_exc(exc, additions):
    """Validate ``additions`` and merge them into ``exc`` in place.

    Each key of ``additions`` maps to a sequence of token attribute dicts.
    Every dict's ORTH value must be a unicode string, and the ORTH values
    joined together must be equal to the key.

    exc (dict): Existing exceptions; updated in place.
    additions (dict): New exceptions to validate and add.
    RAISES (ValueError): If an ORTH value is not a unicode string, or if the
        joined ORTH values differ from the key.
    RAISES (AssertionError): If a key of ``additions`` already exists in
        ``exc``.
    """
    for key, specs in additions.items():
        if any(not isinstance(spec[ORTH], unicode) for spec in specs):
            raise ValueError(
                "Invalid value for ORTH in exception: key='%s', orths='%s'"
                % (key, specs))
        joined = ''.join(spec[ORTH] for spec in specs)
        if key != joined:
            # TODO: Better error
            raise ValueError(
                "Invalid tokenizer exception: key='%s', orths='%s'"
                % (key, joined))
    # Existing entries must not be redefined by the additions.
    clashing = set(exc.keys()) & set(additions)
    assert not clashing, clashing
    exc.update(additions)
|
||||
|
||||
|
||||
def strings_to_exc(orths):
    """Turn an iterable of strings into single-token exception entries.

    orths (iterable): The strings to convert.
    RETURNS (dict): Maps each string to a one-element list holding a dict
        whose ORTH value is that same string.
    """
    exceptions = {}
    for text in orths:
        exceptions[text] = [{ORTH: text}]
    return exceptions
|
||||
|
||||
|
||||
def expand_exc(excs, search, replace):
    """Build variants of exceptions with ``search`` swapped for ``replace``.

    For every key of ``excs`` that contains ``search``, a new entry is
    produced whose key, and whose tokens (via ``_fix_token``), have
    ``search`` replaced by ``replace``. ``excs`` itself is not modified.

    excs (dict): Exceptions, keyed by string, with lists of token dicts.
    search (unicode): Substring to look for in the keys.
    replace (unicode): Replacement for ``search``.
    RETURNS (dict): Only the newly created entries.
    """
    return {
        text.replace(search, replace): [
            _fix_token(tok, search, replace) for tok in tokens
        ]
        for text, tokens in excs.items()
        if search in text
    }
|
||||
|
||||
|
||||
def _fix_token(token, search, replace):
    """Return a copy of ``token`` with ``search`` replaced in its ORTH value.

    token (dict): Token attributes; must contain an ORTH entry.
    search (unicode): Substring to replace.
    replace (unicode): Replacement text.
    RETURNS (dict): A new dict; the input dict is left unchanged.
    """
    patched = dict(token)
    original_orth = patched[ORTH]
    patched[ORTH] = original_orth.replace(search, replace)
    return patched
|
|
@ -9,7 +9,8 @@ from pathlib import Path
|
|||
import sys
|
||||
import textwrap
|
||||
|
||||
from .compat import path2str, basestring_, input_
|
||||
from .symbols import ORTH
|
||||
from .compat import path2str, basestring_, input_, unicode_
|
||||
|
||||
|
||||
LANGUAGES = {}
|
||||
|
@ -77,6 +78,39 @@ def compile_infix_regex(entries):
|
|||
return re.compile(expression)
|
||||
|
||||
|
||||
def update_exc(base_exceptions, *addition_dicts):
    """Combine tokenizer exception dicts into a new, validated dict.

    Each key of an addition dict maps to a sequence of token attribute
    dicts. Every dict's ORTH value must be a unicode string, and the ORTH
    values joined together must be equal to the key. After merging, a
    variant using the typographic apostrophe (’) is added for every entry
    whose key contains a straight apostrophe (').

    base_exceptions (dict): Starting exceptions; copied, not modified.
    *addition_dicts (dict): Exception dicts to validate and merge, in order.
    RETURNS (dict): The combined exceptions.
    RAISES (ValueError): If an ORTH value is not a unicode string, or if the
        joined ORTH values differ from the key.
    """
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, token_attrs))
            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                # TODO: Better error
                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
                raise ValueError(msg % (orth, described_orth))
        # NOTE: overlapping keys are not rejected; entries from later dicts
        # replace earlier ones.
        exc.update(additions)
    # expand_exc() returns the new entries instead of modifying its argument,
    # so they have to be merged explicitly; previously the result was dropped.
    exc.update(expand_exc(exc, "'", "’"))
    return exc
|
||||
|
||||
|
||||
def expand_exc(excs, search, replace):
    """Build variants of exceptions with ``search`` swapped for ``replace``.

    For every key of ``excs`` that contains ``search``, a new entry is
    produced whose key, and whose tokens' ORTH values, have ``search``
    replaced by ``replace``. ``excs`` itself is not modified.

    excs (dict): Exceptions, keyed by string, with lists of token dicts.
    search (unicode): Substring to look for in the keys.
    replace (unicode): Replacement for ``search``.
    RETURNS (dict): Only the newly created entries.
    """
    def _swap_orth(token_attrs):
        # Work on a copy so the caller's token dict stays untouched.
        patched = dict(token_attrs)
        patched[ORTH] = patched[ORTH].replace(search, replace)
        return patched

    return {
        text.replace(search, replace): [_swap_orth(tok) for tok in tokens]
        for text, tokens in excs.items()
        if search in text
    }
|
||||
|
||||
|
||||
def normalize_slice(length, start, stop, step=None):
|
||||
if not (step is None or step == 1):
|
||||
raise ValueError("Stepped slices not supported in Span objects."
|
||||
|
|
Loading…
Reference in New Issue
Block a user