From 60db497525a88fd44351d91e260c36f7fabe878c Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:42:12 +0200
Subject: [PATCH] Add update_exc and expand_exc to util

Doesn't require separate language data util anymore
---
 spacy/language_data/__init__.py |  1 -
 spacy/language_data/util.py     | 52 ----------------------------------------------------
 spacy/util.py                   | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 54 deletions(-)
 delete mode 100644 spacy/language_data/util.py

diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index 2119c071b..20f9d4a87 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -3,5 +3,4 @@ from .emoticons import *
 from .punctuation import *
 from .tag_map import *
 from .entity_rules import *
-from .util import *
 from .tokenizer_exceptions import *
diff --git a/spacy/language_data/util.py b/spacy/language_data/util.py
deleted file mode 100644
index 10cd161aa..000000000
--- a/spacy/language_data/util.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..symbols import *
-
-try:
-    unicode
-except:
-    unicode = str
-
-
-PRON_LEMMA = "-PRON-"
-DET_LEMMA = "-DET-"
-ENT_ID = "ent_id"
-
-
-def update_exc(exc, additions):
-    for orth, token_attrs in additions.items():
-        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
-            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, token_attrs))
-        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
-        if orth != described_orth:
-            # TODO: Better error
-            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
-            raise ValueError(msg % (orth, described_orth))
-    overlap = set(exc.keys()).intersection(set(additions))
-    assert not overlap, overlap
-    exc.update(additions)
-
-
-def strings_to_exc(orths):
-    return {orth: [{ORTH: orth}] for orth in orths}
-
-
-def expand_exc(excs, search, replace):
-    updates = {}
-
-    for token_string, tokens in excs.items():
-        if search in token_string:
-            new_key = token_string.replace(search, replace)
-            new_value = [_fix_token(t, search, replace) for t in tokens]
-
-            updates[new_key] = new_value
-
-    return updates
-
-
-def _fix_token(token, search, replace):
-    fixed = dict(token)
-    fixed[ORTH] = fixed[ORTH].replace(search, replace)
-    return fixed
diff --git a/spacy/util.py b/spacy/util.py
index e7f52fda0..e6aa27680 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -9,7 +9,8 @@ from pathlib import Path
 import sys
 import textwrap
 
-from .compat import path2str, basestring_, input_
+from .symbols import ORTH
+from .compat import path2str, basestring_, input_, unicode_
 
 
 LANGUAGES = {}
@@ -77,6 +78,39 @@ def compile_infix_regex(entries):
     return re.compile(expression)
 
 
+def update_exc(base_exceptions, *addition_dicts):
+    exc = dict(base_exceptions)
+    for additions in addition_dicts:
+        for orth, token_attrs in additions.items():
+            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
+                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, token_attrs))
+            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+            if orth != described_orth:
+                # TODO: Better error
+                msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+                raise ValueError(msg % (orth, described_orth))
+        # overlap = set(exc.keys()).intersection(set(additions))
+        # assert not overlap, overlap
+        exc.update(additions)
+    exc.update(expand_exc(exc, "'", "’"))
+    return exc
+
+
+def expand_exc(excs, search, replace):
+    def _fix_token(token, search, replace):
+        fixed = dict(token)
+        fixed[ORTH] = fixed[ORTH].replace(search, replace)
+        return fixed
+    updates = {}
+    for token_string, tokens in excs.items():
+        if search in token_string:
+            new_key = token_string.replace(search, replace)
+            new_value = [_fix_token(t, search, replace) for t in tokens]
+            updates[new_key] = new_value
+    return updates
+
+
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError("Stepped slices not supported in Span objects."
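
Usage sketch (not part of the diff; the exception dicts below are
hypothetical): update_exc() checks that each entry's ORTH values
concatenate back to its key, then merges all addition dicts into a copy
of the base exceptions.

    from spacy.symbols import ORTH
    from spacy.util import update_exc

    # Hypothetical tokenizer exceptions: each key must equal its joined ORTHs.
    base = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
    extra = {"can't": [{ORTH: "ca"}, {ORTH: "n't"}]}

    exc = update_exc(base, extra)   # merged copy; base itself is untouched
    assert "don't" in exc and "can't" in exc

    # An entry whose ORTH values don't join back to its key is rejected:
    update_exc(base, {"won't": [{ORTH: "wo"}, {ORTH: "not"}]})
    # ValueError: Invalid tokenizer exception: key='won't', orths='wonot'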
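expand_exc() returns only the newly generated entries (here, keys with the
typographic apostrophe), which update_exc() folds back in via exc.update().
A sketch of the expansion on its own, using the same hypothetical dict:

    from spacy.symbols import ORTH
    from spacy.util import expand_exc

    expand_exc({"don't": [{ORTH: "do"}, {ORTH: "n't"}]}, "'", "’")
    # → {"don’t": [{ORTH: "do"}, {ORTH: "n’t"}]}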