spaCy/spacy/language_data/util.py

# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *

# Python 2/3 compatibility: when the ``unicode`` builtin does not exist,
# alias the name to ``str`` so the isinstance checks below still work.
# Only NameError indicates the missing builtin; a bare ``except`` would also
# swallow unrelated errors such as KeyboardInterrupt or SystemExit.
try:
    unicode
except NameError:
    unicode = str


# Module-level string constants.
# NOTE(review): the names suggest placeholder lemmas for pronouns and
# determiners, and an attribute key for entity IDs -- confirm against callers.
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
ENT_ID = "ent_id"


def update_exc(exc, additions):
    """Validate new tokenizer exceptions and merge them into ``exc`` in place.

    Each entry of ``additions`` maps an exception string to a sequence of
    token-attribute dicts. Every dict must hold a unicode string under
    ``ORTH``, and those ``ORTH`` values joined in order must equal the key.
    All checks run before ``exc`` is modified, so a failure leaves ``exc``
    untouched.

    exc (dict): Existing exceptions; updated in place.
    additions (dict): New exceptions to validate and add.
    RAISES (ValueError): If an ``ORTH`` value is not a unicode string, or if
        the joined ``ORTH`` values differ from the entry's key.
    RAISES (AssertionError): If a key of ``additions`` already exists in
        ``exc``.
    """
    for orth, token_attrs in additions.items():
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            # TODO: Better error
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    overlap = set(exc).intersection(additions)
    if overlap:
        # Raised explicitly rather than via ``assert``: assert statements are
        # removed under ``python -O``, which would let duplicate keys silently
        # overwrite existing exceptions. The exception type is unchanged.
        raise AssertionError(overlap)
    exc.update(additions)


def strings_to_exc(orths):
    """Turn an iterable of strings into a dict of single-token exceptions.

    Each string becomes a key whose value is a one-element list containing a
    dict that stores the same string under ``ORTH``.
    """
    exceptions = {}
    for string in orths:
        exceptions[string] = [{ORTH: string}]
    return exceptions


def expand_exc(excs, search, replace):
    """Return variants of the exceptions whose key contains ``search``.

    For every entry of ``excs`` whose key contains ``search``, a new entry is
    produced with ``search`` replaced by ``replace`` in the key and in each
    token's ``ORTH`` value. ``excs`` itself is not modified; only the new
    entries are returned.
    """
    return {
        key.replace(search, replace): [
            _fix_token(attrs, search, replace) for attrs in token_list
        ]
        for key, token_list in excs.items()
        if search in key
    }


def _fix_token(token, search, replace):
    """Return a copy of ``token`` with ``search`` replaced in its ``ORTH``.

    The input is copied into a new dict first, so the caller's token is left
    unchanged.
    """
    patched = dict(token)
    patched.update({ORTH: patched[ORTH].replace(search, replace)})
    return patched
Add global language data utils 2016-12-17 14:27:41 +03:00			`# encoding: utf8`
			`from __future__ import unicode_literals`

			`from ..symbols import *`

Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`try:`
			`unicode`
			`except:`
			`unicode = str`

Add global language data utils 2016-12-17 14:27:41 +03:00
			`PRON_LEMMA = "-PRON-"`
Add DET_LEMMA constant 2016-12-21 20:05:41 +03:00			`DET_LEMMA = "-DET-"`
Add ENT_ID constant 2016-12-18 17:34:21 +03:00			`ENT_ID = "ent_id"`
Add global language data utils 2016-12-17 14:27:41 +03:00

			`def update_exc(exc, additions):`
Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`for orth, token_attrs in additions.items():`
			`if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):`
			`msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"`
			`raise ValueError(msg % (orth, token_attrs))`
			`described_orth = ''.join(attr[ORTH] for attr in token_attrs)`
			`if orth != described_orth:`
			`# TODO: Better error`
			`msg = "Invalid tokenizer exception: key='%s', orths='%s'"`
			`raise ValueError(msg % (orth, described_orth))`
Add global language data utils 2016-12-17 14:27:41 +03:00			`overlap = set(exc.keys()).intersection(set(additions))`
			`assert not overlap, overlap`
			`exc.update(additions)`


			`def strings_to_exc(orths):`
			`return {orth: [{ORTH: orth}] for orth in orths}`


			`def expand_exc(excs, search, replace):`
			`updates = {}`

			`for token_string, tokens in excs.items():`
			`if search in token_string:`
			`new_key = token_string.replace(search, replace)`
			`new_value = [_fix_token(t, search, replace) for t in tokens]`

			`updates[new_key] = new_value`

			`return updates`


			`def _fix_token(token, search, replace):`
			`fixed = dict(token)`
			`fixed[ORTH] = fixed[ORTH].replace(search, replace)`
			`return fixed`