spaCy/spacy/language_data/util.py

# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


# Module-level string constants.
# NOTE(review): presumably PRON_LEMMA and DET_LEMMA are placeholder lemma
# values for pronouns and determiners -- confirm against the callers.
PRON_LEMMA = "-PRON-"
DET_LEMMA = "-DET-"
# NOTE(review): looks like the key/name used for an entity ID -- confirm usage.
ENT_ID = "ent_id"


def update_exc(exc, additions):
    """Merge `additions` into `exc` in place, refusing to overwrite entries.

    exc (dict): Mapping to extend. It is modified in place.
    additions (dict): Entries to add. None of its keys may already be
        present in `exc`.
    RAISES (AssertionError): If `exc` and `additions` share any key. The set
        of clashing keys is passed as the exception argument and `exc` is
        left unmodified.
    """
    overlap = set(exc).intersection(additions)
    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped when Python runs with -O, which would let clashing keys
    # silently overwrite existing entries. The exception type and payload
    # are the same as the assert produced.
    if overlap:
        raise AssertionError(overlap)
    exc.update(additions)


def strings_to_exc(orths):
    """Build a mapping with one entry per string in `orths`.

    orths (iterable): Strings to turn into entries.
    RETURNS (dict): Maps each string to a one-element list holding a dict
        whose only item is `ORTH` mapped to that same string.
    """
    result = {}
    for text in orths:
        result[text] = [{ORTH: text}]
    return result


def expand_exc(excs, search, replace):
    """Create variants of the entries whose key contains `search`.

    excs (dict): Maps a string to a list of token dicts. It is not modified.
    search (unicode): Substring to look for in each key.
    replace (unicode): Text substituted for every occurrence of `search`.
    RETURNS (dict): Only the rewritten entries. Each key has `search`
        replaced by `replace`, and each token in the value has been passed
        through `_fix_token` with the same arguments. Entries whose key
        does not contain `search` are omitted.
    """
    expanded = {}
    for key, token_list in excs.items():
        if search not in key:
            continue
        rewritten = []
        for tok in token_list:
            rewritten.append(_fix_token(tok, search, replace))
        expanded[key.replace(search, replace)] = rewritten
    return expanded


def _fix_token(token, search, replace):
    """Return a copy of `token` with `search` replaced in its ORTH value.

    token (dict): Token attributes. Must contain the `ORTH` key. The input
        is not modified.
    search (unicode): Substring to replace.
    replace (unicode): Replacement text.
    RETURNS (dict): A shallow copy of `token` whose `ORTH` value has every
        occurrence of `search` replaced by `replace`.
    """
    patched = dict(token)
    current_orth = patched[ORTH]
    patched[ORTH] = current_orth.replace(search, replace)
    return patched
Add global language data utils 2016-12-17 14:27:41 +03:00			`# encoding: utf8`
			`from __future__ import unicode_literals`

			`from ..symbols import *`


			`PRON_LEMMA = "-PRON-"`
Add DET_LEMMA constant 2016-12-21 20:05:41 +03:00			`DET_LEMMA = "-DET-"`
Add ENT_ID constant 2016-12-18 17:34:21 +03:00			`ENT_ID = "ent_id"`
Add global language data utils 2016-12-17 14:27:41 +03:00

			`def update_exc(exc, additions):`
			`overlap = set(exc.keys()).intersection(set(additions))`
			`assert not overlap, overlap`
			`exc.update(additions)`


			`def strings_to_exc(orths):`
			`return {orth: [{ORTH: orth}] for orth in orths}`


			`def expand_exc(excs, search, replace):`
			`updates = {}`

			`for token_string, tokens in excs.items():`
			`if search in token_string:`
			`new_key = token_string.replace(search, replace)`
			`new_value = [_fix_token(t, search, replace) for t in tokens]`

			`updates[new_key] = new_value`

			`return updates`


			`def _fix_token(token, search, replace):`
			`fixed = dict(token)`
			`fixed[ORTH] = fixed[ORTH].replace(search, replace)`
			`return fixed`