diff --git a/spacy/errors.py b/spacy/errors.py
index b60fe690a..a557be2e8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -245,6 +245,8 @@ class Errors(object):
             "the meta.json. Vector names are required to avoid issue #1660.")
     E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
     E094 = ("Error reading line {line_num} in vectors file {loc}.")
+    E095 = ("Can't write to frozen dictionary. This is likely an internal "
+            "error. Are you writing to a default function argument?")


 @add_codes
diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd
index 38f2f0e4c..75870ef2f 100644
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@@ -19,12 +19,10 @@ cdef struct WeightsC:
     const float* feat_bias
     const float* hidden_bias
     const float* hidden_weights
-    const float* vectors


 cdef struct ActivationsC:
     int* token_ids
-    float* vectors
     float* unmaxed
     float* scores
     float* hiddens
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 962461417..a74878981 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
     cdef np.ndarray vec2scores_b = model.vec2scores.b
     output.hidden_weights = <const float*>vec2scores_W.data
     output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray tokvecs = model.tokvecs
-    output.vectors = <float*>tokvecs.data
     return output


@@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         return
     if A._max_size == 0:
         A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
         A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
         A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
         A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
@@ -81,8 +78,6 @@
     else:
         A.token_ids = <int*>realloc(A.token_ids,
             n.states * n.feats * sizeof(A.token_ids[0]))
-        A.vectors = <float*>realloc(A.vectors,
-            n.states * n.embed_width * sizeof(A.vectors[0]))
         A.scores = <float*>realloc(A.scores,
             n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
@@ -242,7 +237,7 @@ class ParserStepModel(Model):
     def begin_update(self, states, drop=0.):
         token_ids = self.get_token_ids(states)
         vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
-        mask = self.ops.get_dropout_mask(vector.shape, drop)
+        mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
         if mask is not None:
             vector *= mask
         scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
@@ -251,7 +246,7 @@
             d_vector = get_d_vector(d_scores, sgd=sgd)
             if mask is not None:
                 d_vector *= mask
-            if isinstance(self.ops, CupyOps) \
+            if isinstance(self.state2vec.ops, CupyOps) \
             and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
                 # Move token_ids and d_vector to GPU, asynchronously
                 self.backprops.append((
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 06f6a3d30..d9db0916b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
+from ...attrs import LEMMA

 import pytest
 import numpy
@@ -178,6 +179,26 @@
     doc.merge(8, 32, tag='', lemma='', ent_type='ORG')


+def test_doc_api_retokenizer(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7])
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+
+
+def test_doc_api_retokenizer_attrs(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    # test both string and integer attributes and values
+    attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+    assert doc[4].lemma_ == 'boys'
+    assert doc[4].ent_type_ == 'ORG'
+
+
 def test_doc_api_sents_empty_string(en_tokenizer):
     doc = en_tokenizer("")
     doc.is_parsed = True
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 00f724ed6..b405dd000 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
+from ..attrs import intify_attrs
+from ..util import SimpleFrozenDict


 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
     cdef list merges
     cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
         self.merges = []
         self.splits = []

-    def merge(self, Span span, attrs=None):
-        '''Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
+        """Mark a span for merging. The attrs will be applied to the resulting
+        token.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span.start_char, span.end_char, attrs))

-    def split(self, Token token, orths, attrs=None):
-        '''Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+        """Mark a Token for splitting, into the specified orths. The attrs
+        will be applied to each subtoken.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.splits.append((token.start_char, orths, attrs))

     def __enter__(self):
@@ -125,5 +131,3 @@
     # Clear the cached Python objects
     # Return the merged Python object
     return doc[start]
-
-
diff --git a/spacy/util.py b/spacy/util.py
index cc3e0d9ee..b80142c38 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -635,3 +635,18 @@ def use_gpu(gpu_id):
 def fix_random_seed(seed=0):
     random.seed(seed)
     numpy.random.seed(seed)
+
+
+class SimpleFrozenDict(dict):
+    """Simplified implementation of a frozen dict, mainly used as default
+    function or method argument (for arguments that should default to empty
+    dictionary). Will raise an error if user or spaCy attempts to add to dict.
+    """
+    def __setitem__(self, key, value):
+        raise NotImplementedError(Errors.E095)
+
+    def pop(self, key, default=None):
+        raise NotImplementedError(Errors.E095)
+
+    def update(self, other):
+        raise NotImplementedError(Errors.E095)