Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2018-05-20 18:59:24 +02:00
commit bdc23dd8c1
6 changed files with 54 additions and 19 deletions


@@ -245,6 +245,8 @@ class Errors(object):
"the meta.json. Vector names are required to avoid issue #1660.")
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
E094 = ("Error reading line {line_num} in vectors file {loc}.")
E095 = ("Can't write to frozen dictionary. This is likely an internal "
"error. Are you writing to a default function argument?")
@add_codes
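
E095 is the message raised by the SimpleFrozenDict helper added at the end of this commit whenever something writes to the frozen default. A generic Python illustration (not part of the diff) of the mutable-default-argument pitfall the message asks about:

def remember(token, seen={}):  # bad: one dict object is shared by every call
    seen[token] = True
    return seen

remember("a")   # {'a': True}
remember("b")   # {'a': True, 'b': True} -- state from the first call leaks in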


@@ -19,12 +19,10 @@ cdef struct WeightsC:
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const float* vectors
cdef struct ActivationsC:
int* token_ids
float* vectors
float* unmaxed
float* scores
float* hiddens


@@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
cdef np.ndarray vec2scores_b = model.vec2scores.b
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray tokvecs = model.tokvecs
output.vectors = <float*>tokvecs.data
return output
@@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
@@ -81,8 +78,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.vectors = <float*>realloc(A.vectors,
n.states * n.embed_width * sizeof(A.vectors[0]))
A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
@@ -242,7 +237,7 @@ class ParserStepModel(Model):
def begin_update(self, states, drop=0.):
token_ids = self.get_token_ids(states)
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
mask = self.ops.get_dropout_mask(vector.shape, drop)
mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
if mask is not None:
vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
@@ -251,7 +246,7 @@ class ParserStepModel(Model):
d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None:
d_vector *= mask
if isinstance(self.ops, CupyOps) \
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
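
The two changes above swap self.ops for self.vec2scores.ops (when building the dropout mask) and self.state2vec.ops (when checking whether token_ids need to be moved to the GPU), so the mask and the device check come from the model that actually owns the arrays. A minimal sketch of the ops/dropout interaction, assuming Thinc's NumpyOps.get_dropout_mask(shape, drop) as used by spaCy 2.x; not the spaCy implementation itself:

from thinc.neural.ops import NumpyOps
import numpy

ops = NumpyOps()
vector = numpy.ones((4, 8), dtype="f")

mask = ops.get_dropout_mask(vector.shape, 0.0)
assert mask is None            # drop=0.0 returns None, so the no-dropout path is free

mask = ops.get_dropout_mask(vector.shape, 0.2)
if mask is not None:           # the mask is allocated by these ops, so it
    vector *= mask             # lives on the same device as `vector`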


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from ..util import get_doc
from ...tokens import Doc
from ...vocab import Vocab
from ...attrs import LEMMA
import pytest
import numpy
@@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
def test_doc_api_retokenizer(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7])
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
def test_doc_api_retokenizer_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
# test both string and integer attributes and values
attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].lemma_ == 'boys'
assert doc[4].ent_type_ == 'ORG'
def test_doc_api_sents_empty_string(en_tokenizer):
doc = en_tokenizer("")
doc.is_parsed = True


@@ -11,11 +11,13 @@ from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC
from ..attrs cimport *
from ..attrs cimport TAG
from ..attrs import intify_attrs
from ..util import SimpleFrozenDict
cdef class Retokenizer:
'''Helper class for doc.retokenize() context manager.'''
"""Helper class for doc.retokenize() context manager."""
cdef Doc doc
cdef list merges
cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
self.merges = []
self.splits = []
def merge(self, Span span, attrs=None):
'''Mark a span for merging. The attrs will be applied to the resulting
token.'''
def merge(self, Span span, attrs=SimpleFrozenDict()):
"""Mark a span for merging. The attrs will be applied to the resulting
token.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span.start_char, span.end_char, attrs))
def split(self, Token token, orths, attrs=None):
'''Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.'''
def split(self, Token token, orths, attrs=SimpleFrozenDict()):
"""Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs))
def __enter__(self):
@@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
# Clear the cached Python objects
# Return the merged Python object
return doc[start]
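
Changing the default from attrs=None to attrs=SimpleFrozenDict() lets merge() and split() pass the default straight into intify_attrs() without a None check, while any accidental write to the shared default raises E095 instead of leaking state. A sketch of the difference, using hypothetical function names:

def merge_with_none(span, attrs=None):
    if attrs is None:          # extra guard needed on every call path
        attrs = {}
    return dict(attrs)

def merge_with_frozen(span, attrs=SimpleFrozenDict()):
    return dict(attrs)         # safe: mutating the default would raise E095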


@@ -635,3 +635,18 @@ def use_gpu(gpu_id):
def fix_random_seed(seed=0):
random.seed(seed)
numpy.random.seed(seed)
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
dictionary). Will raise an error if user or spaCy attempts to add to dict.
"""
def __setitem__(self, key, value):
raise NotImplementedError(Errors.E095)
def pop(self, key, default=None):
raise NotImplementedError(Errors.E095)
def update(self, other):
raise NotImplementedError(Errors.E095)
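
A short usage sketch (hypothetical snippet, not part of the commit) showing how the frozen default behaves:

frozen = SimpleFrozenDict()

assert len(frozen) == 0                 # reads work like a normal empty dict
assert frozen.get("LEMMA") is None

try:
    frozen["LEMMA"] = "boys"            # any write raises with the E095 message
except NotImplementedError:
    pass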