Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-10-20 02:34:33 +03:00 · 2018-05-20 18:59:24 +02:00 · 2018-05-20 18:59:24 +02:00 · bdc23dd8c1
commit bdc23dd8c1
parent 401213fb1f 5401c55c75
6 changed files with 54 additions and 19 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -245,6 +245,8 @@ class Errors(object):
            "the meta.json. Vector names are required to avoid issue #1660.")
    E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
    E094 = ("Error reading line {line_num} in vectors file {loc}.")
    E095 = ("Can't write to frozen dictionary. This is likely an internal "
            "error. Are you writing to a default function argument?")
@add_codes
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@ -19,12 +19,10 @@ cdef struct WeightsC:
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
    const float* vectors
 cdef struct ActivationsC:
    int* token_ids
    float* vectors
    float* unmaxed
    float* scores
    float* hiddens
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@ -50,8 +50,6 @@ cdef WeightsC get_c_weights(model) except *:
    cdef np.ndarray vec2scores_b = model.vec2scores.b
    output.hidden_weights = <const float*>vec2scores_W.data
    output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray tokvecs = model.tokvecs
    output.vectors = <float*>tokvecs.data
    return output
@ -72,7 +70,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
@ -81,8 +78,6 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.vectors = <float*>realloc(A.vectors,
            n.states * n.embed_width * sizeof(A.vectors[0]))
        A.scores = <float*>realloc(A.scores,
            n.states * n.classes * sizeof(A.scores[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
@ -242,7 +237,7 @@ class ParserStepModel(Model):
    def begin_update(self, states, drop=0.):
        token_ids = self.get_token_ids(states)
        vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
-        mask = self.ops.get_dropout_mask(vector.shape, drop)
+        mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
        if mask is not None:
            vector *= mask
        scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
@ -251,7 +246,7 @@ class ParserStepModel(Model):
            d_vector = get_d_vector(d_scores, sgd=sgd)
            if mask is not None:
                d_vector *= mask
-            if isinstance(self.ops, CupyOps) \
+            if isinstance(self.state2vec.ops, CupyOps) \
            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
                # Move token_ids and d_vector to GPU, asynchronously
                self.backprops.append((
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
 from ...attrs import LEMMA
 import pytest
 import numpy
@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
    doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
 def test_doc_api_retokenizer(en_tokenizer):
    """Merge the span doc[4:7] through the doc.retokenize() context manager
    and check the resulting token count and the text of the merged token.
    """
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7])
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
 def test_doc_api_retokenizer_attrs(en_tokenizer):
    """Merge the span doc[4:7] with an attrs dict and check that the lemma
    and entity type given in attrs show up on the merged token.
    """
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    # test both string and integer attributes and values
    org_id = doc.vocab.strings['ORG']
    attrs = {LEMMA: 'boys', 'ENT_TYPE': org_id}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7], attrs=attrs)
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
    assert merged.lemma_ == 'boys'
    assert merged.ent_type_ == 'ORG'
 def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
    doc.is_parsed = True
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
 from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
    cdef Doc doc
    cdef list merges
    cdef list splits
@ -24,14 +26,18 @@ cdef class Retokenizer:
        self.merges = []
        self.splits = []
-    def merge(self, Span span, attrs=None):
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
-        '''Mark a span for merging. The attrs will be applied to the resulting
+        """Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+        token.
        """
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.merges.append((span.start_char, span.end_char, attrs))
-    def split(self, Token token, orths, attrs=None):
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
-        '''Mark a Token for splitting, into the specified orths. The attrs
+        """Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+        will be applied to each subtoken.
        """
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.splits.append((token.start_char, orths, attrs))
    def __enter__(self):
@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
    # Clear the cached Python objects
    # Return the merged Python object
    return doc[start]
--- a/spacy/util.py
+++ b/spacy/util.py
@ -635,3 +635,18 @@ def use_gpu(gpu_id):
 def fix_random_seed(seed=0):
    """Seed both the stdlib `random` module and `numpy.random` with `seed`.

    seed (int): The seed value passed to both generators. Defaults to 0.
    """
    for set_seed in (random.seed, numpy.random.seed):
        set_seed(seed)
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to dict.

    Initial contents can still be supplied through the constructor. After
    that, every mutating method defined here raises NotImplementedError with
    the Errors.E095 message. Read access is inherited from dict unchanged.
    """
    def __setitem__(self, key, value):
        raise NotImplementedError(Errors.E095)

    def __delitem__(self, key):
        raise NotImplementedError(Errors.E095)

    def setdefault(self, key, default=None):
        # dict.setdefault inserts without going through __setitem__, so it
        # has to be blocked explicitly or it silently adds to the dict.
        raise NotImplementedError(Errors.E095)

    def pop(self, key, default=None):
        raise NotImplementedError(Errors.E095)

    def popitem(self):
        raise NotImplementedError(Errors.E095)

    def clear(self):
        raise NotImplementedError(Errors.E095)

    def update(self, other=(), **kwargs):
        # Accept the full dict.update() calling convention so keyword-only
        # and zero-argument calls get the same E095 error, not a TypeError.
        raise NotImplementedError(Errors.E095)