Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2026-02-18 21:20:59 +03:00 · 2017-08-22 19:00:43 +02:00 · 2017-08-22 19:00:43 +02:00 · df2745eb08
commit df2745eb08
parent 5b329acbf2 1fe605dfe5
4 changed files with 60 additions and 19 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -239,7 +239,7 @@ def Tok2Vec(width, embed_size, preprocess=None):
                >> uniqued(embed, column=5)
                >> drop_layer(
                    Residual(
-                        (ExtractWindow(nW=1) >> BN(Maxout(width, width*3)))
+                        (ExtractWindow(nW=1) >> ReLu(width, width*3))
                    )
                ) ** 4, pad=4
            )
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -232,7 +232,10 @@ for verb_data in [
    {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
    {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
    {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
+    {ORTH: "have", NORM: "have"},
+    {ORTH: "has", LEMMA: "have", NORM: "has"},
+    {ORTH: "dare", NORM: "dare"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -14,4 +14,8 @@ cdef class Parser:
    cdef readonly TransitionSystem moves
    cdef readonly object cfg

+    cdef void _parse_step(self, StateC* state,
+            const float* feat_weights,
+            int nr_class, int nr_feat, int nr_piece) nogil
+
    #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -257,10 +257,15 @@ cdef class Parser:
                        nI=token_vector_width)

        with Model.use_device('cpu'):
-            upper = chain(
-                clone(Maxout(hidden_width), (depth-1)),
-                zero_init(Affine(nr_class, drop_factor=0.0))
-            )
+            if depth == 0:
+                upper = chain()
+                upper.is_noop = True
+            else:
+                upper = chain(
+                    clone(Maxout(hidden_width), (depth-1)),
+                    zero_init(Affine(nr_class, drop_factor=0.0))
+                )
+                upper.is_noop = False
        # TODO: This is an unfortunate hack atm!
        # Used to set input dimensions in network.
        lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@@ -412,20 +417,27 @@ cdef class Parser:
        cdef np.ndarray scores
        c_token_ids = <int*>token_ids.data
        c_is_valid = <int*>is_valid.data
+        cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
        while not next_step.empty():
-            for i in range(next_step.size()):
-                st = next_step[i]
-                st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
-                self.moves.set_valid(&c_is_valid[i*nr_class], st)
-            vectors = state2vec(token_ids[:next_step.size()])
-            scores = vec2scores(vectors)
-            c_scores = <float*>scores.data
-            for i in range(next_step.size()):
-                st = next_step[i]
-                guess = arg_max_if_valid(
-                    &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
-                action = self.moves.c[guess]
-                action.do(st, action.label)
+            if not has_hidden:
+                for i in cython.parallel.prange(
+                        next_step.size(), num_threads=6, nogil=True):
+                    self._parse_step(next_step[i],
+                        feat_weights, nr_class, nr_feat, nr_piece)
+            else:
+                for i in range(next_step.size()):
+                    st = next_step[i]
+                    st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
+                    self.moves.set_valid(&c_is_valid[i*nr_class], st)
+                vectors = state2vec(token_ids[:next_step.size()])
+                scores = vec2scores(vectors)
+                c_scores = <float*>scores.data
+                for i in range(next_step.size()):
+                    st = next_step[i]
+                    guess = arg_max_if_valid(
+                        &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
+                    action = self.moves.c[guess]
+                    action.do(st, action.label)
            this_step, next_step = next_step, this_step
            next_step.clear()
            for st in this_step:
@@ -482,6 +494,28 @@ cdef class Parser:
            beams.append(beam)
        return beams

+    cdef void _parse_step(self, StateC* state,
+            const float* feat_weights,
+            int nr_class, int nr_feat, int nr_piece) nogil:
+        '''This only works with no hidden layers -- fast but inaccurate'''
+        # Applies one transition to `state`; called per state from the nogil prange
+        # loop used when vec2scores is a no-op. NOTE(review): calloc is not NULL-checked.
+        token_ids = <int*>calloc(nr_feat, sizeof(int))  # scratch: nr_feat zeroed ints
+        scores = <float*>calloc(nr_class * nr_piece, sizeof(float))  # scratch: one float per (class, piece)
+        is_valid = <int*>calloc(nr_class, sizeof(int))  # scratch: nr_class zeroed ints
+
+        state.set_context_tokens(token_ids, nr_feat)  # presumably fills token_ids for this state -- confirm
+        sum_state_features(scores,
+            feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)  # batch size 1; presumably sums weights into scores
+        self.moves.set_valid(is_valid, state)  # presumably marks which of the nr_class moves are allowed -- confirm
+        guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)  # index into self.moves.c
+        action = self.moves.c[guess]
+        action.do(state, action.label)  # apply the chosen transition to the state
+
+        free(is_valid)  # release the three per-call scratch buffers
+        free(scores)
+        free(token_ids)
+
    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
        if not any(self.moves.has_gold(gold) for gold in golds):
            return None