* Check for errors in parser, and parallelise the left-over batch

2025-12-16 22:54:18 +03:00 · 2016-02-06 10:06:13 +01:00 · 2016-02-06 10:06:13 +01:00 · 1b41f868d2
commit 1b41f868d2
parent 031b00cb91
2 changed files with 22 additions and 12 deletions
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -12,9 +12,8 @@ from ._state cimport StateC
 cdef class ParserModel(AveragedPerceptron):
    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
 cdef class Parser:
    cdef readonly ParserModel model
    cdef readonly TransitionSystem moves
-    cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
+    cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -123,29 +123,39 @@ cdef class Parser:
        cdef int i
        cdef int nr_class = self.moves.n_moves
        cdef int nr_feat = self.model.nr_feat
        cdef int status
        queue = []
        for doc in stream:
            doc_ptr[len(queue)] = doc.c
            lengths[len(queue)] = doc.length
            queue.append(doc)
            if len(queue) == batch_size:
-                for i in cython.parallel.prange(batch_size, nogil=True,
+                with nogil:
-                                                num_threads=n_threads):
+                    for i in cython.parallel.prange(batch_size, num_threads=n_threads):
-                    self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
+                        status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
                        if status != 0:
                            with gil:
                                sent_str = queue[i].text
                                raise ValueError("Error parsing doc: %s" % sent_str)
                PyErr_CheckSignals()
                for doc in queue:
                    doc.is_parsed = True
                    yield doc
                queue = []
        batch_size = len(queue)
-        for i in range(batch_size):
+        with nogil:
-            self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
+            for i in cython.parallel.prange(batch_size, num_threads=n_threads):
                status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
                if status != 0:
                    with gil:
                        sent_str = queue[i].text
                        raise ValueError("Error parsing doc: %s" % sent_str)
        for doc in queue:
            doc.is_parsed = True
            yield doc
        PyErr_CheckSignals()
-    cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
+    cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
        cdef ExampleC eg
        eg.nr_feat = nr_feat
        eg.nr_atom = CONTEXT_SIZE
@@ -168,7 +178,7 @@ cdef class Parser:
            if not eg.is_valid[guess]:
                with gil:
                    move_name = self.moves.move_name(action.move, action.label)
-                    raise ValueError("Illegal action: %s" % move_name)
+                    return 1
            action.do(state, action.label)
            memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
            for i in range(eg.nr_class):
@@ -181,6 +191,7 @@ cdef class Parser:
        free(eg.atoms)
        free(eg.scores)
        free(eg.is_valid)
        return 0
    def train(self, Doc tokens, GoldParse gold):
        self.moves.preprocess_gold(gold)