Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
Restore changes to pipeline.pyx from nn-beam-parser branch
This commit is contained in:
Parent: 931509d96a
Commit: ec482580b5
spacy/pipeline.pyx

@@ -42,7 +42,7 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
-from ._ml import build_text_classifier
+from ._ml import build_text_classifier, build_tagger_model
 from .parts_of_speech import X
 
 
@@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent):
     name = 'tensorizer'
 
     @classmethod
-    def Model(cls, width=128, embed_size=7500, **cfg):
+    def Model(cls, width=128, embed_size=4000, **cfg):
         """Create a new statistical model for the class.
 
         width (int): Output size of the model.
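Note: the tensorizer's default embed_size drops from 7500 to 4000 rows in the embedding table, shrinking the default model. A back-of-the-envelope sketch of the difference, assuming a single float32 table of embed_size x width (a simplification of the real model):

width = 128
for embed_size in (7500, 4000):
    n_params = embed_size * width  # rows x output width
    print(embed_size, n_params, '%.1f MB' % (n_params * 4 / 1e6))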
@@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
         self.cfg = dict(cfg)
 
     def __call__(self, doc):
-        tags = self.predict([doc.tensor])
+        tags = self.predict(([doc], [doc.tensor]))
         self.set_annotations([doc], tags)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict(tokvecs)
+            tag_ids = self.predict((docs, tokvecs))
             self.set_annotations(docs, tag_ids)
             yield from docs
 
-    def predict(self, tokvecs):
-        scores = self.model(tokvecs)
+    def predict(self, docs_tokvecs):
+        scores = self.model(docs_tokvecs)
         scores = self.model.ops.flatten(scores)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
+        tokvecs = docs_tokvecs[1]
         guesses = self.model.ops.unflatten(guesses,
                                            [tv.shape[0] for tv in tokvecs])
         return guesses
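Note: predict() now takes a (docs, tokvecs) pair rather than a bare list of tensors, so the Doc objects travel alongside their vectors; the per-doc tensor lengths are what unflatten the model's flattened guesses back into one array per doc. A minimal standalone sketch of that unflatten step (the helper below is an illustration, not spaCy's ops.unflatten):

import numpy

def unflatten(flat, lengths):
    # Split a flat batch back into per-doc arrays by token counts.
    out, start = [], 0
    for n in lengths:
        out.append(flat[start:start + n])
        start += n
    return out

tokvecs = [numpy.zeros((4, 128)), numpy.zeros((7, 128))]  # two "docs"
flat_guesses = numpy.arange(11)                           # 4 + 7 tag ids
per_doc = unflatten(flat_guesses, [tv.shape[0] for tv in tokvecs])
assert [len(g) for g in per_doc] == [4, 7]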
@@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent):
         cdef Vocab vocab = self.vocab
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, 'get'):
+                doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
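Note: the new hasattr(doc_tag_ids, 'get') guard copies GPU arrays back to the host before iterating: cupy arrays expose a .get() method that returns a numpy copy, while numpy arrays do not. A standalone sketch of the same duck-typing check (names here are illustrative):

import numpy

def to_host(array):
    if hasattr(array, 'get'):
        return array.get()  # cupy: copy device memory into a numpy array
    return array            # numpy: already on the host, pass through

print(to_host(numpy.array([1, 2, 3])))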
@@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent):
 
         if self.model.nI is None:
             self.model.nI = tokvecs[0].shape[1]
-
-        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
 
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
@@ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent):
 
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)
 
     def use_params(self, params):
         with self.model.use_params(params):
             yield
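Note: both NeuralTagger and NeuralLabeller now delegate model construction to the shared _ml.build_tagger_model factory instead of defining the network inline. A rough sketch of what such a factory might wrap, inferred only from the removed inline code; the import paths and the real body of _ml.build_tagger_model are assumptions and may differ from the actual commit:

# Inferred sketch, assuming Thinc v6-style layer imports.
from thinc.api import chain, with_flatten
from thinc.neural import Maxout, Softmax

def build_tagger_model(n_tags, token_vector_width):
    # Same layers the classes previously built inline: a Maxout hidden
    # layer per token vector, then a Softmax over the tag set, mapped
    # over variable-length sequences via with_flatten.
    return with_flatten(
        chain(Maxout(token_vector_width, token_vector_width),
              Softmax(n_tags, token_vector_width)))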
@@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger):
 
     @property
     def labels(self):
-        return self.cfg.get('labels', {})
+        return self.cfg.setdefault('labels', {})
 
     @labels.setter
    def labels(self, value):
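Note: switching from cfg.get to cfg.setdefault is a behavioral fix, not a style change: .get hands back a fresh {} that is never stored in cfg, so labels added through the property were silently lost, while .setdefault stores the dict on first access so mutations persist. A standalone illustration:

cfg = {}

cfg.get('labels', {})['dep'] = 0         # mutates a throwaway dict
assert 'labels' not in cfg               # ...so the label is lost

cfg.setdefault('labels', {})['dep'] = 0  # stores the dict in cfg first
assert cfg['labels'] == {'dep': 0}       # ...so the label persists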
@@ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger):
 
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
         cdef int idx = 0
@@ -654,6 +653,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
 
     nr_feature = 6
 
+    def predict_confidences(self, docs):
+        tensors = [d.tensor for d in docs]
+        samples = []
+        for i in range(10):
+            states = self.parse_batch(docs, tensors, drop=0.3)
+            for state in states:
+                samples.append(self._get_entities(state))
+
     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
 
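Note: the new predict_confidences runs the parser ten times with dropout left on (drop=0.3) and collects the entities from each stochastic pass, i.e. Monte Carlo dropout sampling. As committed it builds the samples but does not yet return or aggregate them. A hedged sketch of how such samples could be turned into per-entity confidences; this aggregation step is an assumption, not part of the commit:

from collections import Counter

def confidences_from_samples(samples, n_samples):
    # An entity's confidence is the fraction of dropout passes
    # that produced it.
    counts = Counter(ent for sample in samples for ent in sample)
    return {ent: count / float(n_samples) for ent, count in counts.items()}

samples = [{('ORG', 0, 2)}, {('ORG', 0, 2)}, {('PERSON', 3, 4)}]
print(confidences_from_samples(samples, 3))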