Tidy up pipeline

ines 2017-10-27 20:29:08 +02:00
parent b4d226a3f1
commit ba5e646219


@@ -3,26 +3,17 @@
 # coding: utf8
 from __future__ import unicode_literals

-from thinc.api import chain, layerize, with_getitem
 import numpy
 cimport numpy as np
 import cytoolz
-import util
 from collections import OrderedDict
 import ujson
 import msgpack

-from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
-from thinc.i2v import HashEmbed
-from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
-from thinc.t2t import ExtractWindow, ParametricAttention
-from thinc.misc import Residual
-from thinc.misc import BatchNorm as BN
-from thinc.misc import LayerNorm as LN
+from thinc.api import chain
+from thinc.v2v import Softmax
+from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

 from .tokens.doc cimport Doc
@@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
 from .syntax import nonproj
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .tagger import Tagger
-from .syntax.stateclass cimport StateClass
-from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
 from .compat import json_dumps
-from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
-from ._ml import rebatch, Tok2Vec, flatten
-from ._ml import build_text_classifier, build_tagger_model
-from ._ml import link_vectors_to_models
+from .attrs import POS
 from .parts_of_speech import X
+from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
+from . import util


 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
-    (that doesn't require the dependency parse).
-
-    To change the sentence boundary detection strategy, pass a generator
-    function `strategy` on initialization, or assign a new strategy to
-    the .strategy attribute.
+    (that doesn't require the dependency parse). To change the sentence
+    boundary detection strategy, pass a generator function `strategy` on
+    initialization, or assign a new strategy to the .strategy attribute.

     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
     """
@@ -84,6 +69,10 @@ class SentenceSegmenter(object):

 class Pipe(object):
+    """This class is not instantiated directly. Components inherit from it, and
+    it defines the interface that components should follow to function as
+    components in a spaCy analysis pipeline.
+    """
     name = None

     @classmethod
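
The new docstring positions `Pipe` as the base interface for pipeline components. A hypothetical component following that interface might look like this (the class and its behaviour are purely illustrative):

    class DocNoopComponent(Pipe):
        """Toy sketch: the minimal surface a Pipe subclass provides."""
        name = 'noop'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def __call__(self, doc):
            # Real components call self.predict() and self.set_annotations();
            # this stub just returns the Doc unchanged.
            return doc
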
@@ -149,8 +138,7 @@ class Pipe(object):
         link_vectors_to_models(self.vocab)

     def use_params(self, params):
-        """Modify the pipe's model, to use the given parameter values.
-        """
+        """Modify the pipe's model, to use the given parameter values."""
         with self.model.use_params(params):
             yield
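
Note that each pipe's `use_params` is a bare generator; in practice these are driven by `Language.use_params`, which is a real context manager that swaps in the given weights for the duration of the block and restores the originals on exit. Typical training-loop usage, assuming a Thinc optimizer exposing `averages` and a hypothetical `evaluate` helper:

    with nlp.use_params(optimizer.averages):
        # All pipes now run with averaged weights; the originals are
        # restored when the block exits.
        scores = evaluate(nlp, dev_docs)
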
@@ -235,8 +223,8 @@ class Tensorizer(Pipe):
         """Construct a new statistical model. Weights are not allocated on
         initialisation.

-        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
-            instance with the `Doc` objects it will process.
+        vocab (Vocab): A `Vocab` instance. The model must share the same
+            `Vocab` instance with the `Doc` objects it will process.
         model (Model): A `Model` instance or `True` allocate one later.
         **cfg: Config parameters.
@@ -280,7 +268,7 @@ class Tensorizer(Pipe):
         """Return a single tensor for a batch of documents.

         docs (iterable): A sequence of `Doc` objects.
-        RETURNS (object): Vector representations for each token in the documents.
+        RETURNS (object): Vector representations for each token in the docs.
         """
         tokvecs = self.model(docs)
         return tokvecs
@@ -289,7 +277,7 @@ class Tensorizer(Pipe):
         """Set the tensor attribute for a batch of documents.

         docs (iterable): A sequence of `Doc` objects.
-        tokvecs (object): Vector representation for each token in the documents.
+        tokvecs (object): Vector representation for each token in the docs.
         """
         for doc, tokvecs in zip(docs, tokvecses):
             assert tokvecs.shape[0] == len(doc)
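
Together, `predict` and `set_annotations` split the work of `Pipe.pipe()`: one computes, the other writes back. A sketch of the flow, assuming a `tensorizer` whose model is already allocated and per-`Doc` arrays as `set_annotations` expects:

    tokvecses = tensorizer.predict(docs)         # one array per Doc
    tensorizer.set_annotations(docs, tokvecses)  # writes doc.tensor, per the docstring
    for doc in docs:
        assert doc.tensor.shape[0] == len(doc)   # one row per token
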
@@ -328,12 +316,14 @@ class Tensorizer(Pipe):

 class Tagger(Pipe):
     name = 'tagger'

     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])

     def __call__(self, doc):
         tags = self.predict([doc])
@@ -353,8 +343,7 @@ class Tagger(Pipe):
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
-        guesses = self.model.ops.unflatten(guesses,
-                                           [len(d) for d in docs])
+        guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
         return guesses

     def set_annotations(self, docs, batch_tag_ids):
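
`ops.unflatten` splits the flat per-token array back into one array per `Doc`, using the document lengths. The same operation in plain numpy, for illustration:

    import numpy

    flat = numpy.arange(7)      # 7 tag guesses for a two-doc batch
    lengths = [3, 4]            # [len(d) for d in docs]
    unflat, start = [], 0
    for n in lengths:
        unflat.append(flat[start:start + n])
        start += n
    # unflat == [array([0, 1, 2]), array([3, 4, 5, 6])]
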
@@ -387,8 +376,8 @@ class Tagger(Pipe):

     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
+        tag_index = {tag: i
+                     for i, tag in enumerate(self.vocab.morphology.tag_names)}
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
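
The body of `get_loss` (below this hunk) fills `correct` with gold tag indices and computes the gradient as the scores minus a one-hot target, using the `to_categorical` imported at the top of the file. In outline, as a standalone sketch:

    import numpy
    from thinc.neural.util import to_categorical

    scores = numpy.asarray([[0.7, 0.2, 0.1],
                            [0.1, 0.8, 0.1]], dtype='f')
    correct = numpy.asarray([0, 2], dtype='i')   # gold tag indices
    d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
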
@@ -443,17 +432,18 @@ class Tagger(Pipe):
         serialize['model'] = self.model.to_bytes
         serialize['vocab'] = self.vocab.to_bytes
-        serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
-                                                     use_bin_type=True,
-                                                     encoding='utf8')
+        serialize['tag_map'] = lambda: msgpack.dumps(
+            self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width',
-                        self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+                token_vector_width = util.env_opt(
+                    'token_vector_width',
+                    self.cfg.get('token_vector_width', 128))
+                self.model = self.Model(self.vocab.morphology.n_tags,
+                                        **self.cfg)
             self.model.from_bytes(b)

         def load_tag_map(b):
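
The `tag_map` entry round-trips through msgpack. A sketch in isolation, using the msgpack-python keywords of this era (the `encoding` argument was later deprecated in favour of `raw`):

    import msgpack

    tag_map = {'NN': {'pos': 'NOUN'}}
    data = msgpack.dumps(tag_map, use_bin_type=True, encoding='utf8')
    assert msgpack.loads(data, encoding='utf8') == tag_map
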
@@ -509,11 +499,11 @@ class Tagger(Pipe):

 class MultitaskObjective(Tagger):
-    '''Assist training of a parser or tagger, by training a side-objective.
-
-    Experimental
-    '''
+    """Experimental: Assist training of a parser or tagger, by training a
+    side-objective.
+    """
     name = 'nn_labeller'

     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
@@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
         elif hasattr(target, '__call__'):
             self.make_label = target
         else:
-            raise ValueError(
-                "MultitaskObjective target should be function or one of "
-                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
+            raise ValueError("MultitaskObjective target should be function or "
+                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])

     @property
     def labels(self):
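
As the error message says, `target` may also be a callable, which becomes `self.make_label`. A sketch of a custom target, given a `vocab` and assuming the per-token `(i, words, tags, heads, deps, ents)` signature used by the built-in label makers:

    def make_coarse_dep(i, words, tags, heads, deps, ents):
        # Side-objective label: the coarse part of the dependency label.
        if deps[i] is None:
            return None
        return deps[i].split('||')[0]

    labeller = MultitaskObjective(vocab, target=make_coarse_dep)
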
@@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):

 class SimilarityHook(Pipe):
     """
-    Experimental
-
-    A pipeline component to install a hook for supervised similarity into
-    Doc objects. Requires a Tensorizer to pre-process documents. The similarity
-    model can be any object obeying the Thinc Model interface. By default,
-    the model concatenates the elementwise mean and elementwise max of the two
-    tensors, and compares them using the Cauchy-like similarity function
-    from Chen (2013):
-
-        similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
+    Experimental: A pipeline component to install a hook for supervised
+    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
+    documents. The similarity model can be any object obeying the Thinc `Model`
+    interface. By default, the model concatenates the elementwise mean and
+    elementwise max of the two tensors, and compares them using the
+    Cauchy-like similarity function from Chen (2013):
+
+    >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

     Where W is a vector of dimension weights, initialized to 1.
     """
     name = 'similarity'

     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
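
The docstring's Cauchy-like similarity is easy to check in plain numpy: identical vectors score 1.0, and the score decays toward 0 as the weighted squared distance grows.

    import numpy

    W = numpy.ones((4,))                    # dimension weights, initialized to 1
    vec1 = numpy.asarray([1., 0., 2., 1.])
    vec2 = numpy.asarray([1., 1., 2., 0.])
    similarity = 1. / (1. + (W * (vec1 - vec2) ** 2).sum())
    # weighted sum of squares here is 2.0, so similarity == 1/3
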
@@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

     def begin_training(self, _=tuple(), pipeline=None):
-        """
-        Allocate model, using width from tensorizer in pipeline.
+        """Allocate model, using width from tensorizer in pipeline.

         gold_tuples (iterable): Gold-standard training data.
         pipeline (list): The pipeline the model is part of.
@@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model),
+                None, None)


 cdef class EntityRecognizer(Parser):
@@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model),
+                None, None)


 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
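
Both `__reduce__` implementations exist because pickle cannot introspect these Cython-backed extension types; returning `(cls, args, None, None)` tells pickle to rebuild the object by calling the class with `(vocab, moves, model)`. Usage sketch, assuming a trained `parser`:

    import pickle

    data = pickle.dumps(parser)
    parser2 = pickle.loads(data)  # calls DependencyParser(vocab, moves, model)
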