From e33b7e0b3c8f7a205e093ff481a8d6bc6b402eb9 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 14:39:30 +0200 Subject: [PATCH] Tidy up parser and ML --- spacy/_ml.py | 295 +++++++++---------------------------- spacy/syntax/nn_parser.pyx | 59 +++----- 2 files changed, 94 insertions(+), 260 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4c4e36412..89324b3b3 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,47 +1,42 @@ -import ujson -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool +from thinc.t2v import Pooling, sum_pool from thinc.misc import Residual -from thinc.misc import BatchNorm as BN from thinc.misc import LayerNorm as LN from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import FeatureExtracter, with_getitem -from thinc.api import uniqued, wrap, flatten_add_lengths, noop +from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths +from thinc.api import uniqued, wrap, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module -import random -import cytoolz - from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER -from .tokens.doc import Doc +from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE from . import util -import numpy -import io - -# TODO: Unset this once we don't want to support models previous models. -import thinc.neural._classes.layernorm -thinc.neural._classes.layernorm.set_compat_six_eight(False) VECTORS_KEY = 'spacy_pretrained_vectors' + @layerize def _flatten_add_lengths(seqs, pad=0, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=pad) + X = ops.flatten(seqs, pad=pad) return (X, lengths), finish_update @@ -55,33 +50,14 @@ def _logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd -@layerize -def add_tuples(X, drop=0.): - """Give inputs of sequence pairs, where each sequence is (vals, length), - sum the values, returning a single sequence. - - If input is: - ((vals1, length), (vals2, length) - Output is: - (vals1+vals2, length) - - vals are a single tensor for the whole batch. 
- """ - (vals1, length1), (vals2, length2) = X - assert length1 == length2 - - def add_tuples_bwd(dY, sgd=None): - return (dY, dY) - - return (vals1+vals2, length), add_tuples_bwd - - def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -115,13 +91,12 @@ def _init_for_precomputed(W, ops): nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nI), - lambda W, ops: _init_for_precomputed(W, ops)), + lambda obj: (obj.nF, obj.nO, obj.nI), + lambda W, ops: _init_for_precomputed(W, ops)), b=Biases("Bias vector", - lambda obj: (obj.nO,)), + lambda obj: (obj.nO,)), d_W=Gradient("W"), - d_b=Gradient("b") -) + d_b=Gradient("b")) class PrecomputableAffine(Model): def __init__(self, nO=None, nI=None, nF=None, **kwargs): Model.__init__(self, **kwargs) @@ -134,18 +109,19 @@ class PrecomputableAffine(Model): # Yf: (b, f, i) # dY: (b, o) # dYf: (b, f, o) - #Yf = numpy.einsum('bi,foi->bfo', X, self.W) + # Yf = numpy.einsum('bi,foi->bfo', X, self.W) Yf = self.ops.xp.tensordot( X, self.W, axes=[[1], [2]]) Yf += self.b + def backward(dY_ids, sgd=None): tensordot = self.ops.xp.tensordot dY, ids = dY_ids Xf = X[ids] - #dXf = numpy.einsum('bo,foi->bfi', dY, self.W) + # dXf = numpy.einsum('bo,foi->bfi', dY, self.W) dXf = tensordot(dY, self.W, axes=[[1], [1]]) - #dW = numpy.einsum('bo,bfi->ofi', dY, Xf) + # dW = numpy.einsum('bo,bfi->ofi', dY, Xf) dW = tensordot(dY, Xf, axes=[[0], [0]]) # ofi -> foi self.d_W += dW.transpose((1, 0, 2)) @@ -154,6 +130,7 @@ class PrecomputableAffine(Model): if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return dXf + return Yf, backward @@ -164,13 +141,12 @@ class PrecomputableAffine(Model): nP=Dimension("Number of pieces"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), - lambda W, ops: ops.xavier_uniform_init(W)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), + lambda W, ops: ops.xavier_uniform_init(W)), b=Biases("Bias vector", - lambda obj: (obj.nO, obj.nP)), + lambda obj: (obj.nO, obj.nP)), d_W=Gradient("W"), - d_b=Gradient("b") -) + d_b=Gradient("b")) class PrecomputableMaxouts(Model): def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): Model.__init__(self, **kwargs) @@ -186,114 +162,26 @@ class PrecomputableMaxouts(Model): # dYp: (b, o, p) # W: (f, o, p, i) # b: (o, p) - # bi,opfi->bfop # bop,fopi->bfi # bop,fbi->opfi : fopi - tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - Yfp = tensordot(X, self.W, axes=[[1], [3]]) Yfp += self.b def backward(dYp_ids, sgd=None): dYp, ids = dYp_ids Xf = X[ids] - - dXf = tensordot(dYp, self.W, axes=[[1, 2], [1,2]]) + dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]]) dW = tensordot(dYp, Xf, axes=[[0], [0]]) - self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dYp.sum(axis=0) - if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return dXf + return Yfp, backward -# Thinc's Embed class is a bit broken atm, so drop this here. 
-from thinc import describe -from thinc.neural._classes.embed import _uniform_init - - -@describe.attributes( - nV=describe.Dimension("Number of vectors"), - nO=describe.Dimension("Size of output"), - vectors=describe.Weights("Embedding table", - lambda obj: (obj.nV, obj.nO), - _uniform_init(-0.1, 0.1) - ), - d_vectors=describe.Gradient("vectors") -) -class Embed(Model): - name = 'embed' - - def __init__(self, nO, nV=None, **kwargs): - if nV is not None: - nV += 1 - Model.__init__(self, **kwargs) - if 'name' in kwargs: - self.name = kwargs['name'] - self.column = kwargs.get('column', 0) - self.nO = nO - self.nV = nV - - def predict(self, ids): - if ids.ndim == 2: - ids = ids[:, self.column] - return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - - def begin_update(self, ids, drop=0.): - if ids.ndim == 2: - ids = ids[:, self.column] - vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - def backprop_embed(d_vectors, sgd=None): - n_vectors = d_vectors.shape[0] - self.ops.scatter_add(self.d_vectors, ids, d_vectors) - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return None - return vectors, backprop_embed - - -def HistoryFeatures(nr_class, hist_size=8, nr_dim=8): - '''Wrap a model, adding features representing action history.''' - if hist_size == 0: - return layerize(noop()) - embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d') - for i in range(hist_size)] - embed = chain(concatenate(*embed_tables), - LN(Maxout(hist_size*nr_dim, hist_size*nr_dim))) - ops = embed.ops - def add_history_fwd(vectors_hists, drop=0.): - vectors, hist_ids = vectors_hists - hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop) - outputs = ops.xp.hstack((vectors, hist_feats)) - - def add_history_bwd(d_outputs, sgd=None): - d_vectors = d_outputs[:, :vectors.shape[1]] - d_hists = d_outputs[:, vectors.shape[1]:] - bp_hists(d_hists, sgd=sgd) - return embed.ops.xp.ascontiguousarray(d_vectors) - return outputs, add_history_bwd - return wrap(add_history_fwd, embed) - - -def drop_layer(layer, factor=2.): - def drop_layer_fwd(X, drop=0.): - if drop <= 0.: - return layer.begin_update(X, drop=drop) - else: - coinflip = layer.ops.xp.random.random() - if (coinflip / factor) >= drop: - return layer.begin_update(X, drop=drop) - else: - return X, lambda dX, sgd=None: dX - - model = wrap(drop_layer_fwd, layer) - model.predict = layer - return model def link_vectors_to_models(vocab): vectors = vocab.vectors @@ -308,16 +196,21 @@ def link_vectors_to_models(vocab): # (unideal, I know) thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data + def Tok2Vec(width, embed_size, **kwargs): pretrained_dims = kwargs.get('pretrained_dims', 0) cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, - '*': reapply}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') - prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') - suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') - shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') + with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, + '+': add, '*': reapply}): + norm = HashEmbed(width, embed_size, column=cols.index(NORM), + name='embed_norm') + prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), + 
name='embed_prefix') + suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), + name='embed_suffix') + shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), + name='embed_shape') if pretrained_dims is not None and pretrained_dims >= 1: glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) @@ -329,7 +222,6 @@ def Tok2Vec(width, embed_size, **kwargs): (norm | prefix | suffix | shape) >> LN(Maxout(width, width*4, pieces=3)), column=5) - convolution = Residual( ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) @@ -354,6 +246,7 @@ def reapply(layer, n_times): Y, backprop = layer.begin_update(X, drop=drop) X = Y backprops.append(backprop) + def reapply_bwd(dY, sgd=None): dX = None for backprop in reversed(backprops): @@ -363,39 +256,20 @@ def reapply(layer, n_times): else: dX += dY return dX + return Y, reapply_bwd return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None return layerize(forward) -def foreach(layer): - def forward(Xs, drop=0.): - results = [] - backprops = [] - for X in Xs: - result, bp = layer.begin_update(X, drop=drop) - results.append(result) - backprops.append(bp) - def backward(d_results, sgd=None): - dXs = [] - for d_result, backprop in zip(d_results, backprops): - dXs.append(backprop(d_result, sgd)) - return dXs - return results, backward - model = layerize(forward) - model._layers.append(layer) - return model - - def rebatch(size, layer): ops = layer.ops + def forward(X, drop=0.): if X.shape[0] < size: return layer.begin_update(X) @@ -403,6 +277,7 @@ def rebatch(size, layer): results, bp_results = zip(*[layer.begin_update(p, drop=drop) for p in parts]) y = ops.flatten(results) + def backward(dy, sgd=None): d_parts = [bp(y, sgd=sgd) for bp, y in zip(bp_results, _divide_array(dy, size))] @@ -413,6 +288,7 @@ def rebatch(size, layer): except ValueError: dX = None return dX + return y, backward model = layerize(forward) model._layers.append(layer) @@ -423,13 +299,14 @@ def _divide_array(X, size): parts = [] index = 0 while index < len(X): - parts.append(X[index : index + size]) + parts.append(X[index:index + size]) index += size return parts def get_col(idx): assert idx >= 0, idx + def forward(X, drop=0.): assert idx >= 0, idx if isinstance(X, numpy.ndarray): @@ -437,30 +314,28 @@ def get_col(idx): else: ops = CupyOps() output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) + def backward(y, sgd=None): assert idx >= 0, idx dX = ops.allocate(X.shape) dX[:, idx] += y return dX + return output, backward + return layerize(forward) -def zero_init(model): - def _hook(self, X, y=None): - self.W.fill(0) - model.on_data_hooks.append(_hook) - return model - - def doc2feats(cols=None): if cols is None: cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + def forward(docs, drop=0.): feats = [] for doc in docs: feats.append(doc.to_array(cols)) return feats, None + model = layerize(forward) model.cols = cols return model @@ -474,28 +349,14 @@ def print_shape(prefix): @layerize def get_token_vectors(tokens_attrs_vectors, drop=0.): - ops = Model.ops tokens, attrs, vectors = tokens_attrs_vectors + def backward(d_output, sgd=None): return (tokens, d_output) + return vectors, backward -@layerize -def flatten(seqs, drop=0.): - if isinstance(seqs[0], numpy.ndarray): - ops = NumpyOps() - elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray): - ops = CupyOps() - else: - raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0])) 
- lengths = [len(seq) for seq in seqs] - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths) - X = ops.xp.vstack(seqs) - return X, finish_update - - @layerize def logistic(X, drop=0.): xp = get_array_module(X) @@ -505,9 +366,11 @@ def logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd @@ -517,6 +380,7 @@ def zero_init(model): model.on_data_hooks.append(_zero_init_impl) return model + @layerize def preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -526,11 +390,13 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None + def getitem(i): def getitem_fwd(X, drop=0.): return X[i], None return layerize(getitem_fwd) + def build_tagger_model(nr_class, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: @@ -555,8 +421,6 @@ def build_tagger_model(nr_class, **cfg): @layerize def SpacyVectors(docs, drop=0.): - xp = get_array_module(docs[0].vocab.vectors.data) - width = docs[0].vocab.vectors.data.shape[1] batch = [] for doc in docs: indices = numpy.zeros((len(doc),), dtype='i') @@ -570,29 +434,6 @@ def SpacyVectors(docs, drop=0.): return batch, None -def foreach(layer, drop_factor=1.0): - '''Map a layer across elements in a list''' - def foreach_fwd(Xs, drop=0.): - drop *= drop_factor - ys = [] - backprops = [] - for X in Xs: - y, bp_y = layer.begin_update(X, drop=drop) - ys.append(y) - backprops.append(bp_y) - def foreach_bwd(d_ys, sgd=None): - d_Xs = [] - for d_y, bp_y in zip(d_ys, backprops): - if bp_y is not None and bp_y is not None: - d_Xs.append(d_y, sgd=sgd) - else: - d_Xs.append(None) - return d_Xs - return ys, foreach_bwd - model = wrap(foreach_fwd, layer) - return model - - def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 5000) pretrained_dims = cfg.get('pretrained_dims', 0) @@ -602,9 +443,7 @@ def build_text_classifier(nr_class, width=64, **cfg): model = ( SpacyVectors >> flatten_add_lengths - >> with_getitem(0, - Affine(width, pretrained_dims) - ) + >> with_getitem(0, Affine(width, pretrained_dims)) >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(ReLu(width, width)) ** 2 @@ -613,7 +452,6 @@ def build_text_classifier(nr_class, width=64, **cfg): ) return model - lower = HashEmbed(width, nr_vector, column=1) prefix = HashEmbed(width//2, nr_vector, column=2) suffix = HashEmbed(width//2, nr_vector, column=3) @@ -671,33 +509,40 @@ def build_text_classifier(nr_class, width=64, **cfg): model.lsuv = False return model + @layerize def flatten(seqs, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=0) + X = ops.flatten(seqs, pad=0) return X, finish_update -def concatenate_lists(*layers, **kwargs): # pragma: no cover - '''Compose two or more models `f`, `g`, etc, such that their outputs are +def concatenate_lists(*layers, **kwargs): # pragma: no cover + """Compose two or more models `f`, `g`, etc, such that their outputs are concatenated, i.e. 
`concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - ''' + """ if not layers: return noop() drop_factor = kwargs.get('drop_factor', 1.0) ops = layers[0].ops layers = [chain(layer, flatten) for layer in layers] concat = concatenate(*layers) + def concatenate_lists_fwd(Xs, drop=0.): drop *= drop_factor lengths = ops.asarray([len(X) for X in Xs], dtype='i') flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) ys = ops.unflatten(flat_y, lengths) + def concatenate_lists_bwd(d_ys, sgd=None): return bp_flat_y(ops.flatten(d_ys), sgd=sgd) + return ys, concatenate_lists_bwd + model = wrap(concatenate_lists_fwd, concat) return model diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c592cdc22..12332ab25 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -49,9 +49,8 @@ from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import Tok2Vec, doc2feats, rebatch -from .._ml import Residual, drop_layer, flatten +from .._ml import Residual, flatten from .._ml import link_vectors_to_models -from .._ml import HistoryFeatures from ..compat import json_dumps, copy_array from .stateclass cimport StateClass @@ -77,7 +76,7 @@ def set_debug(val): cdef class precompute_hiddens: - '''Allow a model to be "primed" by pre-computing input features in bulk. + """Allow a model to be "primed" by pre-computing input features in bulk. This is used for the parser, where we want to take a batch of documents, and compute vectors for each (token, position) pair. These vectors can then @@ -92,7 +91,7 @@ cdef class precompute_hiddens: so we can save the factor k. This also gives a nice CPU/GPU division: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. - ''' + """ cdef int nF, nO, nP cdef bint _is_synchronized cdef public object ops @@ -280,23 +279,19 @@ cdef class Parser: return (tok2vec, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): - """ - Create a Parser. + """Create a Parser. - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - The value is set to the .vocab attribute. - moves (TransitionSystem): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .moves attribute unless True (default), - in which case a new instance is created with Parser.Moves(). - model (object): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .model attribute unless True (default), - in which case a new instance is created with Parser.Model(). - **cfg: - Arbitrary configuration parameters. Set to the .cfg attribute + vocab (Vocab): The vocabulary object. Must be shared with documents + to be processed. The value is set to the `.vocab` attribute. + moves (TransitionSystem): Defines how the parse-state is created, + updated and evaluated. The value is set to the .moves attribute + unless True (default), in which case a new instance is created with + `Parser.Moves()`. + model (object): Defines how the parse-state is created, updated and + evaluated. The value is set to the .model attribute unless True + (default), in which case a new instance is created with + `Parser.Model()`. + **cfg: Arbitrary configuration parameters. 
Set to the `.cfg` attribute """ self.vocab = vocab if moves is True: @@ -322,13 +317,10 @@ cdef class Parser: return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc doc, beam_width=None, beam_density=None): - """ - Apply the parser or entity recognizer, setting the annotations onto the Doc object. + """Apply the parser or entity recognizer, setting the annotations onto + the `Doc` object. - Arguments: - doc (Doc): The document to be processed. - Returns: - None + doc (Doc): The document to be processed. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -350,16 +342,13 @@ cdef class Parser: def pipe(self, docs, int batch_size=256, int n_threads=2, beam_width=None, beam_density=None): - """ - Process a stream of documents. + """Process a stream of documents. - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. + stream: The sequence of documents to process. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel. + YIELDS (Doc): Documents, in order. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1)
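
Editor's note on the tensordot calls in `PrecomputableAffine.begin_update`: the einsum expressions kept in the comments describe the same contractions as the `axes` arguments. The snippet below is a standalone sanity check in plain numpy with made-up small dimensions (`b`, `f`, `o`, `i` are illustrative only, not spaCy API) confirming that correspondence.

# Check that the tensordot calls in PrecomputableAffine.begin_update match
# the einsum forms left in the comments (shapes follow the docstring there).
import numpy

b, f, o, i = 4, 3, 5, 6
X = numpy.random.randn(b, i)       # input vectors
W = numpy.random.randn(f, o, i)    # weights: (features, outputs, inputs)
dY = numpy.random.randn(b, o)      # gradient w.r.t. outputs
Xf = numpy.random.randn(b, f, i)   # gathered input features

# Forward: Yf = numpy.einsum('bi,foi->bfo', X, W)
assert numpy.allclose(numpy.tensordot(X, W, axes=[[1], [2]]),
                      numpy.einsum('bi,foi->bfo', X, W))
# Backward, inputs: dXf = numpy.einsum('bo,foi->bfi', dY, W)
assert numpy.allclose(numpy.tensordot(dY, W, axes=[[1], [1]]),
                      numpy.einsum('bo,foi->bfi', dY, W))
# Backward, weights: dW = numpy.einsum('bo,bfi->ofi', dY, Xf);
# the code then transposes (1, 0, 2) to reorder ofi -> foi.
assert numpy.allclose(numpy.tensordot(dY, Xf, axes=[[0], [0]]),
                      numpy.einsum('bo,bfi->ofi', dY, Xf))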
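
Several helpers in `_ml.py` (`_flatten_add_lengths` and the `flatten` layer) rely on the same flatten-with-lengths pattern: variable-length sequences are packed into one array for the forward pass, and the stored lengths split the gradient back out in the backward pass. A small self-contained sketch of that contract, in plain numpy rather than thinc's ops and with the `pad` handling omitted:

# Minimal sketch of the flatten/unflatten pattern (plain numpy, no padding).
import numpy

def flatten_add_lengths(seqs):
    lengths = numpy.asarray([len(seq) for seq in seqs], dtype='i')
    X = numpy.vstack(seqs)                      # (sum(lengths), width)

    def finish_update(d_X):
        # Split the flat gradient back into one piece per input sequence.
        starts = numpy.cumsum(lengths)[:-1]
        return numpy.split(d_X, starts, axis=0)

    return (X, lengths), finish_update

seqs = [numpy.ones((n, 4), dtype='f') for n in (2, 3, 1)]
(X, lengths), backprop = flatten_add_lengths(seqs)
assert X.shape == (6, 4) and list(lengths) == [2, 3, 1]
d_seqs = backprop(numpy.zeros_like(X))
assert [d.shape[0] for d in d_seqs] == [2, 3, 1]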
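
The `precompute_hiddens` docstring describes doing the expensive input-layer multiplication once per batch of tokens, so that each parser state only needs a cheap gather-and-sum. A minimal sketch of that idea, assuming toy shapes and plain numpy; names like `n_tokens` and `state_ids` are illustrative and stand in for the parser's real state machinery:

# Sketch of the precomputation trick from the precompute_hiddens docstring.
import numpy

n_tokens, n_feats, n_in, n_out = 10, 3, 8, 16
X = numpy.random.randn(n_tokens, n_in).astype('f')        # token vectors
W = numpy.random.randn(n_feats, n_out, n_in).astype('f')  # per-slot weights
b = numpy.zeros((n_out,), dtype='f')

# Do the expensive multiplication once, up front:
# Yf[t, f] holds token t's contribution when it fills feature slot f.
Yf = numpy.tensordot(X, W, axes=[[1], [2]])               # (n_tokens, n_feats, n_out)

# At parse time, a state just names which token fills each slot; its hidden
# vector is a cheap gather-and-sum, with no further matrix multiplication.
state_ids = numpy.array([2, 5, 7])                        # one token id per slot
hidden = Yf[state_ids, numpy.arange(n_feats)].sum(axis=0) + b

# Equivalent direct computation, for comparison:
direct = sum(W[f] @ X[state_ids[f]] for f in range(n_feats)) + b
assert numpy.allclose(hidden, direct, atol=1e-4)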