Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-13 10:46:29 +03:00)

Commit 7ae45bffcf: Merge remote-tracking branch 'upstream/develop' into indonesian

.gitignore (vendored) | 1

@@ -40,7 +40,6 @@ venv/
 # Distribution / packaging
 env/
-bin/
 build/
 develop-eggs/
 dist/

MANIFEST.in | 1

@@ -1,3 +1,4 @@
 recursive-include include *.h
 include LICENSE
 include README.rst
+include bin/spacy

setup.py | 1

@@ -187,6 +187,7 @@ def setup_package():
         url=about['__uri__'],
         license=about['__license__'],
         ext_modules=ext_modules,
+        scripts=['bin/spacy'],
         install_requires=[
             'numpy>=1.7',
             'murmurhash>=0.28,<0.29',

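Note: the new scripts=['bin/spacy'] entry is what lets users type `spacy download en` instead of `python -m spacy download en` throughout the docs changed below. The bin/spacy file itself is not shown in this diff; a minimal sketch of what such an entry-point script could look like (hypothetical, not the actual file):

    #!/usr/bin/env python
    # Hypothetical sketch of a bin/spacy entry point; the real file's
    # contents are not part of this diff. setup.py's scripts=['bin/spacy']
    # installs it on PATH under the name `spacy`.
    import runpy

    if __name__ == '__main__':
        # Re-use the package's `python -m spacy` command dispatch instead
        # of duplicating the plac-based command table here.
        runpy.run_module('spacy', run_name='__main__', alter_sys=True)
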
spacy/_ml.py | 96

@@ -5,12 +5,10 @@ from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 import random
 import cytoolz
 
 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
 from thinc.neural._classes.batchnorm import BatchNorm
 from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
 from thinc.neural._classes.selu import SELU
@@ -21,7 +19,7 @@ from thinc.api import FeatureExtracter, with_getitem
 from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
 from thinc.neural._classes.attention import ParametricAttention
 from thinc.linear.linear import LinearModel
-from thinc.api import uniqued, wrap, flatten_add_lengths
+from thinc.api import uniqued, wrap
 
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
 from .tokens.doc import Doc
@@ -55,27 +53,6 @@ def _logistic(X, drop=0.):
     return Y, logistic_bwd
 
 
-@layerize
-def add_tuples(X, drop=0.):
-    """Give inputs of sequence pairs, where each sequence is (vals, length),
-    sum the values, returning a single sequence.
-
-    If input is:
-    ((vals1, length), (vals2, length)
-    Output is:
-    (vals1+vals2, length)
-
-    vals are a single tensor for the whole batch.
-    """
-    (vals1, length1), (vals2, length2) = X
-    assert length1 == length2
-
-    def add_tuples_bwd(dY, sgd=None):
-        return (dY, dY)
-
-    return (vals1+vals2, length), add_tuples_bwd
-
-
 def _zero_init(model):
     def _zero_init_impl(self, X, y):
         self.W.fill(0)
@@ -84,7 +61,6 @@ def _zero_init(model):
     model.W.fill(0.)
     return model
 
-
 @layerize
 def _preprocess_doc(docs, drop=0.):
     keys = [doc.to_array([LOWER]) for doc in docs]
@@ -96,6 +72,7 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None
 
 
+
 def _init_for_precomputed(W, ops):
     if (W**2).sum() != 0.:
         return
@@ -103,7 +80,6 @@ def _init_for_precomputed(W, ops):
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)
 
-
 @describe.on_data(_set_dimensions_if_needed)
 @describe.attributes(
     nI=Dimension("Input size"),
@@ -209,9 +185,9 @@ class PrecomputableMaxouts(Model):
 
 
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
@@ -220,13 +196,13 @@ def Tok2Vec(width, embed_size, preprocess=None):
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
-                >> uniqued(embed, column=5)
-                >> LN(Maxout(width, width*4, pieces=3))
-                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))),
+                >> embed
+                >> Maxout(width, width*4, pieces=3)
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
                 pad=4)
         )
         if preprocess not in (False, None):
            tok2vec = preprocess >> tok2vec
@@ -321,7 +297,7 @@ def zero_init(model):
 
 
 def doc2feats(cols=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:
@@ -347,36 +323,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
     return vectors, backward
 
 
-def fine_tune(embedding, combine=None):
-    if combine is not None:
-        raise NotImplementedError(
-            "fine_tune currently only supports addition. Set combine=None")
-    def fine_tune_fwd(docs_tokvecs, drop=0.):
-        docs, tokvecs = docs_tokvecs
-        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
-
-        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
-        flat_tokvecs = embedding.ops.flatten(tokvecs)
-        flat_vecs = embedding.ops.flatten(vecs)
-        output = embedding.ops.unflatten(
-            (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
-            lengths)
-
-        def fine_tune_bwd(d_output, sgd=None):
-            bp_vecs(d_output, sgd=sgd)
-            flat_grad = model.ops.flatten(d_output)
-            model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
-            model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
-            sgd(model._mem.weights, model._mem.gradient, key=model.id)
-            return d_output
-        return output, fine_tune_bwd
-    model = wrap(fine_tune_fwd, embedding)
-    model.mix = model._mem.add((model.id, 'mix'), (2,))
-    model.mix.fill(1.)
-    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
-    return model
-
-
 @layerize
 def flatten(seqs, drop=0.):
     if isinstance(seqs[0], numpy.ndarray):
@@ -423,26 +369,6 @@ def preprocess_doc(docs, drop=0.):
     vals = ops.allocate(keys.shape[0]) + 1
     return (keys, vals, lengths), None
 
-def getitem(i):
-    def getitem_fwd(X, drop=0.):
-        return X[i], None
-    return layerize(getitem_fwd)
-
-def build_tagger_model(nr_class, token_vector_width, **cfg):
-    with Model.define_operators({'>>': chain, '+': add}):
-        # Input: (doc, tensor) tuples
-        private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
-
-        model = (
-            fine_tune(private_tok2vec)
-            >> with_flatten(
-                Maxout(token_vector_width, token_vector_width)
-                >> Softmax(nr_class, token_vector_width)
-            )
-        )
-    model.nI = None
-    return model
-
 
 def build_text_classifier(nr_class, width=64, **cfg):
     nr_vector = cfg.get('nr_vector', 200)
@@ -457,7 +383,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> _flatten_add_lengths
             >> with_getitem(0,
                 uniqued(
                     (embed_lower | embed_prefix | embed_suffix | embed_shape)
                     >> Maxout(width, width+(width//2)*3))
             >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
@@ -478,7 +404,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
             >> logistic
         )
 
 
     model.lsuv = False
     return model

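Note: the Tok2Vec change above swaps the uniqued, layer-normalized embedding stack back to a plain HashEmbed-plus-Maxout pipeline, keeping the four residual convolution blocks. The Residual(ExtractWindow(nW=1) >> Maxout(...)) pattern adds each block's input to its output; a toy numpy sketch of that wiring (illustrative only, not thinc's implementation):

    import numpy

    def residual(layer):
        # Residual(layer)(X) == X + layer(X); the layer must preserve width
        def forward(X):
            return X + layer(X)
        return forward

    def toy_window_layer(X):
        # stand-in for ExtractWindow(nW=1) >> Maxout(...): mixes each row
        # with its neighbours, then takes an elementwise max of two "pieces"
        left = numpy.roll(X, 1, axis=0)
        right = numpy.roll(X, -1, axis=0)
        return numpy.maximum(0.5 * (left + right), 0.1 * X)

    block = residual(toy_window_layer)
    X = numpy.random.uniform(-1, 1, (5, 4))   # 5 tokens, width 4
    print(block(X).shape)                     # (5, 4): width is preserved
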
spacy/about.py

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a7'
+__version__ = '2.0.0a9'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'

spacy/cli/download.py

@@ -8,7 +8,7 @@ import subprocess
 import sys
 
 from .link import link
-from ..util import prints
+from ..util import prints, get_package_path
 from .. import about
 
 
@@ -32,7 +32,11 @@ def download(cmd, model, direct=False):
         version = get_version(model_name, compatibility)
         download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
         try:
-            link(None, model_name, model, force=True)
+            # Get package path here because link uses
+            # pip.get_installed_distributions() to check if model is a package,
+            # which fails if model was just installed via subprocess
+            package_path = get_package_path(model_name)
+            link(None, model_name, model, force=True, model_path=package_path)
         except:
             # Dirty, but since spacy.download and the auto-linking is mostly
             # a convenience wrapper, it's best to show a success message and

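Note: the comment in the diff explains the bug being fixed -- pip.get_installed_distributions() is evaluated per process, so a model installed via subprocess moments earlier is invisible to link()'s package check. Resolving the path directly sidesteps that. A rough sketch of how a helper like get_package_path can do this (sketch only, not spaCy's exact implementation):

    import importlib
    from pathlib import Path

    def get_package_path(name):
        # importing works immediately after a subprocess pip install,
        # unlike pip's cached distribution listing
        pkg = importlib.import_module(name)
        return Path(pkg.__file__).parent

    # download() then hands the path straight to link():
    #     package_path = get_package_path(model_name)
    #     link(None, model_name, model, force=True, model_path=package_path)
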
spacy/cli/link.py

@@ -14,7 +14,7 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(cmd, origin, link_name, force=False):
+def link(cmd, origin, link_name, force=False, model_path=None):
     """
     Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
@@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:
-        model_path = Path(origin)
+        model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
                title="Can't locate model data", exits=1)

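Note: with the new model_path keyword, callers such as download() can bypass the pip-package lookup entirely. A condensed sketch of the resolution order link() now implements (paraphrased from the diff above, not a verbatim copy):

    from pathlib import Path

    def resolve_model_path(origin, model_path=None, is_package=False,
                           get_package_path=None):
        if is_package:
            # installed pip package: ask the package system where it lives
            return get_package_path(origin)
        # an explicit model_path (e.g. from download()) wins over
        # treating origin as a local directory
        return Path(origin) if model_path is None else Path(model_path)
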
spacy/cli/package.py

@@ -15,10 +15,11 @@ from .. import about
 @plac.annotations(
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
+    meta_path=("path to meta.json", "option", "m", str),
+    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(cmd, input_dir, output_dir, meta=None, force=False):
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
@@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta)
+    meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
         prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():
@@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
+    if not create_meta and meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
         meta = util.read_json(meta_path)
     else:

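Note: a sketch of the meta.json precedence that package() now implements -- an explicit --meta-path wins; otherwise input_dir/meta.json is read, unless --create-meta forces regeneration even when the file exists:

    from pathlib import Path

    def resolve_meta(input_path, meta_path=None, create_meta=False):
        meta_path = Path(meta_path) if meta_path else input_path / 'meta.json'
        if not create_meta and meta_path.is_file():
            return 'read', meta_path       # "Reading meta.json from file"
        return 'generate', meta_path       # prompt the user for meta data

    print(resolve_meta(Path('/tmp'), create_meta=True))
    # ('generate', PosixPath('/tmp/meta.json'))
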
spacy/cli/train.py

@@ -91,8 +91,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
            for batch in minibatch(train_docs, size=batch_sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer,
-                          drop=next(dropout_rates), losses=losses,
-                          update_tensors=True)
+                          drop=next(dropout_rates), losses=losses)
                pbar.update(sum(len(doc) for doc in docs))
 
        with nlp.use_params(optimizer.averages):

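Note: the update_tensors=True argument is dropped to match the simplified Language.update() signature later in this commit. The next(dropout_rates) call in the loop consumes a decaying dropout schedule; a toy reimplementation of such a schedule (spaCy's util.decaying may differ in its exact decay curve):

    import itertools

    def decaying(start, stop, decay):
        # yield a dropout rate that decays from `start` towards `stop`
        for t in itertools.count():
            yield max(stop, start - decay * t)

    dropout_rates = decaying(0.2, 0.0, 1e-4)
    print([round(next(dropout_rates), 4) for _ in range(3)])
    # [0.2, 0.1999, 0.1998]
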
spacy/deprecated.py

@@ -15,7 +15,7 @@ def depr_model_download(lang):
     lang (unicode): Language shortcut, 'en' or 'de'.
     """
     prints("The spacy.%s.download command is now deprecated. Please use "
-           "python -m spacy download [model name or shortcut] instead. For "
+           "spacy download [model name or shortcut] instead. For "
           "more info, see the documentation:" % lang,
           about.__docs_models__,
           "Downloading default '%s' model now..." % lang,

spacy/language.py

@@ -277,8 +277,7 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)
 
-    def update(self, docs, golds, drop=0., sgd=None, losses=None,
-               update_tensors=False):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
         """Update the models in the pipeline.
 
         docs (iterable): A batch of `Doc` objects.
@@ -311,7 +310,7 @@ class Language(object):
                 tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if update_tensors and d_tokvecses is not None:
+            if d_tokvecses is not None:
                 bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
@@ -382,18 +381,9 @@ class Language(object):
         return optimizer
 
     def evaluate(self, docs_golds):
-        scorer = Scorer()
         docs, golds = zip(*docs_golds)
-        docs = list(docs)
-        golds = list(golds)
-        for pipe in self.pipeline:
-            if not hasattr(pipe, 'pipe'):
-                for doc in docs:
-                    pipe(doc)
-            else:
-                docs = list(pipe.pipe(docs))
-        assert len(docs) == len(golds)
-        for doc, gold in zip(docs, golds):
+        scorer = Scorer()
+        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
             scorer.score(doc, gold)
+            doc.tensor = None
         return scorer

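Note: evaluate() now makes a single streamed pass through nlp.pipe() instead of manually running each pipeline component over a materialized list, and it frees doc.tensor after scoring. The control flow in toy form (stand-ins only; the real method scores Doc objects against GoldParse with a Scorer):

    def pipe(docs, batch_size=32):
        # stand-in for nlp.pipe(): lazily process documents in batches
        for doc in docs:
            yield doc.upper()

    docs = ('one', 'two')
    golds = ('ONE', 'TWO')
    correct = 0
    for doc, gold in zip(pipe(docs), golds):
        correct += int(doc == gold)       # stand-in for scorer.score(doc, gold)
    print(correct / len(golds))           # 1.0
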
spacy/pipeline.pyx

@@ -42,7 +42,7 @@ from .compat import json_dumps
 
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
-from ._ml import build_text_classifier, build_tagger_model
+from ._ml import build_text_classifier
 from .parts_of_speech import X
 
 
@@ -253,25 +253,23 @@ class NeuralTagger(BaseThincComponent):
         self.cfg = dict(cfg)
 
     def __call__(self, doc):
-        tags = self.predict(([doc], [doc.tensor]))
+        tags = self.predict([doc.tensor])
         self.set_annotations([doc], tags)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict((docs, tokvecs))
+            tag_ids = self.predict(tokvecs)
             self.set_annotations(docs, tag_ids)
             yield from docs
 
-    def predict(self, docs_tokvecs):
-        scores = self.model(docs_tokvecs)
+    def predict(self, tokvecs):
+        scores = self.model(tokvecs)
         scores = self.model.ops.flatten(scores)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
-        tokvecs = docs_tokvecs[1]
         guesses = self.model.ops.unflatten(guesses,
                                            [tv.shape[0] for tv in tokvecs])
         return guesses
@@ -296,7 +294,8 @@ class NeuralTagger(BaseThincComponent):
 
         if self.model.nI is None:
             self.model.nI = tokvecs[0].shape[1]
-        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
+
+        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
 
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
@@ -347,8 +346,10 @@ class NeuralTagger(BaseThincComponent):
 
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return build_tagger_model(n_tags, token_vector_width)
+        return with_flatten(
+            chain(Maxout(token_vector_width, token_vector_width),
+                  Softmax(n_tags, token_vector_width)))
 
     def use_params(self, params):
         with self.model.use_params(params):
             yield
@@ -431,7 +432,7 @@ class NeuralLabeller(NeuralTagger):
 
     @property
     def labels(self):
-        return self.cfg.setdefault('labels', {})
+        return self.cfg.get('labels', {})
 
     @labels.setter
     def labels(self, value):
@@ -454,8 +455,10 @@ class NeuralLabeller(NeuralTagger):
 
     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return build_tagger_model(n_tags, token_vector_width)
+        return with_flatten(
+            chain(Maxout(token_vector_width, token_vector_width),
+                  Softmax(n_tags, token_vector_width)))
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
         cdef int idx = 0

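Note: NeuralTagger.predict() now takes just the per-document tensors rather than (docs, tokvecs) tuples. It still scores one flat batch of tokens and then splits the argmax guesses back per document; the flatten/unflatten round trip in plain numpy:

    import numpy

    tokvecs = [numpy.zeros((3, 4)), numpy.zeros((5, 4))]      # two docs
    flat = numpy.vstack(tokvecs)                              # model input
    scores = numpy.random.uniform(0, 1, (flat.shape[0], 7))   # 7 tags
    guesses = scores.argmax(axis=1)

    lengths = [tv.shape[0] for tv in tokvecs]
    split_points = numpy.cumsum(lengths)[:-1]
    per_doc = numpy.split(guesses, split_points)              # "unflatten"
    print([g.shape[0] for g in per_doc])                      # [3, 5]
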
spacy/syntax/arc_eager.pyx

@@ -385,7 +385,6 @@ cdef class ArcEager(TransitionSystem):
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
-        return Transition(clas=0, move=MISSING, label=0)
 
     def move_name(self, int move, attr_t label):
         label_str = self.strings[label]

spacy/syntax/nn_parser.pxd

@@ -14,4 +14,8 @@ cdef class Parser:
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
 
+    cdef void _parse_step(self, StateC* state,
+            const float* feat_weights,
+            int nr_class, int nr_feat, int nr_piece) nogil
+
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

spacy/syntax/nn_parser.pyx

@@ -44,7 +44,7 @@ from thinc.neural.util import get_array_module
 from .. import util
 from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
-from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
+from .._ml import Tok2Vec, doc2feats, rebatch
 from ..compat import json_dumps
 
 from . import _parse_features
@@ -237,7 +237,6 @@ cdef class Parser:
         token_vector_width = util.env_opt('token_vector_width', token_vector_width)
         hidden_width = util.env_opt('hidden_width', hidden_width)
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
-        tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()))
         if parser_maxout_pieces == 1:
             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
                                         nF=cls.nr_feature,
@@ -249,10 +248,15 @@ cdef class Parser:
                                         nI=token_vector_width)
 
         with Model.use_device('cpu'):
-            upper = chain(
-                clone(Maxout(hidden_width), (depth-1)),
-                zero_init(Affine(nr_class, drop_factor=0.0))
-            )
+            if depth == 0:
+                upper = chain()
+                upper.is_noop = True
+            else:
+                upper = chain(
+                    clone(Maxout(hidden_width), (depth-1)),
+                    zero_init(Affine(nr_class, drop_factor=0.0))
+                )
+                upper.is_noop = False
         # TODO: This is an unfortunate hack atm!
         # Used to set input dimensions in network.
         lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@@ -264,7 +268,7 @@ cdef class Parser:
             'hidden_width': hidden_width,
             'maxout_pieces': parser_maxout_pieces
         }
-        return (tensors, lower, upper), cfg
+        return (lower, upper), cfg
 
     def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
         """
@@ -340,10 +344,12 @@ cdef class Parser:
             The number of threads with which to work on the buffer in parallel.
         Yields (Doc): Documents, in order.
         """
+        cdef StateClass parse_state
+        cdef Doc doc
         queue = []
         for docs in cytoolz.partition_all(batch_size, docs):
             docs = list(docs)
-            tokvecs = [doc.tensor for doc in docs]
+            tokvecs = [d.tensor for d in docs]
             if beam_width == 1:
                 parse_states = self.parse_batch(docs, tokvecs)
             else:
@@ -363,11 +369,8 @@ cdef class Parser:
             int nr_class, nr_feat, nr_piece, nr_dim, nr_state
         if isinstance(docs, Doc):
             docs = [docs]
-        if isinstance(tokvecses, np.ndarray):
-            tokvecses = [tokvecses]
 
         tokvecs = self.model[0].ops.flatten(tokvecses)
-        tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
 
         nr_state = len(docs)
         nr_class = self.moves.n_moves
@@ -391,20 +394,27 @@ cdef class Parser:
         cdef np.ndarray scores
         c_token_ids = <int*>token_ids.data
         c_is_valid = <int*>is_valid.data
+        cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
         while not next_step.empty():
-            for i in range(next_step.size()):
-                st = next_step[i]
-                st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
-                self.moves.set_valid(&c_is_valid[i*nr_class], st)
-            vectors = state2vec(token_ids[:next_step.size()])
-            scores = vec2scores(vectors)
-            c_scores = <float*>scores.data
-            for i in range(next_step.size()):
-                st = next_step[i]
-                guess = arg_max_if_valid(
-                    &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
-                action = self.moves.c[guess]
-                action.do(st, action.label)
+            if not has_hidden:
+                for i in cython.parallel.prange(
+                        next_step.size(), num_threads=6, nogil=True):
+                    self._parse_step(next_step[i],
+                        feat_weights, nr_class, nr_feat, nr_piece)
+            else:
+                for i in range(next_step.size()):
+                    st = next_step[i]
+                    st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
+                    self.moves.set_valid(&c_is_valid[i*nr_class], st)
+                vectors = state2vec(token_ids[:next_step.size()])
+                scores = vec2scores(vectors)
+                c_scores = <float*>scores.data
+                for i in range(next_step.size()):
+                    st = next_step[i]
+                    guess = arg_max_if_valid(
+                        &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
+                    action = self.moves.c[guess]
+                    action.do(st, action.label)
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
@@ -419,7 +429,6 @@ cdef class Parser:
         cdef int nr_class = self.moves.n_moves
         cdef StateClass stcls, output
         tokvecs = self.model[0].ops.flatten(tokvecses)
-        tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
         cuda_stream = get_cuda_stream()
         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
                                                      cuda_stream, 0.0)
@@ -452,6 +461,28 @@ cdef class Parser:
             beams.append(beam)
         return beams
 
+    cdef void _parse_step(self, StateC* state,
+            const float* feat_weights,
+            int nr_class, int nr_feat, int nr_piece) nogil:
+        '''This only works with no hidden layers -- fast but inaccurate'''
+        #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
+        #    self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
+        token_ids = <int*>calloc(nr_feat, sizeof(int))
+        scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
+        is_valid = <int*>calloc(nr_class, sizeof(int))
+
+        state.set_context_tokens(token_ids, nr_feat)
+        sum_state_features(scores,
+            feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
+        self.moves.set_valid(is_valid, state)
+        guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
+        action = self.moves.c[guess]
+        action.do(state, action.label)
+
+        free(is_valid)
+        free(scores)
+        free(token_ids)
+
     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
@@ -460,9 +491,6 @@ cdef class Parser:
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
-        my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.)
-        my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
-        tokvecs += my_tokvecs
 
         cuda_stream = get_cuda_stream()
 
@@ -512,9 +540,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
                            backprops, sgd, cuda_stream)
-        d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-        #bp_my_tokvecs(d_tokvecs, sgd=sgd)
-        return d_tokvecs
+        return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
 
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
@@ -577,7 +603,7 @@ cdef class Parser:
         return names
 
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
-        _, lower, upper = self.model
+        lower, upper = self.model
         state2vec = precompute_hiddens(batch_size, tokvecs,
                                        lower, stream, drop=dropout)
         return state2vec, upper
@@ -667,12 +693,10 @@ cdef class Parser:
 
     def to_disk(self, path, **exclude):
         serializers = {
-            'tok2vec_model': lambda p: p.open('wb').write(
-                self.model[0].to_bytes()),
             'lower_model': lambda p: p.open('wb').write(
-                self.model[1].to_bytes()),
+                self.model[0].to_bytes()),
             'upper_model': lambda p: p.open('wb').write(
-                self.model[2].to_bytes()),
+                self.model[1].to_bytes()),
             'vocab': lambda p: self.vocab.to_disk(p),
             'moves': lambda p: self.moves.to_disk(p, strings=False),
             'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
@@ -693,29 +717,24 @@ cdef class Parser:
             self.model, cfg = self.Model(**self.cfg)
         else:
             cfg = {}
-        with (path / 'tok2vec_model').open('rb') as file_:
-            bytes_data = file_.read()
-        self.model[0].from_bytes(bytes_data)
         with (path / 'lower_model').open('rb') as file_:
             bytes_data = file_.read()
-        self.model[1].from_bytes(bytes_data)
+        self.model[0].from_bytes(bytes_data)
         with (path / 'upper_model').open('rb') as file_:
             bytes_data = file_.read()
-        self.model[2].from_bytes(bytes_data)
+        self.model[1].from_bytes(bytes_data)
         self.cfg.update(cfg)
         return self
 
     def to_bytes(self, **exclude):
         serializers = OrderedDict((
-            ('tok2vec_model', lambda: self.model[0].to_bytes()),
-            ('lower_model', lambda: self.model[1].to_bytes()),
-            ('upper_model', lambda: self.model[2].to_bytes()),
+            ('lower_model', lambda: self.model[0].to_bytes()),
+            ('upper_model', lambda: self.model[1].to_bytes()),
             ('vocab', lambda: self.vocab.to_bytes()),
             ('moves', lambda: self.moves.to_bytes(strings=False)),
             ('cfg', lambda: ujson.dumps(self.cfg))
         ))
         if 'model' in exclude:
-            exclude['tok2vec_model'] = True
             exclude['lower_model'] = True
             exclude['upper_model'] = True
             exclude.pop('model')
@@ -726,7 +745,6 @@ cdef class Parser:
             ('vocab', lambda b: self.vocab.from_bytes(b)),
             ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('tok2vec_model', lambda b: None),
             ('lower_model', lambda b: None),
             ('upper_model', lambda b: None)
         ))
@@ -736,12 +754,10 @@ cdef class Parser:
             self.model, cfg = self.Model(self.moves.n_moves)
         else:
             cfg = {}
-        if 'tok2vec_model' in msg:
-            self.model[0].from_bytes(msg['tok2vec_model'])
         if 'lower_model' in msg:
-            self.model[1].from_bytes(msg['lower_model'])
+            self.model[0].from_bytes(msg['lower_model'])
         if 'upper_model' in msg:
-            self.model[2].from_bytes(msg['upper_model'])
+            self.model[1].from_bytes(msg['upper_model'])
         self.cfg.update(cfg)
         return self

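Note: with the fine-tuning tok2vec component removed, the parser's self.model shrinks from (tensors, lower, upper) to (lower, upper), and the serialized fields drop 'tok2vec_model' accordingly. A toy round trip over the new field names (FakeLayer stands in for thinc models with to_bytes/from_bytes):

    from collections import OrderedDict

    class FakeLayer(object):
        def __init__(self, payload=b''):
            self.payload = payload
        def to_bytes(self):
            return self.payload
        def from_bytes(self, data):
            self.payload = data

    model = (FakeLayer(b'lower'), FakeLayer(b'upper'))   # (lower, upper)
    msg = OrderedDict([('lower_model', model[0].to_bytes()),
                       ('upper_model', model[1].to_bytes())])

    restored = (FakeLayer(), FakeLayer())
    if 'lower_model' in msg:
        restored[0].from_bytes(msg['lower_model'])
    if 'upper_model' in msg:
        restored[1].from_bytes(msg['upper_model'])
    print(restored[0].payload, restored[1].payload)      # b'lower' b'upper'
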
spacy/syntax/transition_system.pyx

@@ -107,8 +107,6 @@ cdef class TransitionSystem:
 
     def is_valid(self, StateClass stcls, move_name):
         action = self.lookup_transition(move_name)
-        if action.move == 0:
-            return False
         return action.is_valid(stcls.c, action.label)
 
     cdef int set_valid(self, int* is_valid, const StateC* st) nogil:

spacy/util.py

@@ -113,7 +113,7 @@ def load_model(name, **overrides):
 def load_model_from_link(name, **overrides):
     """Load a model from a shortcut link, or directory in spaCy data path."""
     init_file = get_data_path() / name / '__init__.py'
-    spec = importlib.util.spec_from_file_location(name, init_file)
+    spec = importlib.util.spec_from_file_location(name, str(init_file))
     try:
         cls = importlib.util.module_from_spec(spec)
     except AttributeError:

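Note: importlib.util.spec_from_file_location() historically required a plain string path, and get_data_path() returns a pathlib.Path, so the Path is now stringified explicitly (Python 3.6+ accepts path-like objects, but older versions do not). Minimal illustration:

    import importlib.util
    from pathlib import Path

    init_file = Path('some_model') / '__init__.py'   # illustrative path
    spec = importlib.util.spec_from_file_location('some_model', str(init_file))
    print(spec.origin)                               # some_model/__init__.py
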
website/_includes/_mixins.jade

@@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
     label      - [string] aside title (optional or false for no label)
     language   - [string] language for syntax highlighting (default: "python")
                  supports basic relevant languages available for PrismJS
-    icon       - [string] icon to display next to code block, mostly used for old/new
+    prompt     - [string] prompt or icon to display next to code block (mostly used for old/new)
     height     - [integer] optional height to clip code block to
 
-mixin code(label, language, icon, height)
+mixin code(label, language, prompt, height)
     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
         if label
             h4.u-text-label.u-text-label--dark=label
 
+        - var icon = (prompt == 'accept' || prompt == 'reject')
         if icon
            - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
            .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
                +icon(icon, 18)
 
-        code.c-code-block__content
+        code.c-code-block__content(data-prompt=icon ? null : prompt)
            block

website/assets/css/_components/_code.sass

@@ -35,6 +35,13 @@
     font: normal normal 1.1rem/#{2} $font-code
     padding: 1em 2em
 
+    &[data-prompt]:before
+        content: attr(data-prompt)
+        margin-right: 0.65em
+        display: inline-block
+        vertical-align: middle
+        opacity: 0.5
+
 
 //- Inline code

website/docs/api/cli.jade

@@ -5,16 +5,7 @@ include ../../_includes/_mixins
 p
     | As of v1.7.0, spaCy comes with new command line helpers to download and
     | link models and show useful debugging information. For a list of available
-    | commands, type #[code python -m spacy]. To make the command even more
-    | convenient, we recommend
-    | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
-    | mapping #[code python -m spacy] to #[code spacy].
-
-+aside("Why python -m?")
-    | The problem with a global entry point is that it's resolved by looking up
-    | entries in your #[code PATH] environment variable. This can give you
-    | unexpected results, like executing the wrong spaCy installation.
-    | #[code python -m] prevents fallbacks to system modules.
+    | commands, type #[code spacy --help].
 
 +infobox("⚠️ Deprecation note")
     | As of spaCy 2.0, the #[code model] command to initialise a model data
@@ -33,8 +24,8 @@ p
     | Direct downloads don't perform any compatibility checks and require the
     | model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
 
-+code(false, "bash").
-    python -m spacy download [model] [--direct]
++code(false, "bash", "$").
+    spacy download [model] [--direct]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -80,8 +71,8 @@ p
     | or use the #[+api("cli#package") #[code package]] command to create a
     | model package.
 
-+code(false, "bash").
-    python -m spacy link [origin] [link_name] [--force]
++code(false, "bash", "$").
+    spacy link [origin] [link_name] [--force]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -112,8 +103,8 @@ p
     | markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
 
 +code(false, "bash").
-    python -m spacy info [--markdown]
-    python -m spacy info [model] [--markdown]
+    spacy info [--markdown]
+    spacy info [model] [--markdown]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -139,8 +130,8 @@ p
     | functions. The right converter is chosen based on the file extension of
     | the input file. Currently only supports #[code .conllu].
 
-+code(false, "bash").
-    python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
++code(false, "bash", "$").
+    spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -174,8 +165,8 @@ p
     | Train a model. Expects data in spaCy's
     | #[+a("/docs/api/annotation#json-input") JSON format].
 
-+code(false, "bash").
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
++code(false, "bash", "$").
+    spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -345,8 +336,8 @@ p
     | sure you're always using the latest versions. This means you need to be
     | connected to the internet to use this command.
 
-+code(false, "bash").
-    python -m spacy package [input_dir] [output_dir] [--meta] [--force]
++code(false, "bash", "$").
+    spacy package [input_dir] [output_dir] [--meta] [--force]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -360,10 +351,17 @@ p
         +cell Directory to create package folder in.
 
     +row
-        +cell #[code meta]
+        +cell #[code --meta-path], #[code -m]
         +cell option
         +cell Path to meta.json file (optional).
 
+    +row
+        +cell #[code --create-meta], #[code -c]
+        +cell flag
+        +cell
+            | Create a meta.json file on the command line, even if one already
+            | exists in the directory.
+
     +row
         +cell #[code --force], #[code -f]
         +cell flag

website/docs/api/language-models.jade

@@ -8,9 +8,9 @@ p
 
 
 +aside-code("Download language models", "bash").
-    python -m spacy download en
-    python -m spacy download de
-    python -m spacy download fr
+    spacy download en
+    spacy download de
+    spacy download fr
 
 +table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
     +row

website/docs/usage/adding-languages.jade

@@ -205,7 +205,7 @@ p
 
 +infobox("Why lazy-loading?")
     | Some languages contain large volumes of custom data, like lemmatizer
-    | loopup tables, or complex regular expression that are expensive to
+    | lookup tables, or complex regular expression that are expensive to
     | compute. As of spaCy v2.0, #[code Language] classes are not imported on
     | initialisation and are only loaded when you import them directly, or load
     | a model that requires a language to be loaded. To lazy-load languages in
@@ -789,4 +789,4 @@ p
     | model use the using spaCy's #[+api("cli#train") #[code train]] command:
 
 +code(false, "bash").
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+    spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

website/docs/usage/index.jade

@@ -32,10 +32,10 @@ p
     +qs({package: 'source'}) pip install -r requirements.txt
     +qs({package: 'source'}) pip install -e .
 
-    +qs({model: 'en'}) python -m spacy download en
-    +qs({model: 'de'}) python -m spacy download de
-    +qs({model: 'fr'}) python -m spacy download fr
-    +qs({model: 'es'}) python -m spacy download es
+    +qs({model: 'en'}) spacy download en
+    +qs({model: 'de'}) spacy download de
+    +qs({model: 'fr'}) spacy download fr
+    +qs({model: 'es'}) spacy download es
 
 +h(2, "installation") Installation instructions
 
@@ -52,7 +52,7 @@ p Using pip, spaCy releases are currently only available as source packages.
     | and available models, see the #[+a("/docs/usage/models") docs on models].
 
 +code.o-no-block.
-    python -m spacy download en
+    spacy download en
 
     >>> import spacy
     >>> nlp = spacy.load('en')
@@ -312,7 +312,9 @@ p
     | This error may occur when running the #[code spacy] command from the
     | command line. spaCy does not currently add an entry to our #[code PATH]
     | environment variable, as this can lead to unexpected results, especially
-    | when using #[code virtualenv]. Run the command with #[code python -m],
+    | when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
+    | maps #[code spacy] to #[code python -m spacy]. If this is not working as
+    | expected, run the command with #[code python -m] yourself –
     | for example #[code python -m spacy download en]. For more info on this,
     | see #[+api("cli#download") download].

website/docs/usage/lightning-tour.jade

@@ -10,8 +10,8 @@ p
 +h(2, "models") Install models and process text
 
 +code(false, "bash").
-    python -m spacy download en
-    python -m spacy download de
+    spacy download en
+    spacy download de
 
 +code.
     import spacy

@ -20,7 +20,7 @@ p
|
|||
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
|
||||
for models, lang in MODELS
|
||||
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
|
||||
+qs({lang: lang}) python -m spacy download #{lang}
|
||||
+qs({lang: lang}) spacy download #{lang}
|
||||
+qs({lang: lang}, "divider")
|
||||
+qs({lang: lang, load: "module"}, "python") import #{package.id}
|
||||
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
|
||||
|
@ -52,16 +52,16 @@ p
|
|||
| #[+api("cli#download") #[code download]] command. It takes care of
|
||||
| finding the best-matching model compatible with your spaCy installation.
|
||||
|
||||
- var models = Object.keys(MODELS).map(function(lang) { return "python -m spacy download " + lang })
|
||||
- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
|
||||
+code(false, "bash").
|
||||
# out-of-the-box: download best-matching default model
|
||||
#{Object.keys(MODELS).map(function(l) {return "python -m spacy download " + l}).join('\n')}
|
||||
#{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
|
||||
|
||||
# download best-matching version of specific model for your spaCy installation
|
||||
python -m spacy download en_core_web_md
|
||||
spacy download en_core_web_md
|
||||
|
||||
# download exact model version (doesn't create shortcut link)
|
||||
python -m spacy download en_core_web_md-1.2.0 --direct
|
||||
spacy download en_core_web_md-1.2.0 --direct
|
||||
|
||||
p
|
||||
| The download command will #[+a("#download-pip") install the model] via
|
||||
|
@ -72,7 +72,7 @@ p
|
|||
|
||||
+code(false, "bash").
|
||||
pip install spacy
|
||||
python -m spacy download en
|
||||
spacy download en
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
|
@ -179,8 +179,8 @@ p
|
|||
| model names or IDs. And your system already comes with a native solution
|
||||
| to mapping unicode aliases to file paths: symbolic links.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy link [package name or path] [shortcut] [--force]
|
||||
+code(false, "bash", "$").
|
||||
spacy link [package name or path] [shortcut] [--force]
|
||||
|
||||
p
|
||||
| The first argument is the #[strong package name] (if the model was
|
||||
|
|
|
@ -85,7 +85,7 @@ p
|
|||
}
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_example_model /home/me/my_models
|
||||
spacy package /home/me/data/en_example_model /home/me/my_models
|
||||
|
||||
p This command will create a model package directory that should look like this:
|
||||
|
||||
|
|
|
@ -102,7 +102,7 @@ p
|
|||
| CLI command to create all required files and directories.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
||||
spacy package /home/me/data/en_technology /home/me/my_models
|
||||
|
||||
p
|
||||
| To build the package and create a #[code .tar.gz] archive, run
|
||||
|
|
|
website/docs/usage/v2.jade

@@ -238,11 +238,11 @@ p
 +h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
 
 +aside-code("Example", "bash").
-    python -m spacy download en # default English model
-    python -m spacy download de # default German model
-    python -m spacy download fr # default French model
-    python -m spacy download es # default Spanish model
-    python -m spacy download xx_ent_wiki_sm # multi-language NER
+    spacy download en # default English model
+    spacy download de # default German model
+    spacy download fr # default French model
+    spacy download es # default Spanish model
+    spacy download xx_ent_wiki_sm # multi-language NER
 
 p
     | spaCy v2.0 comes with new and improved neural network models for English,
@@ -259,7 +259,7 @@ p
     | notebook, the visualizations will be included as HTML.
 
 +code("Jupyter Example").
-    # don't forget to install a model, e.g.: python -m spacy download en
+    # don't forget to install a model, e.g.: spacy download en
     import spacy
     from spacy import displacy

website/index.jade

@@ -68,7 +68,7 @@ include _includes/_mixins
 +grid
     +grid-col("two-thirds")
         +terminal("lightning_tour.py").
-            # Install: pip install spacy && python -m spacy download en
+            # Install: pip install spacy && spacy download en
             import spacy
 
             # Load English tokenizer, tagger, parser, NER and word vectors
|
Loading…
Reference in New Issue
Block a user