Merge remote-tracking branch 'upstream/develop' into indonesian

2025-07-15 02:32:37 +03:00 · 2017-08-25 09:21:49 +08:00 · 2017-08-25 09:21:49 +08:00 · 58d8078971
commit 58d8078971
parent f77443ab68 bb1abbeba5
25 changed files with 340 additions and 85 deletions
--- a/README.rst
+++ b/README.rst
@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
--- a/spacy/main.py
+++ b/spacy/main.py
@ -3,15 +3,23 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals
 if __name__ == '__main__':
    import plac
    import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
    from spacy.cli import profile
    from spacy.util import prints
-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
+    commands = {
-                'convert': convert, 'package': package}
+        'download': download,
        'link': link,
        'info': info,
        'train': train,
        'convert': convert,
        'package': package,
        'model': model,
        'profile': profile,
    }
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)
    command = sys.argv.pop(1)
@ -19,5 +27,7 @@ if __name__ == '__main__':
    if command in commands:
        plac.call(commands[command])
    else:
-        prints("Available: %s" % ', '.join(commands),
+        prints(
-               title="Unknown command: %s" % command, exits=1)
+            "Available: %s" % ', '.join(commands),
            title="Unknown command: %s" % command,
            exits=1)
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
            return layer.begin_update(X, drop=drop)
        else:
            return X, lambda dX, sgd=None: dX
-    return wrap(drop_layer_fwd, layer)
+
    model = wrap(drop_layer_fwd, layer)
    model.predict = layer
    return model
 def Tok2Vec(width, embed_size, preprocess=None):
@ -359,8 +362,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
    def backward(d_output, sgd=None):
        return (tokens, d_output)
    return vectors, backward
 def fine_tune(embedding, combine=None):
    if combine is not None:
        raise NotImplementedError(
@ -373,22 +374,30 @@ def fine_tune(embedding, combine=None):
        flat_tokvecs = embedding.ops.flatten(tokvecs)
        flat_vecs = embedding.ops.flatten(vecs)
        output = embedding.ops.unflatten(
-                   (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
+                   (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
                    lengths)
        def fine_tune_bwd(d_output, sgd=None):
            bp_vecs(d_output, sgd=sgd)
            flat_grad = model.ops.flatten(d_output)
-            model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
-            model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
+            model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
            bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
            if sgd is not None:
                sgd(model._mem.weights, model._mem.gradient, key=model.id)
-            return d_output
+            return [d_o * model.mix[0] for d_o in d_output]
        return output, fine_tune_bwd
    def fine_tune_predict(docs_tokvecs):
        docs, tokvecs = docs_tokvecs
        vecs = embedding(docs)
        return [model.mix[0]*tv+model.mix[1]*v
                for tv, v in zip(tokvecs, vecs)]
    model = wrap(fine_tune_fwd, embedding)
    model.mix = model._mem.add((model.id, 'mix'), (2,))
-    model.mix.fill(1.)
+    model.mix.fill(0.5)
    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
    model.predict = fine_tune_predict
    return model
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -2,5 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
 from .profile import profile
 from .train import train
 from .convert import convert
 from .model import model
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -24,28 +24,29 @@ def download(cmd, model, direct=False):
    with version.
    """
    if direct:
-        download_model('{m}/{m}.tar.gz'.format(m=model))
+        dl = download_model('{m}/{m}.tar.gz'.format(m=model))
    else:
        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
        model_name = shortcuts.get(model, model)
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
-        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
-        try:
+        if dl == 0:
-            # Get package path here because link uses
+            try:
-            # pip.get_installed_distributions() to check if model is a package,
+                # Get package path here because link uses
-            # which fails if model was just installed via subprocess
+                # pip.get_installed_distributions() to check if model is a package,
-            package_path = get_package_path(model_name)
+                # which fails if model was just installed via subprocess
-            link(None, model_name, model, force=True, model_path=package_path)
+                package_path = get_package_path(model_name)
-        except:
+                link(None, model_name, model, force=True, model_path=package_path)
-            # Dirty, but since spacy.download and the auto-linking is mostly
+            except:
-            # a convenience wrapper, it's best to show a success message and
+                # Dirty, but since spacy.download and the auto-linking is mostly
-            # loading instructions, even if linking fails.
+                # a convenience wrapper, it's best to show a success message and
-            prints("Creating a shortcut link for 'en' didn't work (maybe you "
+                # loading instructions, even if linking fails.
-                   "don't have admin permissions?), but you can still load "
+                prints("Creating a shortcut link for 'en' didn't work (maybe you "
-                   "the model via its full package name:",
+                    "don't have admin permissions?), but you can still load "
-                   "nlp = spacy.load('%s')" % model_name,
+                    "the model via its full package name:",
-                   title="Download successful")
+                    "nlp = spacy.load('%s')" % model_name,
                    title="Download successful")
 def get_json(url, desc):
@ -77,6 +78,6 @@ def get_version(model, comp):
 def download_model(filename):
    download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m',
+    return subprocess.call([sys.executable, '-m',
        'pip', 'install', '--no-cache-dir', download_url],
        env=os.environ.copy())
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@ -0,0 +1,119 @@
 # coding: utf8
 from __future__ import unicode_literals
 import gzip
 import math
 from ast import literal_eval
 from pathlib import Path
 from preshed.counter import PreshCounter
 import spacy
 from ..compat import fix_text
 from .. import util
 def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Create a blank pipeline for `lang`, populate its vocab and save it.

    cmd: Not used by this function.
    lang: Language identifier handed to `spacy.blank`.
    model_dir: Output directory the pipeline is written to.
    freqs_data: Path of the word frequencies file (required).
    clusters_data: Path of the clusters file; a falsy value skips clusters.
    vectors_data: Path of the vectors file; a falsy value skips the check.
        Only its existence is validated here, the file itself is not read.
    """
    output_dir = Path(model_dir)
    frequencies_file = Path(freqs_data)
    clusters_file = Path(clusters_data) if clusters_data else None
    vectors_file = Path(vectors_data) if vectors_data else None
    # Exit early (via check_dirs) when an input file is missing.
    check_dirs(frequencies_file, clusters_file, vectors_file)
    nlp = spacy.blank(lang)
    target_vocab = nlp.vocab
    probs, oov_prob = read_probs(frequencies_file)
    if clusters_file:
        clusters = read_clusters(clusters_file)
    else:
        clusters = {}
    populate_vocab(target_vocab, clusters, probs, oov_prob)
    create_model(output_dir, nlp)
 def create_model(model_path, model):
    """Write `model` to `model_path`, creating that directory if it is absent.

    model_path (Path): Destination directory.
    model: Object exposing `to_disk(path)`; it receives the POSIX-style
        string form of `model_path`.
    """
    directory_missing = not model_path.exists()
    if directory_missing:
        model_path.mkdir()
    destination = model_path.as_posix()
    model.to_disk(destination)
 def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a frequencies file and compute smoothed log-probabilities.

    Every line must split on tabs into three fields: frequency, document
    frequency and a key. The key is parsed with `literal_eval` to obtain
    the word. The file is read twice: once to gather the counts used for
    smoothing, once to compute the per-word probabilities.

    freqs_path (Path): Frequencies file, opened through `check_unzip`.
    max_length (int): Entries whose key text is this long or longer are
        skipped.
    min_doc_freq (int): Minimum document frequency for an entry to be kept.
    min_freq (int): Minimum frequency for an entry to be kept.
    RETURNS (tuple): `(probs, oov_prob)`, where `probs` maps each kept word
        to its smoothed log-probability and `oov_prob` is the log-probability
        derived from a smoothed count of zero.

    Raises ValueError when a line does not have three tab-separated fields
    or a count is not an integer.
    """
    counts = PreshCounter()
    total = 0
    # First pass: feed the raw frequencies to the counter and sum them up.
    # The handle is closed as soon as the pass is finished.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: re-open the file and keep only entries passing the
    # frequency and length thresholds.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(
                    key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
 def read_clusters(clusters_path):
    """Load a word-to-cluster mapping from a whitespace-separated file.

    Each usable line holds three fields: cluster, word and frequency. Lines
    for which the split or `fix_text` raises ValueError are skipped. Words
    with a frequency below 3 are mapped to '0' instead of their cluster.
    Lower-, title- and upper-cased variants of every loaded word are then
    added, unless that variant is already present.

    clusters_path (Path): File to read.
    RETURNS (dict): Mapping of word to cluster string.
    """
    clusters = {}
    with clusters_path.open() as f:
        for line in f:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            clusters[word] = cluster if int(freq) >= 3 else '0'
    # Expand clusters with re-casing. Iterate over a snapshot, because the
    # dict grows while the variants are being added.
    for word, cluster in list(clusters.items()):
        for variant in (word.lower(), word.title(), word.upper()):
            clusters.setdefault(variant, cluster)
    return clusters
 def populate_vocab(vocab, clusters, probs, oov_prob):
    """Set `prob`, `is_oov` and `cluster` on the vocab entry of every word.

    Words are visited in the reverse of their ascending-probability order.

    vocab: Object indexable by word; the returned entry gets its attributes
        assigned.
    clusters (dict): Word to bit-string cluster mapping.
    probs (dict): Word to log-probability mapping.
    oov_prob: Not used by this function.
    """
    ascending = sorted(probs.items(), key=lambda pair: pair[1])
    for word, prob in ascending[::-1]:
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
 def check_unzip(file_path):
    """Open `file_path`, going through gzip when its path ends in 'gz'.

    file_path (Path): File to open.
    RETURNS: An open file object. The caller is responsible for closing it.

    NOTE(review): `gzip.open` is called with its default mode while the
    plain branch uses `Path.open`'s default mode, so the two branches may
    yield different line types -- confirm callers handle both.
    """
    posix_path = file_path.as_posix()
    if not posix_path.endswith('gz'):
        return file_path.open()
    return gzip.open(posix_path)
 def check_dirs(freqs_data, clusters_data, vectors_data):
    """Report missing input files through `util.sys_exit`.

    freqs_data (Path): Frequencies file; always checked.
    clusters_data (Path or None): Clusters file; checked only when truthy.
    vectors_data (Path or None): Vectors file; checked only when truthy.
    """
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    optional_inputs = (
        (clusters_data, "No Brown clusters file found"),
        (vectors_data, "No word vectors file found"),
    )
    for candidate, message in optional_inputs:
        if candidate and not candidate.is_file():
            util.sys_exit(candidate.as_posix(), title=message)
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -0,0 +1,45 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
 import plac
 from pathlib import Path
 import ujson
 import cProfile
 import pstats
 import spacy
 import sys
 import tqdm
 import cytoolz
 def read_inputs(loc):
    """Yield the 'text' field of every JSON line read from `loc`.

    loc: Path of the input file. When it is None, lines are taken from
        standard input and encoded as UTF-8 before being parsed.
    YIELDS: The value stored under 'text' in each parsed line.
    """
    if loc is not None:
        lines = Path(loc).open()
    else:
        lines = (raw.encode('utf8') for raw in sys.stdin)
    for line in lines:
        record = ujson.loads(line)
        yield record['text']
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
 )
 def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    nlp = spacy.load(lang) 
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
 def parse_texts(nlp, texts):
    """Run `texts` through `nlp.pipe` in batches of 128, discarding results.

    The texts are wrapped in a tqdm progress bar while being consumed.
    """
    processed = nlp.pipe(tqdm.tqdm(texts), batch_size=128)
    # Exhaust the iterator; the documents themselves are not needed.
    for _ in processed:
        pass
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -32,10 +32,12 @@ from ..compat import json_dumps
    resume=("Whether to resume training", "flag", "R", bool),
    no_tagger=("Don't train tagger", "flag", "T", bool),
    no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
+    no_entities=("Don't train NER", "flag", "N", bool),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
@ -86,13 +88,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                i += 20
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, max_length=0)
+                                               gold_preproc=gold_preproc, max_length=0)
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses,
-                               update_tensors=True)
+                               update_shared=True)
                    pbar.update(sum(len(doc) for doc in docs))
            with nlp.use_params(optimizer.averages):
@ -104,7 +106,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                scorer = nlp_loaded.evaluate(
                            corpus.dev_docs(
                                nlp_loaded,
-                                gold_preproc=False))
+                                gold_preproc=gold_preproc))
                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -60,7 +60,7 @@ GLOSSARY = {
    'JJR':          'adjective, comparative',
    'JJS':          'adjective, superlative',
    'LS':           'list item marker',
-    'MD':           'verb, modal auxillary',
+    'MD':           'verb, modal auxiliary',
    'NIL':          'missing tag',
    'NN':           'noun, singular or mass',
    'NNP':          'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
    'NFP':          'superfluous punctuation',
    'GW':           'additional word in multi-word expression',
    'XX':           'unknown',
-    'BES':          'auxillary "be"',
+    'BES':          'auxiliary "be"',
    'HVS':          'forms of "have"',
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -406,11 +406,11 @@ cdef class GoldParse:
        if tags is None:
            tags = [None for _ in doc]
        if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -232,7 +232,10 @@ for verb_data in [
    {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
    {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
    {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
    {ORTH: "have", NORM: "have"},
    {ORTH: "has", LEMMA: "have", NORM: "has"},
    {ORTH: "dare", NORM: "dare"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
--- a/spacy/language.py
+++ b/spacy/language.py
@ -200,6 +200,7 @@ class Language(object):
            else:
                flat_list.append(pipe)
        self.pipeline = flat_list
        self._optimizer = None
    @property
    def meta(self):
@ -244,7 +245,7 @@ class Language(object):
    def matcher(self):
        return self.get_component('matcher')
-    def get_component(self, name): 
+    def get_component(self, name):
        if self.pipeline in (True, None):
            return None
        for proc in self.pipeline:
@ -278,7 +279,7 @@ class Language(object):
        return self.tokenizer(text)
    def update(self, docs, golds, drop=0., sgd=None, losses=None,
-            update_tensors=False):
+            update_shared=False):
        """Update the models in the pipeline.
        docs (iterable): A batch of `Doc` objects.
@ -298,6 +299,10 @@ class Language(object):
                "Got: %d, %d" % (len(docs), len(golds)))
        if len(docs) == 0:
            return
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = Adam(Model.ops, 0.001)
            sgd = self._optimizer
        tok2vec = self.pipeline[0]
        feats = tok2vec.doc2feats(docs)
        grads = {}
@ -312,10 +317,11 @@ class Language(object):
                continue
            d_tokvecses = proc.update((docs, tokvecses), golds,
                                      drop=drop, sgd=get_grads, losses=losses)
-            if update_tensors and d_tokvecses is not None:
+            if update_shared and d_tokvecses is not None:
                for i, d_tv in enumerate(d_tokvecses):
                    all_d_tokvecses[i] += d_tv
-        bp_tokvecses(all_d_tokvecses, sgd=sgd)
+        if update_shared and bp_tokvecses is not None:
            bp_tokvecses(all_d_tokvecses, sgd=sgd)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
        # Clear the tensor variable, to free GPU memory.
@ -378,11 +384,11 @@ class Language(object):
        eps = util.env_opt('optimizer_eps', 1e-08)
        L2 = util.env_opt('L2_penalty', 1e-6)
        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
-        optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
+        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
-                         beta2=beta2, eps=eps)
+                              beta2=beta2, eps=eps)
-        optimizer.max_grad_norm = max_grad_norm
+        self._optimizer.max_grad_norm = max_grad_norm
-        optimizer.device = device
+        self._optimizer.device = device
-        return optimizer
+        return self._optimizer
    def evaluate(self, docs_golds):
        scorer = Scorer()
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
        doc.is_tagged = True
    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        docs, tokvecs = docs_tokvecs
        if self.model.nI is None:
@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
        d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
        if losses is not None:
            losses[self.name] += loss
        return d_tokvecs
    def get_loss(self, docs, golds, scores):
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
    def has_gold(self, GoldParse gold, start=0, end=None):
        end = end or len(gold.ner)
-        if all([tag == '-' for tag in gold.ner[start:end]]):
+        if all([tag in ('-', None) for tag in gold.ner[start:end]]):
            return False
        else:
            return True
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@ -14,4 +14,8 @@ cdef class Parser:
    cdef readonly TransitionSystem moves
    cdef readonly object cfg
    cdef void _parse_step(self, StateC* state,
            const float* feat_weights,
            int nr_class, int nr_feat, int nr_piece) nogil
    #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -257,10 +257,15 @@ cdef class Parser:
                        nI=token_vector_width)
        with Model.use_device('cpu'):
-            upper = chain(
+            if depth == 0:
-                clone(Maxout(hidden_width), (depth-1)),
+                upper = chain()
-                zero_init(Affine(nr_class, drop_factor=0.0))
+                upper.is_noop = True
-            )
+            else:
                upper = chain(
                    clone(Maxout(hidden_width), (depth-1)),
                    zero_init(Affine(nr_class, drop_factor=0.0))
                )
                upper.is_noop = False
        # TODO: This is an unfortunate hack atm!
        # Used to set input dimensions in network.
        lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@ -412,20 +417,27 @@ cdef class Parser:
        cdef np.ndarray scores
        c_token_ids = <int*>token_ids.data
        c_is_valid = <int*>is_valid.data
        cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
        while not next_step.empty():
-            for i in range(next_step.size()):
+            if not has_hidden:
-                st = next_step[i]
+                for i in cython.parallel.prange(
-                st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
+                        next_step.size(), num_threads=6, nogil=True):
-                self.moves.set_valid(&c_is_valid[i*nr_class], st)
+                    self._parse_step(next_step[i],
-            vectors = state2vec(token_ids[:next_step.size()])
+                        feat_weights, nr_class, nr_feat, nr_piece)
-            scores = vec2scores(vectors)
+            else:
-            c_scores = <float*>scores.data
+                for i in range(next_step.size()):
-            for i in range(next_step.size()):
+                    st = next_step[i]
-                st = next_step[i]
+                    st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
-                guess = arg_max_if_valid(
+                    self.moves.set_valid(&c_is_valid[i*nr_class], st)
-                    &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
+                vectors = state2vec(token_ids[:next_step.size()])
-                action = self.moves.c[guess]
+                scores = vec2scores(vectors)
-                action.do(st, action.label)
+                c_scores = <float*>scores.data
                for i in range(next_step.size()):
                    st = next_step[i]
                    guess = arg_max_if_valid(
                        &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
                    action = self.moves.c[guess]
                    action.do(st, action.label)
            this_step, next_step = next_step, this_step
            next_step.clear()
            for st in this_step:
@ -482,7 +494,31 @@ cdef class Parser:
            beams.append(beam)
        return beams
    # Apply one transition to a single `state` without holding the GIL.
    # The move is chosen from scores computed directly from `feat_weights`.
    cdef void _parse_step(self, StateC* state,
            const float* feat_weights,
            int nr_class, int nr_feat, int nr_piece) nogil:
        '''This only works with no hidden layers -- fast but inaccurate'''
        #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
        #    self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
        # Per-call scratch buffers, zero-filled by calloc and released at the
        # end of this function.
        # NOTE(review): the calloc results are used without a NULL check.
        token_ids = <int*>calloc(nr_feat, sizeof(int))
        scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
        is_valid = <int*>calloc(nr_class, sizeof(int))
        # Fill `token_ids` with the state's `nr_feat` context tokens.
        state.set_context_tokens(token_ids, nr_feat)
        # Accumulate into `scores` for this one state (batch size 1).
        # presumably this sums the weight rows selected by token_ids -- TODO
        # confirm against sum_state_features
        sum_state_features(scores,
            feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
        # Record which moves are valid for the state.
        self.moves.set_valid(is_valid, state)
        # presumably picks the best-scoring valid class across the
        # `nr_piece` pieces -- TODO confirm against arg_maxout_if_valid
        guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
        action = self.moves.c[guess]
        # Apply the chosen move to `state`.
        action.do(state, action.label)
        free(is_valid)
        free(scores)
        free(token_ids)
    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
        if not any(self.moves.has_gold(gold) for gold in golds):
            return None
        if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
            return self.update_beam(docs_tokvecs, golds,
                    self.cfg['beam_width'], self.cfg['beam_density'],
@ -555,6 +591,10 @@ cdef class Parser:
    def update_beam(self, docs_tokvecs, golds, width=None, density=None,
            drop=0., sgd=None, losses=None):
        if not any(self.moves.has_gold(gold) for gold in golds):
            return None
        if not golds:
            return None
        if width is None:
            width = self.cfg.get('beam_width', 2)
        if density is None:
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -303,8 +303,14 @@ cdef class Doc:
                return self.user_hooks['vector'](self)
            if self._vector is not None:
                return self._vector
-            elif self.has_vector and len(self):
+            elif not len(self):
-                self._vector = sum(t.vector for t in self) / len(self)
+                self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
                return self._vector
            elif self.has_vector:
                vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
                for token in self.c[:self.length]:
                    vector += self.vocab.get_vector(token.lex.orth)
                self._vector = vector / len(self)
                return self._vector
            elif self.tensor is not None:
                self._vector = self.tensor.mean(axis=0)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
 import numpy
 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t
@ -244,7 +245,7 @@ cdef class Vocab:
    @property
    def vectors_length(self):
-        return len(self.vectors)
+        return self.vectors.data.shape[1]
    def clear_vectors(self, new_dim=None):
        """Drop the current vector table. Because all vectors must be the same
@ -268,7 +269,10 @@ cdef class Vocab:
        """
        if isinstance(orth, basestring_):
            orth = self.strings.add(orth)
-        return self.vectors[orth]
+        if orth in self.vectors.key2row:
            return self.vectors[orth]
        else:
            return numpy.zeros((self.vectors_length,), dtype='f')
    def set_vector(self, orth, vector):
        """Set a vector for a word in the vocabulary.
--- a/website/docs/api/_annotation/_pos-tags.jade
+++ b/website/docs/api/_annotation/_pos-tags.jade
@ -21,7 +21,7 @@ p
    +pos-row("$", "SYM", "SymType=currency", "symbol, currency")
    +pos-row("ADD", "X", "", "email")
    +pos-row("AFX", "ADJ", "Hyph=yes", "affix")
-    +pos-row("BES", "VERB", "", 'auxillary "be"')
+    +pos-row("BES", "VERB", "", 'auxiliary "be"')
    +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
    +pos-row("CD", "NUM", "NumType=card", "cardinal number")
    +pos-row("DT", "DET", "determiner")
@ -35,7 +35,7 @@ p
    +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
    +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
    +pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
-    +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary")
+    +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
    +pos-row("NFP", "PUNCT", "", "superfluous punctuation")
    +pos-row("NIL", "", "", "missing tag")
    +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
--- a/website/docs/api/span.jade
+++ b/website/docs/api/span.jade
@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
 p
    |  The token within the span that's highest in the parse tree. If there's a
-    |  tie, the earlist is prefered.
+    |  tie, the earliest is preferred.
 +aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@ -39,7 +39,7 @@ p
 +h(2, "special-cases") Adding special case tokenization rules
 p
-    |  Most domains have at least some idiosyncracies that require custom
+    |  Most domains have at least some idiosyncrasies that require custom
    |  tokenization rules. This could be very certain expressions, or
    |  abbreviations only used in this specific field.
--- a/website/docs/usage/index.jade
+++ b/website/docs/usage/index.jade
@ -109,7 +109,7 @@ p
    |  The other way to install spaCy is to clone its
    |  #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
    |  the common way if you want to make changes to the code base. You'll need to
-    |  make sure that you have a development enviroment consisting of a Python
+    |  make sure that you have a development environment consisting of a Python
    |  distribution including header files, a compiler,
    |  #[+a("https://pip.pypa.io/en/latest/installing/") pip],
    |  #[+a("https://virtualenv.pypa.io/") virtualenv] and
--- a/website/docs/usage/models.jade
+++ b/website/docs/usage/models.jade
@ -190,10 +190,10 @@ p
 +code("Examples", "bash").
    # set up shortcut link to load installed package as "en_default"
-    python -m spacy link en_core_web_md en_default
+    spacy link en_core_web_md en_default
    # set up shortcut link to load local model as "my_amazing_model"
-    python -m spacy link /Users/you/model my_amazing_model
+    spacy link /Users/you/model my_amazing_model
 +infobox("Important note")
    |  In order to create a symlink, your user needs the #[strong required permissions].
--- a/website/docs/usage/pos-tagging.jade
+++ b/website/docs/usage/pos-tagging.jade
@ -40,7 +40,7 @@ p
        +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
    +row
-        +cell I read the paper yesteday
+        +cell I read the paper yesterday
        +cell read
        +cell read
        +cell verb
--- a/website/docs/usage/production-use.jade
+++ b/website/docs/usage/production-use.jade
@ -94,7 +94,7 @@ p
    |  is mostly intended as a convenient, interactive wrapper. It performs
    |  compatibility checks and prints detailed error messages and warnings.
    |  However, if you're downloading models as part of an automated build
-    |  process, this only adds an unecessary layer of complexity. If you know
+    |  process, this only adds an unnecessary layer of complexity. If you know
    |  which models your application needs, you should be specifying them directly.
 p