Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Author: Matthew Honnibal
Date:   2017-08-22 17:02:04 -05:00
Commit: 9c580ad28a

6 changed files with 75 additions and 6 deletions

spacy/__main__.py

@@ -7,6 +7,7 @@ if __name__ == '__main__':
     import plac
     import sys
     from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints

     commands = {
@@ -16,7 +17,8 @@ if __name__ == '__main__':
         'train': train,
         'convert': convert,
         'package': package,
-        'model': model
+        'model': model,
+        'profile': profile,
     }
     if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)
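The entry point maps each subcommand name to a CLI function and hands the remaining arguments to plac. A minimal standalone sketch of that dispatch pattern (the 'greet' command and the exact argv handling are illustrative, not spaCy's actual code):

import sys
import plac


def greet(name, excited=False):
    # Toy stand-in for a spacy.cli command such as profile.
    print('Hello, %s%s' % (name, '!' if excited else '.'))


commands = {'greet': greet}

if __name__ == '__main__':
    if len(sys.argv) == 1:
        sys.exit('Available commands: %s' % ', '.join(commands))
    command = sys.argv.pop(1)                    # e.g. `python main.py greet Ada`
    plac.call(commands[command], sys.argv[1:])   # plac maps argv onto parameters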

spacy/_ml.py

@@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
             return layer.begin_update(X, drop=drop)
         else:
             return X, lambda dX, sgd=None: dX
-    return wrap(drop_layer_fwd, layer)
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model


 def Tok2Vec(width, embed_size, preprocess=None):
@@ -382,10 +385,18 @@ def fine_tune(embedding, combine=None):
             sgd(model._mem.weights, model._mem.gradient, key=model.id)
             return [d_o * model.mix[0] for d_o in d_output]
         return output, fine_tune_bwd

+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
     model = wrap(fine_tune_fwd, embedding)
     model.mix = model._mem.add((model.id, 'mix'), (2,))
     model.mix.fill(0.5)
     model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
     return model
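Both hunks give the wrapped models an explicit predict path: drop_layer predicts with the underlying layer directly, so inference skips the stochastic layer-dropping, and fine_tune mixes the incoming token vectors with the embedding's own output using the two learned 'mix' scalars. A numpy sketch of that mixing, with illustrative shapes and values:

import numpy

mix = numpy.array([0.5, 0.5], dtype='f')        # learned scalars, start at 0.5
tokvecs = [numpy.ones((4, 8), dtype='f')]       # per-doc tensors coming in
vecs = [numpy.full((4, 8), 3., dtype='f')]      # what the embedding returns

mixed = [mix[0] * tv + mix[1] * v for tv, v in zip(tokvecs, vecs)]
assert numpy.allclose(mixed[0], 2.0)            # 0.5*1.0 + 0.5*3.0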

spacy/cli/__init__.py

@@ -2,6 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
 from .model import model

spacy/cli/profile.py (new file, 45 lines)

@@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+from pathlib import Path
+import ujson
+import cProfile
+import pstats
+
+import spacy
+import sys
+import tqdm
+import cytoolz
+
+
+def read_inputs(loc):
+    if loc is None:
+        file_ = sys.stdin
+        file_ = (line.encode('utf8') for line in file_)
+    else:
+        file_ = Path(loc).open()
+    for line in file_:
+        data = ujson.loads(line)
+        text = data['text']
+        yield text
+
+
+@plac.annotations(
+    lang=("model/language", "positional", None, str),
+    inputs=("Location of input file", "positional", None, read_inputs)
+)
+def profile(cmd, lang, inputs=None):
+    """
+    Profile a spaCy pipeline, to find out which functions take the most time.
+    """
+    nlp = spacy.load(lang)
+    texts = list(cytoolz.take(10000, inputs))
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats()
+
+
+def parse_texts(nlp, texts):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+        pass
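A hedged usage sketch for the new command, driving profile() directly from Python rather than through the CLI (assumes an 'en' model is installed; the texts stand in for the JSONL records read_inputs would normally yield):

from spacy.cli import profile

texts = iter(['This is a sentence.'] * 1000)
profile('profile', 'en', inputs=texts)   # writes and prints Profile.prof stats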

spacy/tokens/doc.pyx

@@ -303,8 +303,14 @@ cdef class Doc:
                 return self.user_hooks['vector'](self)
             if self._vector is not None:
                 return self._vector
-            elif self.has_vector and len(self):
-                self._vector = sum(t.vector for t in self) / len(self)
+            elif not len(self):
+                self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                return self._vector
+            elif self.has_vector:
+                vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                for token in self.c[:self.length]:
+                    vector += self.vocab.get_vector(token.lex.orth)
+                self._vector = vector / len(self)
                 return self._vector
             elif self.tensor is not None:
                 self._vector = self.tensor.mean(axis=0)
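The rewritten branch averages lexeme vectors fetched from the vocab instead of summing per-token .vector attributes, and the new empty-doc branch returns a zero vector before the division, so len(self) can no longer be zero there. A pure-numpy sketch of the averaging (the 3-dimensional vectors are illustrative):

import numpy

lexeme_vectors = [numpy.array([1., 0., 0.], dtype='f'),
                  numpy.array([0., 1., 0.], dtype='f')]

vector = numpy.zeros((3,), dtype='f')
for v in lexeme_vectors:
    vector += v
doc_vector = vector / len(lexeme_vectors)
assert numpy.allclose(doc_vector, [0.5, 0.5, 0.])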

spacy/vocab.pyx

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
+import numpy

 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t
@@ -244,7 +245,7 @@ cdef class Vocab:
     @property
     def vectors_length(self):
-        return len(self.vectors)
+        return self.vectors.data.shape[1]

     def clear_vectors(self, new_dim=None):
         """Drop the current vector table. Because all vectors must be the same
@@ -268,7 +269,10 @@
         """
         if isinstance(orth, basestring_):
             orth = self.strings.add(orth)
-        return self.vectors[orth]
+        if orth in self.vectors.key2row:
+            return self.vectors[orth]
+        else:
+            return numpy.zeros((self.vectors_length,), dtype='f')

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary.