Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-08-22 17:02:04 -05:00
commit 9c580ad28a
6 changed files with 75 additions and 6 deletions

View File

@ -7,6 +7,7 @@ if __name__ == '__main__':
import plac import plac
import sys import sys
from spacy.cli import download, link, info, package, train, convert, model from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.util import prints from spacy.util import prints
commands = { commands = {
@ -16,7 +17,8 @@ if __name__ == '__main__':
'train': train, 'train': train,
'convert': convert, 'convert': convert,
'package': package, 'package': package,
'model': model 'model': model,
'profile': profile,
} }
if len(sys.argv) == 1: if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1) prints(', '.join(commands), title="Available commands", exits=1)

View File

@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
return layer.begin_update(X, drop=drop) return layer.begin_update(X, drop=drop)
else: else:
return X, lambda dX, sgd=None: dX return X, lambda dX, sgd=None: dX
return wrap(drop_layer_fwd, layer)
model = wrap(drop_layer_fwd, layer)
model.predict = layer
return model
def Tok2Vec(width, embed_size, preprocess=None): def Tok2Vec(width, embed_size, preprocess=None):
@ -382,10 +385,18 @@ def fine_tune(embedding, combine=None):
sgd(model._mem.weights, model._mem.gradient, key=model.id) sgd(model._mem.weights, model._mem.gradient, key=model.id)
return [d_o * model.mix[0] for d_o in d_output] return [d_o * model.mix[0] for d_o in d_output]
return output, fine_tune_bwd return output, fine_tune_bwd
def fine_tune_predict(docs_tokvecs):
    """Blend the incoming token vectors with freshly embedded ones.

    docs_tokvecs: A pair (docs, tokvecs).
    Returns: A list with one entry per (tokvecs item, embedding output item)
        pair, each being the sum of the two items weighted by model.mix[0]
        and model.mix[1] respectively.
    """
    docs, tokvecs = docs_tokvecs
    embedded = embedding(docs)
    blended = []
    for tokvec, vec in zip(tokvecs, embedded):
        blended.append(model.mix[0] * tokvec + model.mix[1] * vec)
    return blended
model = wrap(fine_tune_fwd, embedding) model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,)) model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(0.5) model.mix.fill(0.5)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
model.predict = fine_tune_predict
return model return model

View File

@ -2,6 +2,7 @@ from .download import download
from .info import info from .info import info
from .link import link from .link import link
from .package import package from .package import package
from .profile import profile
from .train import train from .train import train
from .convert import convert from .convert import convert
from .model import model from .model import model

45
spacy/cli/profile.py Normal file
View File

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
def read_inputs(loc):
    """Yield the 'text' field of each JSON record in a line-delimited input.

    loc: Path of the input file, or None to read the records from stdin.
    Yields: The value stored under the 'text' key of each parsed line.

    This is a generator, so nothing is opened or read until it is iterated.
    """
    opened = None
    if loc is None:
        # NOTE(review): stdin lines are encoded to UTF-8 before parsing, as
        # in the original; confirm this is right when stdin yields bytes.
        lines = (line.encode('utf8') for line in sys.stdin)
    else:
        # Decode explicitly as UTF-8 rather than the platform default.
        opened = Path(loc).open(encoding='utf8')
        lines = opened
    try:
        for line in lines:
            # Tolerate blank lines (e.g. a trailing newline at the end of
            # the file); they are not valid JSON documents.
            if not line.strip():
                continue
            yield ujson.loads(line)['text']
    finally:
        # Release the file handle even if the consumer stops early or a
        # line fails to parse.
        if opened is not None:
            opened.close()
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    At most 10000 input texts are processed. Raw profiler data is written to
    "Profile.prof" and the statistics are printed sorted by time.
    """
    if inputs is None:
        # The read_inputs converter is only applied to an argument given on
        # the command line, so an omitted argument arrives here as None.
        # Fall back to reading the records from stdin.
        inputs = read_inputs(None)
    nlp = spacy.load(lang)
    # Materialise the texts up front so reading and JSON parsing are not
    # included in the profiled call.
    texts = list(cytoolz.take(10000, inputs))
    # runctx evaluates the statement string, so `nlp` and `texts` must be
    # reachable through locals() and `parse_texts` through globals().
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
    """Run every text through nlp.pipe in batches of 128, discarding the docs.

    The texts are wrapped in a tqdm progress bar. The results are consumed
    and thrown away; the function exists only to drive the pipeline.
    """
    with_progress = tqdm.tqdm(texts)
    docs = nlp.pipe(with_progress, batch_size=128)
    for _ in docs:
        pass

View File

@ -303,8 +303,14 @@ cdef class Doc:
return self.user_hooks['vector'](self) return self.user_hooks['vector'](self)
if self._vector is not None: if self._vector is not None:
return self._vector return self._vector
elif self.has_vector and len(self): elif not len(self):
self._vector = sum(t.vector for t in self) / len(self) self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector return self._vector
elif self.tensor is not None: elif self.tensor is not None:
self._vector = self.tensor.mean(axis=0) self._vector = self.tensor.mean(axis=0)

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import bz2 import bz2
import ujson import ujson
import re import re
import numpy
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
@ -244,7 +245,7 @@ cdef class Vocab:
@property @property
def vectors_length(self): def vectors_length(self):
return len(self.vectors) return self.vectors.data.shape[1]
def clear_vectors(self, new_dim=None): def clear_vectors(self, new_dim=None):
"""Drop the current vector table. Because all vectors must be the same """Drop the current vector table. Because all vectors must be the same
@ -268,7 +269,10 @@ cdef class Vocab:
""" """
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
return self.vectors[orth] if orth in self.vectors.key2row:
return self.vectors[orth]
else:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector): def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. """Set a vector for a word in the vocabulary.