Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit 9c580ad28a
@@ -7,6 +7,7 @@ if __name__ == '__main__':
     import plac
     import sys
     from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints

     commands = {
@@ -16,7 +17,8 @@ if __name__ == '__main__':
         'train': train,
         'convert': convert,
         'package': package,
-        'model': model
+        'model': model,
+        'profile': profile,
     }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)

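These hunks register the new profile command in spaCy's command-line entry point: the import is added next to the other CLI commands and the commands dict gains a 'profile' entry (plus the previously missing trailing comma after 'model'). As a rough illustration of how a plac-based dispatcher resolves the first argument against that dict; the rest of the file is not part of this diff, so the helper below is a sketch of the usual pattern, not a copy of the actual code:

import sys
import plac

# Illustrative sketch only: the typical plac dispatch for a commands dict.
def dispatch(commands):
    if len(sys.argv) == 1:
        print("Available commands: %s" % ', '.join(commands))
        sys.exit(1)
    name = sys.argv.pop(1)              # e.g. 'profile'
    sys.argv[0] = 'spacy %s' % name     # so plac prints a sensible usage string
    plac.call(commands[name], sys.argv[1:])
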
spacy/_ml.py (13 changed lines)
@@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
             return layer.begin_update(X, drop=drop)
         else:
             return X, lambda dX, sgd=None: dX
-    return wrap(drop_layer_fwd, layer)
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model


 def Tok2Vec(width, embed_size, preprocess=None):

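In drop_layer, the wrapped model is now bound to a name so its predict attribute can be overridden: model.predict = layer routes prediction straight to the wrapped layer instead of through the dropout wrapper. A conceptual sketch of this wrap-and-override pattern with plain Python callables, not thinc's Model/wrap API:

import random

# Sketch only: training may stochastically skip the wrapped layer,
# while the predict path always runs it.
def drop_layer_sketch(layer):
    def forward(X, drop=0.0):
        if drop > 0.0 and random.random() < drop:
            return X           # layer is "dropped": identity pass-through
        return layer(X)        # otherwise run the wrapped layer

    forward.predict = layer    # mirrors model.predict = layer in the diff above
    return forward
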
@@ -382,10 +385,18 @@ def fine_tune(embedding, combine=None):
                 sgd(model._mem.weights, model._mem.gradient, key=model.id)
             return [d_o * model.mix[0] for d_o in d_output]
         return output, fine_tune_bwd
+
+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
     model = wrap(fine_tune_fwd, embedding)
     model.mix = model._mem.add((model.id, 'mix'), (2,))
     model.mix.fill(0.5)
     model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
     return model

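fine_tune keeps a two-element mix parameter, initialised to 0.5, and the new fine_tune_predict combines the incoming token vectors with freshly embedded vectors as mix[0]*tokvecs + mix[1]*vecs per document. A minimal numpy illustration of that combination; shapes and values are invented for the example:

import numpy

mix = numpy.array([0.5, 0.5], dtype='f')      # model.mix after fill(0.5)
tokvecs = numpy.ones((3, 4), dtype='f')       # incoming token vectors for one doc
vecs = 2 * numpy.ones((3, 4), dtype='f')      # embedding output for the same doc

combined = mix[0] * tokvecs + mix[1] * vecs   # what fine_tune_predict returns per doc
print(combined)                               # every entry is 0.5*1 + 0.5*2 = 1.5
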
@@ -2,6 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
 from .model import model

spacy/cli/profile.py (new file, 45 lines)
@@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import cProfile
import pstats

import spacy
import sys
import tqdm
import cytoolz


def read_inputs(loc):
    if loc is None:
        file_ = sys.stdin
        file_ = (line.encode('utf8') for line in file_)
    else:
        file_ = Path(loc).open()
    for line in file_:
        data = ujson.loads(line)
        text = data['text']
        yield text


@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


def parse_texts(nlp, texts):
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
        pass
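The new profile command reads JSON lines with a 'text' field (that is what read_inputs parses), loads the given model, takes up to 10,000 texts and profiles nlp.pipe over them with cProfile, printing the stats sorted by time. A hedged usage sketch; the file name and example texts below are made up, and the shell line assumes the dispatch registered in the first hunks:

import json

# Build an input file in the format read_inputs expects: one JSON object
# per line with a 'text' field. 'inputs.jsonl' is an arbitrary example name.
texts = ["This is a sentence.", "Another document to profile."]
with open('inputs.jsonl', 'w') as f:
    for text in texts:
        f.write(json.dumps({'text': text}) + '\n')

# The command would then be invoked roughly as:
#   python -m spacy profile en inputs.jsonl
# and the timing breakdown is printed from Profile.prof via pstats.
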
@@ -303,8 +303,14 @@ cdef class Doc:
                 return self.user_hooks['vector'](self)
             if self._vector is not None:
                 return self._vector
-            elif self.has_vector and len(self):
-                self._vector = sum(t.vector for t in self) / len(self)
+            elif not len(self):
+                self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                return self._vector
+            elif self.has_vector:
+                vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                for token in self.c[:self.length]:
+                    vector += self.vocab.get_vector(token.lex.orth)
+                self._vector = vector / len(self)
                 return self._vector
             elif self.tensor is not None:
                 self._vector = self.tensor.mean(axis=0)

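Doc.vector now returns a zero vector for empty docs and otherwise averages rows looked up through vocab.get_vector, instead of summing per-token .vector properties. A toy numpy sketch of that averaging fallback, with an invented lookup table standing in for the vocab's vector data:

import numpy

vectors_length = 4
table = {
    'hello': numpy.array([1., 0., 0., 0.], dtype='f'),
    'world': numpy.array([0., 1., 0., 0.], dtype='f'),
}
tokens = ['hello', 'world', 'unknown']

vector = numpy.zeros((vectors_length,), dtype='f')
for tok in tokens:
    # unknown keys contribute zeros, mirroring Vocab.get_vector below
    vector += table.get(tok, numpy.zeros((vectors_length,), dtype='f'))
doc_vector = vector / len(tokens)
print(doc_vector)
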
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
+import numpy

 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t

@@ -244,7 +245,7 @@ cdef class Vocab:

     @property
     def vectors_length(self):
-        return len(self.vectors)
+        return self.vectors.data.shape[1]

     def clear_vectors(self, new_dim=None):
         """Drop the current vector table. Because all vectors must be the same
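vectors_length previously returned len(self.vectors), roughly the number of entries in the vector table, whereas its callers (Doc.vector above, get_vector below) need the dimensionality of each vector; it now reads the second axis of the underlying data array. A toy numpy illustration of the difference:

import numpy

# Toy vector table: 500 entries, each a 300-dimensional vector.
data = numpy.zeros((500, 300), dtype='f')

print(len(data))       # 500: number of entries (a len()-style count)
print(data.shape[1])   # 300: per-vector dimensionality (what vectors_length now returns)
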
@@ -268,7 +269,10 @@ cdef class Vocab:
         """
         if isinstance(orth, basestring_):
             orth = self.strings.add(orth)
+        if orth in self.vectors.key2row:
             return self.vectors[orth]
+        else:
+            return numpy.zeros((self.vectors_length,), dtype='f')

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary.
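With this change, get_vector returns an all-zero vector of length vectors_length for words that have no row in the vector table, instead of failing the lookup. A hedged usage sketch; the model name and words are placeholders, and it assumes a model with word vectors is installed:

import spacy

nlp = spacy.load('en_core_web_md')                 # any model that ships vectors
known = nlp.vocab.get_vector('apple')              # row from the vector table
unknown = nlp.vocab.get_vector('qzxv_not_a_word')  # no row: all zeros
assert unknown.shape == (nlp.vocab.vectors_length,)
assert not unknown.any()
print(known.shape)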