mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
9c580ad28a
|
@ -7,6 +7,7 @@ if __name__ == '__main__':
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
from spacy.cli import download, link, info, package, train, convert, model
|
from spacy.cli import download, link, info, package, train, convert, model
|
||||||
|
from spacy.cli import profile
|
||||||
from spacy.util import prints
|
from spacy.util import prints
|
||||||
|
|
||||||
commands = {
|
commands = {
|
||||||
|
@ -16,7 +17,8 @@ if __name__ == '__main__':
|
||||||
'train': train,
|
'train': train,
|
||||||
'convert': convert,
|
'convert': convert,
|
||||||
'package': package,
|
'package': package,
|
||||||
'model': model
|
'model': model,
|
||||||
|
'profile': profile,
|
||||||
}
|
}
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
prints(', '.join(commands), title="Available commands", exits=1)
|
prints(', '.join(commands), title="Available commands", exits=1)
|
||||||
|
|
13
spacy/_ml.py
13
spacy/_ml.py
|
@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
|
||||||
return layer.begin_update(X, drop=drop)
|
return layer.begin_update(X, drop=drop)
|
||||||
else:
|
else:
|
||||||
return X, lambda dX, sgd=None: dX
|
return X, lambda dX, sgd=None: dX
|
||||||
return wrap(drop_layer_fwd, layer)
|
|
||||||
|
model = wrap(drop_layer_fwd, layer)
|
||||||
|
model.predict = layer
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, preprocess=None):
|
def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
|
@ -382,10 +385,18 @@ def fine_tune(embedding, combine=None):
|
||||||
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
||||||
return [d_o * model.mix[0] for d_o in d_output]
|
return [d_o * model.mix[0] for d_o in d_output]
|
||||||
return output, fine_tune_bwd
|
return output, fine_tune_bwd
|
||||||
|
|
||||||
|
def fine_tune_predict(docs_tokvecs):
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
vecs = embedding(docs)
|
||||||
|
return [model.mix[0]*tv+model.mix[1]*v
|
||||||
|
for tv, v in zip(tokvecs, vecs)]
|
||||||
|
|
||||||
model = wrap(fine_tune_fwd, embedding)
|
model = wrap(fine_tune_fwd, embedding)
|
||||||
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
||||||
model.mix.fill(0.5)
|
model.mix.fill(0.5)
|
||||||
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
||||||
|
model.predict = fine_tune_predict
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ from .download import download
|
||||||
from .info import info
|
from .info import info
|
||||||
from .link import link
|
from .link import link
|
||||||
from .package import package
|
from .package import package
|
||||||
|
from .profile import profile
|
||||||
from .train import train
|
from .train import train
|
||||||
from .convert import convert
|
from .convert import convert
|
||||||
from .model import model
|
from .model import model
|
||||||
|
|
45
spacy/cli/profile.py
Normal file
45
spacy/cli/profile.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
|
import plac
|
||||||
|
from pathlib import Path
|
||||||
|
import ujson
|
||||||
|
import cProfile
|
||||||
|
import pstats
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
import sys
|
||||||
|
import tqdm
|
||||||
|
import cytoolz
|
||||||
|
|
||||||
|
|
||||||
|
def read_inputs(loc):
    """Yield the 'text' field of every JSON line read from an input source.

    loc: Path of a file holding one JSON object per line, or None to read
        the lines from standard input instead.
    YIELDS: The value stored under the 'text' key of each parsed line.
    """
    if loc is None:
        # No location given: consume stdin, encoding each line to UTF-8
        # before handing it to the JSON parser.
        for line in sys.stdin:
            yield ujson.loads(line.encode('utf8'))['text']
        return
    # Open the file inside a context manager so the handle is released once
    # the generator is exhausted or closed, instead of being left open.
    with Path(loc).open() as file_:
        for line in file_:
            yield ujson.loads(line)['text']
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    If no input file is given, the texts are read from standard input.
    """
    # `cmd` is not used in this function body.
    nlp = spacy.load(lang)
    if inputs is None:
        # The plac converter only runs on a supplied argument, so the default
        # None reaches this point unconverted. Fall back to the stdin branch
        # of read_inputs rather than failing on a non-iterable below.
        inputs = read_inputs(None)
    # Cap the workload at the first 10000 texts of the input stream.
    texts = list(cytoolz.take(10000, inputs))
    # Run the pipeline under cProfile, writing raw stats to "Profile.prof"
    # in the current working directory.
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    # Load the stats back and print them sorted by internal time.
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_texts(nlp, texts):
    """Run every text through `nlp.pipe` and discard the results.

    nlp: Object exposing a `pipe(texts, batch_size=...)` method.
    texts: Iterable of texts to process; wrapped in a tqdm progress bar.
    """
    pipe = nlp.pipe
    with_progress = tqdm.tqdm(texts)
    stream = pipe(with_progress, batch_size=128)
    # Only the processing work matters here, so each result is dropped.
    for _ in stream:
        continue
|
|
@ -303,8 +303,14 @@ cdef class Doc:
|
||||||
return self.user_hooks['vector'](self)
|
return self.user_hooks['vector'](self)
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.has_vector and len(self):
|
elif not len(self):
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
return self._vector
|
||||||
|
elif self.has_vector:
|
||||||
|
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
for token in self.c[:self.length]:
|
||||||
|
vector += self.vocab.get_vector(token.lex.orth)
|
||||||
|
self._vector = vector / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.tensor is not None:
|
elif self.tensor is not None:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import bz2
|
import bz2
|
||||||
import ujson
|
import ujson
|
||||||
import re
|
import re
|
||||||
|
import numpy
|
||||||
|
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
@ -244,7 +245,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vectors_length(self):
|
def vectors_length(self):
|
||||||
return len(self.vectors)
|
return self.vectors.data.shape[1]
|
||||||
|
|
||||||
def clear_vectors(self, new_dim=None):
|
def clear_vectors(self, new_dim=None):
|
||||||
"""Drop the current vector table. Because all vectors must be the same
|
"""Drop the current vector table. Because all vectors must be the same
|
||||||
|
@ -268,7 +269,10 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, basestring_):
|
if isinstance(orth, basestring_):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
return self.vectors[orth]
|
if orth in self.vectors.key2row:
|
||||||
|
return self.vectors[orth]
|
||||||
|
else:
|
||||||
|
return numpy.zeros((self.vectors_length,), dtype='f')
|
||||||
|
|
||||||
def set_vector(self, orth, vector):
|
def set_vector(self, orth, vector):
|
||||||
"""Set a vector for a word in the vocabulary.
|
"""Set a vector for a word in the vocabulary.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user