# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
# coding: utf-8
from __future__ import unicode_literals, print_function

from collections import OrderedDict
import numpy
cimport cython.parallel
import numpy.random
cimport numpy as np
from libc.math cimport exp
from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps, NumpyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
cimport blis.cy

from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from ..errors import Errors, TempErrors
from .. import util
from .stateclass cimport StateClass
from .transition_system cimport Transition
from . import _beam_utils
from . import nonproj

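# Helpers to expose the Python-level model to the nogil prediction loop:
# get_c_weights() and get_c_sizes() copy raw pointers and dimensions out of
# the model into plain C structs (WeightsC, SizesC), so no Python objects
# need to be touched while the GIL is released.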
cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    output.feat_weights = state2vec.get_feat_weights()
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
        output.hidden_weights = NULL
        output.hidden_bias = NULL
    else:
        vec2scores_W = model.vec2scores.W
        vec2scores_b = model.vec2scores.b
        output.hidden_weights = <const float*>vec2scores_W.data
        output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray class_mask = model._class_mask
    output.seen_classes = <const float*>class_mask.data
    return output


cdef SizesC get_c_sizes(model, int batch_size) except *:
    cdef SizesC output
    output.states = batch_size
    if model.vec2scores is None:
        output.classes = model.state2vec.nO
    else:
        output.classes = model.vec2scores.nO
    output.hiddens = model.state2vec.nO
    output.pieces = model.state2vec.nP
    output.feats = model.state2vec.nF
    output.embed_width = model.tokvecs.shape[1]
    return output


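# ActivationsC is a bundle of C scratch buffers for the prediction loop.
# alloc_activations/resize_activations/free_activations manage its memory
# manually with calloc/realloc/free, since it is used under nogil.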
cdef ActivationsC alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    resize_activations(&A, n)
    return A


cdef void free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.scores)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


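# Grow-only resize: buffers are only reallocated when the batch contains
# more states than any previous batch; smaller batches reuse the old memory.
# Note that realloc'd memory isn't zeroed, so callers are expected to memset
# the regions they accumulate into (see predict_states below).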
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.scores = <float*>realloc(A.scores,
            n.states * n.classes * sizeof(A.scores[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


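# Forward pass for a batch of states, entirely in C:
#   1. Ask each state for its context token ids.
#   2. Sum the precomputed feature rows for those tokens, plus the bias.
#   3. Maxout-pool over the n.pieces candidate units per hidden node.
#   4. Either copy the hidden layer straight through to the scores (when
#      there is no upper layer), or multiply by the output weights with a
#      BLIS gemm and add the output bias.
#   5. Overwrite the scores of unseen classes with the batch minimum.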
cdef void predict_states(ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil:
    cdef double one = 1.0
    resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
    sum_state_features(A.unmaxed,
        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
    for i in range(n.states):
        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
            W.feat_bias, 1., n.hiddens * n.pieces)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    memset(A.scores, 0, n.states * n.classes * sizeof(float))
    if W.hidden_weights == NULL:
        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
            n.states, n.classes, n.hiddens, one,
            <float*>A.hiddens, n.hiddens, 1,
            <float*>W.hidden_weights, n.hiddens, 1,
            one,
            <float*>A.scores, n.classes, 1)
        # Add bias
        for i in range(n.states):
            VecVec.add_i(&A.scores[i*n.classes],
                W.hidden_bias, 1., n.classes)
    # Set unseen classes to minimum value
    i = 0
    min_ = A.scores[0]
    for i in range(1, n.states * n.classes):
        if A.scores[i] < min_:
            min_ = A.scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if not W.seen_classes[j]:
                A.scores[i*n.classes+j] = min_


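# Accumulate output[b] += cached[token_ids[b, f], f] over each feature f,
# using BLIS's axpyv for the vector additions. Negative token ids select the
# padding row that the precomputed cache stores before its first real row.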
cdef void sum_state_features(float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    padding = cached
    cached += F * O
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
                <float*>feature, 1,
                &output[b*O], 1)
        token_ids += F


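# The gradient is P(i) - Q(i) for minimal-cost (gold) classes and P(i)
# otherwise, where P is the softmax over all classes and Q is the softmax
# restricted to the gold classes. Both are computed with the usual
# max-subtraction trick for numerical stability.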
cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores,
        int O) nogil:
    """Do multi-label log loss"""
    cdef double max_, gmax, Z, gZ
    best = arg_max_if_gold(scores, costs, is_valid, O)
    guess = Vec.arg_max(scores, O)
    if best == -1 or guess == -1:
        # These shouldn't happen, but if they do, we want to make sure we
        # don't cause an OOB access.
        return
    Z = 1e-10
    gZ = 1e-10
    max_ = scores[guess]
    gmax = scores[best]
    for i in range(O):
        Z += exp(scores[i] - max_)
        if costs[i] <= costs[best]:
            gZ += exp(scores[i] - gmax)
    for i in range(O):
        if costs[i] <= costs[best]:
            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
        else:
            d_scores[i] = exp(scores[i]-max_) / Z


cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
        const int* is_valid, int n) nogil:
    # Find minimum cost
    cdef float cost = 1
    for i in range(n):
        if is_valid[i] and costs[i] < cost:
            cost = costs[i]
    # Now find best-scoring with that cost
    cdef int best = -1
    for i in range(n):
        if costs[i] <= cost and is_valid[i]:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


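# ParserModel wraps the parser's layers: [tok2vec, lower, upper], with the
# upper (output) layer optional. unseen_classes tracks output classes that
# haven't appeared in the training data yet, so their scores can be masked.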
class ParserModel(Model):
    def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
        Model.__init__(self)
        self._layers = [tok2vec, lower_model]
        if upper_model is not None:
            self._layers.append(upper_model)
        self.unseen_classes = set()
        if unseen_classes:
            for class_ in unseen_classes:
                self.unseen_classes.add(class_)

    def begin_update(self, docs, drop=0.):
        step_model = ParserStepModel(docs, self._layers, drop=drop,
                                     unseen_classes=self.unseen_classes)

        def finish_parser_update(golds, sgd=None):
            step_model.make_updates(sgd)
            return None

        return step_model, finish_parser_update

    def resize_output(self, new_output):
        if len(self._layers) == 2:
            return
        if new_output == self.upper.nO:
            return
        smaller = self.upper

        with Model.use_device('cpu'):
            larger = Affine(new_output, smaller.nI)
        larger.W.fill(0.0)
        larger.b.fill(0.0)
        # Passing smaller.W directly here seems to segfault, possibly a
        # descriptor protocol issue, so fetch the arrays into locals first.
        smaller_W = smaller.W
        larger_W = larger.W
        smaller_b = smaller.b
        larger_b = larger.b
        # Weights are stored in (nr_out, nr_in) format, so we're basically
        # just adding rows here.
        larger_W[:smaller.nO] = smaller_W
        larger_b[:smaller.nO] = smaller_b
        self._layers[-1] = larger
        for i in range(smaller.nO, new_output):
            self.unseen_classes.add(i)

    def begin_training(self, X, y=None):
        self.lower.begin_training(X, y=y)

    @property
    def tok2vec(self):
        return self._layers[0]

    @property
    def lower(self):
        return self._layers[1]

    @property
    def upper(self):
        return self._layers[2]


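# ParserStepModel binds one batch of documents: it runs tok2vec up front,
# primes precompute_hiddens with the lower layer, and picks the activation
# to match the lower layer (maxout if nP >= 2, none when there is no upper
# layer, relu otherwise). Backprop callbacks are queued in self.backprops
# and replayed in make_updates().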
class ParserStepModel(Model):
    def __init__(self, docs, layers, unseen_classes=None, drop=0.):
        self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
        if layers[1].nP >= 2:
            activation = "maxout"
        elif len(layers) == 2:
            activation = None
        else:
            activation = "relu"
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            activation=activation, drop=drop)
        if len(layers) == 3:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        if self.vec2scores is None:
            self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
        else:
            self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

    @property
    def nO(self):
        return self.state2vec.nO

    def class_is_unseen(self, class_):
        return self._class_mask[class_]

    def mark_class_unseen(self, class_):
        self._class_mask[class_] = 0

    def mark_class_seen(self, class_):
        self._class_mask[class_] = 1

    def begin_update(self, states, drop=0.):
        token_ids = self.get_token_ids(states)
        vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
        if self.vec2scores is not None:
            mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
            if mask is not None:
                vector *= mask
            scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
        else:
            scores = NumpyOps().asarray(vector)
            get_d_vector = lambda d_scores, sgd=None: d_scores
            mask = None
        # If the class is unseen, make sure its score is minimum
        scores[:, self._class_mask == 0] = numpy.nanmin(scores)

        def backprop_parser_step(d_scores, sgd=None):
            # Zero vectors for unseen classes
            d_scores *= self._class_mask
            d_vector = get_d_vector(d_scores, sgd=sgd)
            if mask is not None:
                d_vector *= mask
            if isinstance(self.state2vec.ops, CupyOps) \
            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
                # Move token_ids and d_vector to GPU, asynchronously
                self.backprops.append((
                    util.get_async(self.cuda_stream, token_ids),
                    util.get_async(self.cuda_stream, d_vector),
                    get_d_tokvecs
                ))
            else:
                self.backprops.append((token_ids, d_vector, get_d_tokvecs))
            return None

        return scores, backprop_parser_step

    def get_token_ids(self, batch):
        states = _beam_utils.collect_states(batch)
        cdef StateClass state
        states = [state for state in states if not state.is_final()]
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                          dtype='i', order='C')
        ids.fill(-1)
        c_ids = <int*>ids.data
        for state in states:
            state.c.set_context_tokens(c_ids, ids.shape[1])
            c_ids += ids.shape[1]
        return ids

    def make_updates(self, sgd):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tell CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids), sgd=sgd)
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids,
                                 d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
        return d_tokvecs


cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*k computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    """
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens
    cdef object activation

    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
                 activation="maxout", drop=0.):
        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
            # Note the passing of cuda_stream here: it lets
            # cupy make the copy asynchronously.
            # We then have to block before first use.
            cached = gpu_cached.get(stream=cuda_stream)
        else:
            cached = gpu_cached
        if not isinstance(lower_model.b, numpy.ndarray):
            self.bias = lower_model.b.get()
        else:
            self.bias = lower_model.b
        self.nF = cached.shape[1]
        self.nP = getattr(lower_model, 'nP', 1)
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data

    def __call__(self, X):
        return self.begin_update(X, drop=None)[0]

    def begin_update(self, token_ids, drop=0.):
        cdef np.ndarray state_vector = numpy.zeros(
            (token_ids.shape[0], self.nO, self.nP), dtype='f')
        # This is tricky, but (assuming GPU available):
        # - Input to forward on CPU
        # - Output from forward on CPU
        # - Input to backward on GPU!
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens

        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids, sgd=None):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector, sgd)
            d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
            return d_tokens

        return state_vector, backward

    def _nonlinearity(self, state_vector):
        if isinstance(state_vector, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()

        if self.activation == "maxout":
            state_vector, mask = ops.maxout(state_vector)
        else:
            state_vector = state_vector.reshape(state_vector.shape[:-1])
            if self.activation == "relu":
                mask = state_vector >= 0.
                state_vector *= mask
            else:
                # No nonlinearity: there's no mask to apply on backprop.
                mask = None

        def backprop_nonlinearity(d_best, sgd=None):
            if isinstance(d_best, numpy.ndarray):
                ops = NumpyOps()
            else:
                ops = CupyOps()
            # Guard against mask being None (no nonlinearity), and make sure
            # the mask lives on the same device as the gradient.
            mask_ = ops.asarray(mask) if mask is not None else None
            # This will usually be on GPU
            d_best = ops.asarray(d_best)
            # Fix nans (which can occur from unseen classes.)
            d_best[ops.xp.isnan(d_best)] = 0.
            if self.activation == "maxout":
                return ops.backprop_maxout(d_best, mask_, self.nP)
            elif self.activation == "relu":
                d_best *= mask_
                d_best = d_best.reshape((d_best.shape + (1,)))
                return d_best
            else:
                return d_best.reshape((d_best.shape + (1,)))

        return state_vector, backprop_nonlinearity
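

# Illustrative sketch of how the C-level pieces above fit together (not
# executed here; the real driver loop lives in the parser, which also owns
# the StateC** array and advances the states between calls):
#
#     weights = get_c_weights(model)
#     sizes = get_c_sizes(model, batch_size)
#     A = alloc_activations(sizes)
#     with nogil:
#         predict_states(&A, states, &weights, sizes)
#     free_activations(&A)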