mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Experimental character-based pretraining (#5700)
* Use cosine loss in Cloze multitask
* Fix char_embed for gpu
* Call resume_training for base model in train CLI
* Fix bilstm_depth default in pretrain command
* Implement character-based pretraining objective
* Use chars loss in ClozeMultitask
* Add method to decode predicted characters
* Fix number characters
* Rescale gradients for mlm
* Fix char embed+vectors in ml
* Fix pipes
* Fix pretrain args
* Move get_characters_loss
* Fix import
* Fix import
* Mention characters loss option in pretrain
* Remove broken 'self attention' option in pretrain
* Revert "Remove broken 'self attention' option in pretrain"
This reverts commit 56b820f6af.
* Document 'characters' objective of pretrain
This commit is contained in:
parent
86d13a9fb8
commit
3e78e82a83
20
spacy/_ml.py
20
spacy/_ml.py
|
@ -14,7 +14,7 @@ from thinc.api import with_getitem, flatten_add_lengths
|
||||||
from thinc.api import uniqued, wrap, noop
|
from thinc.api import uniqued, wrap, noop
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module, copy_array
|
from thinc.neural.util import get_array_module, copy_array, to_categorical
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
|
@ -840,6 +840,8 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
|
|
||||||
def mlm_backward(d_output, sgd=None):
|
def mlm_backward(d_output, sgd=None):
|
||||||
d_output *= 1 - mask
|
d_output *= 1 - mask
|
||||||
|
# Rescale gradient for number of instances.
|
||||||
|
d_output *= mask.size - mask.sum()
|
||||||
return backprop(d_output, sgd=sgd)
|
return backprop(d_output, sgd=sgd)
|
||||||
|
|
||||||
return output, mlm_backward
|
return output, mlm_backward
|
||||||
|
@ -944,7 +946,7 @@ class CharacterEmbed(Model):
|
||||||
# for the tip.
|
# for the tip.
|
||||||
nCv = self.ops.xp.arange(self.nC)
|
nCv = self.ops.xp.arange(self.nC)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
doc_ids = doc.to_utf8_array(nr_char=self.nC)
|
doc_ids = self.ops.asarray(doc.to_utf8_array(nr_char=self.nC))
|
||||||
doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
|
doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
|
||||||
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
|
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
|
||||||
# incantation do I chant to get
|
# incantation do I chant to get
|
||||||
|
@ -986,3 +988,17 @@ def get_cossim_loss(yh, y, ignore_zeros=False):
|
||||||
losses[zero_indices] = 0
|
losses[zero_indices] = 0
|
||||||
loss = losses.sum()
|
loss = losses.sum()
|
||||||
return loss, -d_yh
|
return loss, -d_yh
|
||||||
|
|
||||||
|
|
||||||
|
def get_characters_loss(ops, docs, prediction, nr_char=10):
    """Compute a squared-error loss for the character prediction objective.

    ops: The backend ops object, used to move the target onto the same
        device as the prediction.
    docs (list): The batch of Doc objects the prediction was made for.
    prediction: The model output. It is compared element-wise against a
        target with 256 * nr_char columns, so it must have that shape.
    nr_char (int): Number of character slots requested per token from
        doc.to_utf8_array.
    RETURNS (tuple): The summed squared error, and the difference between
        prediction and target divided by the number of prediction rows.
    """
    byte_rows = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(byte_rows).reshape((-1,))
    # Expand every byte ID into a 256-class encoding, then regroup the
    # encodings so that each row holds all nr_char slots of one entry.
    encoded = to_categorical(flat_ids, nb_classes=256)
    target = ops.asarray(encoded, dtype="f").reshape((-1, 256 * nr_char))
    diff = prediction - target
    d_target = diff / float(prediction.shape[0])
    loss = (diff ** 2).sum()
    return loss, d_target
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,8 @@ from ..errors import Errors
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID, HEAD
|
from ..attrs import ID, HEAD
|
||||||
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
|
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
|
||||||
from .._ml import masked_language_model, get_cossim_loss
|
from .._ml import masked_language_model, get_cossim_loss, get_characters_loss
|
||||||
|
from .._ml import MultiSoftmax
|
||||||
from .. import util
|
from .. import util
|
||||||
from .train import _load_pretrained_tok2vec
|
from .train import _load_pretrained_tok2vec
|
||||||
|
|
||||||
|
@ -42,7 +43,7 @@ from .train import _load_pretrained_tok2vec
|
||||||
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
||||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||||
loss_func=(
|
loss_func=(
|
||||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
"Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'",
|
||||||
"option",
|
"option",
|
||||||
"L",
|
"L",
|
||||||
str,
|
str,
|
||||||
|
@ -85,11 +86,11 @@ def pretrain(
|
||||||
output_dir,
|
output_dir,
|
||||||
width=96,
|
width=96,
|
||||||
conv_depth=4,
|
conv_depth=4,
|
||||||
bilstm_depth=0,
|
|
||||||
cnn_pieces=3,
|
cnn_pieces=3,
|
||||||
sa_depth=0,
|
sa_depth=0,
|
||||||
use_chars=False,
|
|
||||||
cnn_window=1,
|
cnn_window=1,
|
||||||
|
bilstm_depth=0,
|
||||||
|
use_chars=False,
|
||||||
embed_rows=2000,
|
embed_rows=2000,
|
||||||
loss_func="cosine",
|
loss_func="cosine",
|
||||||
use_vectors=False,
|
use_vectors=False,
|
||||||
|
@ -124,11 +125,7 @@ def pretrain(
|
||||||
config[key] = str(config[key])
|
config[key] = str(config[key])
|
||||||
util.fix_random_seed(seed)
|
util.fix_random_seed(seed)
|
||||||
|
|
||||||
has_gpu = prefer_gpu()
|
has_gpu = prefer_gpu(gpu_id=1)
|
||||||
if has_gpu:
|
|
||||||
import torch
|
|
||||||
|
|
||||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
|
||||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||||
|
|
||||||
output_dir = Path(output_dir)
|
output_dir = Path(output_dir)
|
||||||
|
@ -174,6 +171,7 @@ def pretrain(
|
||||||
subword_features=not use_chars, # Set to False for Chinese etc
|
subword_features=not use_chars, # Set to False for Chinese etc
|
||||||
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||||
),
|
),
|
||||||
|
objective=loss_func
|
||||||
)
|
)
|
||||||
# Load in pretrained weights
|
# Load in pretrained weights
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
|
@ -264,7 +262,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
||||||
RETURNS loss: A float for the loss.
|
RETURNS loss: A float for the loss.
|
||||||
"""
|
"""
|
||||||
predictions, backprop = model.begin_update(docs, drop=drop)
|
predictions, backprop = model.begin_update(docs, drop=drop)
|
||||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
if objective == "characters":
|
||||||
|
loss, gradients = get_characters_loss(model.ops, docs, predictions)
|
||||||
|
else:
|
||||||
|
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
||||||
backprop(gradients, sgd=optimizer)
|
backprop(gradients, sgd=optimizer)
|
||||||
# Don't want to return a cupy object here
|
# Don't want to return a cupy object here
|
||||||
# The gradients are modified in-place by the BERT MLM,
|
# The gradients are modified in-place by the BERT MLM,
|
||||||
|
@ -326,16 +327,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
||||||
return loss, d_target
|
return loss, d_target
|
||||||
|
|
||||||
|
|
||||||
def create_pretraining_model(nlp, tok2vec):
|
def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
|
||||||
"""Define a network for the pretraining. We simply add an output layer onto
|
"""Define a network for the pretraining. We simply add an output layer onto
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||||
Each array in the output needs to have one row per token in the doc.
|
Each array in the output needs to have one row per token in the doc.
|
||||||
"""
|
"""
|
||||||
output_size = nlp.vocab.vectors.data.shape[1]
|
if objective == "characters":
|
||||||
output_layer = chain(
|
out_sizes = [256] * nr_char
|
||||||
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
|
output_layer = chain(
|
||||||
)
|
LN(Maxout(300, pieces=3)),
|
||||||
|
MultiSoftmax(out_sizes, 300)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
output_size = nlp.vocab.vectors.data.shape[1]
|
||||||
|
output_layer = chain(
|
||||||
|
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
|
||||||
|
)
|
||||||
# This is annoying, but the parser etc have the flatten step after
|
# This is annoying, but the parser etc have the flatten step after
|
||||||
# the tok2vec. To load the weights in cleanly, we need to match
|
# the tok2vec. To load the weights in cleanly, we need to match
|
||||||
# the shape of the models' components exactly. So what we cann
|
# the shape of the models' components exactly. So what we cann
|
||||||
|
|
|
@ -285,7 +285,7 @@ def train(
|
||||||
|
|
||||||
if base_model and not pipes_added:
|
if base_model and not pipes_added:
|
||||||
# Start with an existing model, use default optimizer
|
# Start with an existing model, use default optimizer
|
||||||
optimizer = create_default_optimizer(Model.ops)
|
optimizer = nlp.resume_training(device=use_gpu)
|
||||||
else:
|
else:
|
||||||
# Start with a blank model, call begin_training
|
# Start with a blank model, call begin_training
|
||||||
cfg = {"device": use_gpu}
|
cfg = {"device": use_gpu}
|
||||||
|
|
|
@ -49,6 +49,14 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
>> LN(Maxout(width, width * 5, pieces=3)),
|
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||||
column=cols.index(ORTH),
|
column=cols.index(ORTH),
|
||||||
)
|
)
|
||||||
|
elif char_embed:
|
||||||
|
embed = concatenate_lists(
|
||||||
|
CharacterEmbed(nM=64, nC=8),
|
||||||
|
FeatureExtracter(cols) >> with_flatten(glove),
|
||||||
|
)
|
||||||
|
reduce_dimensions = LN(
|
||||||
|
Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
||||||
|
@ -81,7 +89,8 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
||||||
embed >> convolution ** conv_depth, pad=conv_depth
|
embed
|
||||||
|
>> convolution ** conv_depth, pad=conv_depth
|
||||||
)
|
)
|
||||||
|
|
||||||
if bilstm_depth >= 1:
|
if bilstm_depth >= 1:
|
||||||
|
|
|
@ -33,6 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
||||||
from .._ml import build_bow_text_classifier, build_nel_encoder
|
from .._ml import build_bow_text_classifier, build_nel_encoder
|
||||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||||
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
|
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
|
||||||
|
from .._ml import MultiSoftmax, get_characters_loss
|
||||||
from ..errors import Errors, TempErrors, Warnings
|
from ..errors import Errors, TempErrors, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -846,11 +847,15 @@ class MultitaskObjective(Tagger):
|
||||||
class ClozeMultitask(Pipe):
|
class ClozeMultitask(Pipe):
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, vocab, tok2vec, **cfg):
|
def Model(cls, vocab, tok2vec, **cfg):
|
||||||
output_size = vocab.vectors.data.shape[1]
|
if cfg["objective"] == "characters":
|
||||||
output_layer = chain(
|
out_sizes = [256] * cfg.get("nr_char", 4)
|
||||||
LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
|
output_layer = MultiSoftmax(out_sizes)
|
||||||
zero_init(Affine(output_size, output_size, drop_factor=0.0))
|
else:
|
||||||
)
|
output_size = vocab.vectors.data.shape[1]
|
||||||
|
output_layer = chain(
|
||||||
|
LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
|
||||||
|
zero_init(Affine(output_size, output_size, drop_factor=0.0))
|
||||||
|
)
|
||||||
model = chain(tok2vec, output_layer)
|
model = chain(tok2vec, output_layer)
|
||||||
model = masked_language_model(vocab, model)
|
model = masked_language_model(vocab, model)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
|
@ -861,6 +866,8 @@ class ClozeMultitask(Pipe):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
|
self.cfg.setdefault("objective", "characters")
|
||||||
|
self.cfg.setdefault("nr_char", 4)
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||||
pass
|
pass
|
||||||
|
@ -869,7 +876,8 @@ class ClozeMultitask(Pipe):
|
||||||
tok2vec=None, sgd=None, **kwargs):
|
tok2vec=None, sgd=None, **kwargs):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(self.vocab, tok2vec)
|
kwargs.update(self.cfg)
|
||||||
|
self.model = self.Model(self.vocab, tok2vec, **kwargs)
|
||||||
X = self.model.ops.allocate((5, self.model.tok2vec.nO))
|
X = self.model.ops.allocate((5, self.model.tok2vec.nO))
|
||||||
self.model.output_layer.begin_training(X)
|
self.model.output_layer.begin_training(X)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
@ -883,13 +891,16 @@ class ClozeMultitask(Pipe):
|
||||||
return tokvecs, vectors
|
return tokvecs, vectors
|
||||||
|
|
||||||
def get_loss(self, docs, vectors, prediction):
    """Compute the loss and its gradient for a batch of predictions.

    docs (list): The batch of Doc objects the prediction was made for.
    vectors: The vectors table. Only used when the objective is not
        "characters".
    prediction: The model output for the batch.
    RETURNS (tuple): The loss as a float, and the gradient of the loss.
    """
    if self.cfg["objective"] == "characters":
        # The output layer is sized from cfg["nr_char"], so the target has
        # to be built with the same value. Relying on the default of
        # get_characters_loss would produce a target of a different width
        # whenever the two defaults disagree.
        nr_char = self.cfg.get("nr_char", 4)
        loss, gradient = get_characters_loss(
            self.model.ops, docs, prediction, nr_char=nr_char
        )
    else:
        # The simplest way to implement this would be to vstack the
        # token.vector values, but that's a bit inefficient, especially on GPU.
        # Instead we fetch the index into the vectors table for each of our tokens,
        # and look them up all at once. This prevents data copying.
        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = vectors[ids]
        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
    return float(loss), gradient
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
@ -906,6 +917,20 @@ class ClozeMultitask(Pipe):
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def decode_utf8_predictions(char_array):
|
||||||
|
# The format alternates filling from start and end, and 255 is missing
|
||||||
|
words = []
|
||||||
|
char_array = char_array.reshape((char_array.shape[0], -1, 256))
|
||||||
|
nr_char = char_array.shape[1]
|
||||||
|
char_array = char_array.argmax(axis=-1)
|
||||||
|
for row in char_array:
|
||||||
|
starts = [chr(c) for c in row[::2] if c != 255]
|
||||||
|
ends = [chr(c) for c in row[1::2] if c != 255]
|
||||||
|
word = "".join(starts + list(reversed(ends)))
|
||||||
|
words.append(word)
|
||||||
|
return words
|
||||||
|
|
||||||
|
|
||||||
@component("textcat", assigns=["doc.cats"])
|
@component("textcat", assigns=["doc.cats"])
|
||||||
class TextCategorizer(Pipe):
|
class TextCategorizer(Pipe):
|
||||||
|
@ -1069,6 +1094,7 @@ cdef class DependencyParser(Parser):
|
||||||
assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
|
assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
|
||||||
requires = []
|
requires = []
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
nr_feature = 8
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def postprocesses(self):
|
def postprocesses(self):
|
||||||
|
|
|
@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
|
||||||
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
|
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
|
||||||
| `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. |
|
| `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. |
|
||||||
| `--embed-rows`, `-er` | option | Number of embedding rows. |
|
| `--embed-rows`, `-er` | option | Number of embedding rows. |
|
||||||
| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
|
| `--loss-func`, `-L` | option | Loss function to use for the objective. One of `"cosine"`, `"L2"` or `"characters"`. |
|
||||||
| `--dropout`, `-d` | option | Dropout rate. |
|
| `--dropout`, `-d` | option | Dropout rate. |
|
||||||
| `--batch-size`, `-bs` | option | Number of words per training batch. |
|
| `--batch-size`, `-bs` | option | Number of words per training batch. |
|
||||||
| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |
|
| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user