Update pretrain command

This commit is contained in:
Matthew Honnibal 2018-11-29 12:36:43 +00:00
parent 73255091f8
commit 008e1ee1dd

View File

@ -28,8 +28,9 @@ from spacy.tokens import Doc
from spacy.attrs import ID, HEAD from spacy.attrs import ID, HEAD
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from thinc.v2v import Affine from thinc.v2v import Affine, Maxout
from thinc.api import wrap from thinc.api import wrap
from thinc.misc import LayerNorm as LN
def prefer_gpu(): def prefer_gpu():
@ -120,7 +121,10 @@ def create_pretraining_model(nlp, tok2vec):
Each array in the output needs to have one row per token in the doc. Each array in the output needs to have one row per token in the doc.
''' '''
output_size = nlp.vocab.vectors.data.shape[1] output_size = nlp.vocab.vectors.data.shape[1]
output_layer = zero_init(Affine(output_size, drop_factor=0.0)) output_layer = chain(
LN(Maxout(300, pieces=3)),
zero_init(Affine(output_size, drop_factor=0.0))
)
# This is annoying, but the parser etc have the flatten step after # This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match # the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we cann # the shape of the models' components exactly. So what we cann
@ -139,16 +143,10 @@ def create_pretraining_model(nlp, tok2vec):
def masked_language_model(vocab, model, mask_prob=0.15): def masked_language_model(vocab, model, mask_prob=0.15):
'''Convert a model into a BERT-style masked language model''' '''Convert a model into a BERT-style masked language model'''
vocab_words = [lex.text for lex in vocab if lex.prob != 0.0]
vocab_probs = [lex.prob for lex in vocab if lex.prob != 0.0]
vocab_words = vocab_words[:10000]
vocab_probs = vocab_probs[:10000]
vocab_probs = numpy.exp(numpy.array(vocab_probs, dtype='f'))
vocab_probs /= vocab_probs.sum()
random_words = RandomWords(vocab)
def mlm_forward(docs, drop=0.): def mlm_forward(docs, drop=0.):
mask, docs = apply_mask(docs, vocab_words, vocab_probs, mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.begin_update(docs, drop=drop) output, backprop = model.begin_update(docs, drop=drop)
@ -161,7 +159,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
return wrap(mlm_forward, model) return wrap(mlm_forward, model)
def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15): def apply_mask(docs, random_words, mask_prob=0.15):
N = sum(len(doc) for doc in docs) N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0., 1.0, (N,)) mask = numpy.random.uniform(0., 1.0, (N,))
mask = mask >= mask_prob mask = mask >= mask_prob
@ -171,7 +169,7 @@ def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
words = [] words = []
for token in doc: for token in doc:
if not mask[i]: if not mask[i]:
word = replace_word(token.text, vocab_texts, vocab_probs) word = replace_word(token.text, random_words)
else: else:
word = token.text word = token.text
words.append(word) words.append(word)
@ -186,19 +184,35 @@ def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
return mask, masked_docs return mask, masked_docs
def replace_word(word, vocab_texts, vocab_probs, mask='[MASK]'): def replace_word(word, random_words, mask='[MASK]'):
roll = random.random() roll = random.random()
if roll < 0.8: if roll < 0.8:
return mask return mask
elif roll < 0.9: elif roll < 0.9:
index = numpy.random.choice(len(vocab_texts), p=vocab_probs) return random_words.next()
return vocab_texts[index]
else: else:
return word return word
class RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
self.probs /= self.probs.sum()
self._cache = []
def next(self):
if not self._cache:
self._cache.extend(numpy.random.choice(len(self.words), 10000,
p=self.probs))
index = self._cache.pop()
return self.words[index]
class ProgressTracker(object): class ProgressTracker(object):
def __init__(self, frequency=100000): def __init__(self, frequency=1000000):
self.loss = 0.0 self.loss = 0.0
self.prev_loss = 0.0 self.prev_loss = 0.0
self.nr_word = 0 self.nr_word = 0
@ -206,9 +220,11 @@ class ProgressTracker(object):
self.frequency = frequency self.frequency = frequency
self.last_time = time.time() self.last_time = time.time()
self.last_update = 0 self.last_update = 0
self.epoch_loss = 0.0
def update(self, epoch, loss, docs): def update(self, epoch, loss, docs):
self.loss += loss self.loss += loss
self.epoch_loss += loss
words_in_batch = sum(len(doc) for doc in docs) words_in_batch = sum(len(doc) for doc in docs)
self.words_per_epoch[epoch] += words_in_batch self.words_per_epoch[epoch] += words_in_batch
self.nr_word += words_in_batch self.nr_word += words_in_batch
@ -243,8 +259,8 @@ class ProgressTracker(object):
seed=("Seed for random number generators", "option", "s", float), seed=("Seed for random number generators", "option", "s", float),
nr_iter=("Number of iterations to pretrain", "option", "i", int), nr_iter=("Number of iterations to pretrain", "option", "i", int),
) )
def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
embed_rows=5000, use_vectors=False, dropout=0.2, nr_iter=100, seed=0): embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
""" """
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load using an approximate language-modelling objective. Specifically, we load
@ -284,8 +300,8 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
print('Epoch', '#Words', 'Loss', 'w/s') print('Epoch', '#Words', 'Loss', 'w/s')
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc) texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
for epoch in range(nr_iter): for epoch in range(nr_iter):
for batch in minibatch(texts, size=256): for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
docs = make_docs(nlp, batch) docs = make_docs(nlp, [text for (text, _) in batch])
loss = make_update(model, docs, optimizer, drop=dropout) loss = make_update(model, docs, optimizer, drop=dropout)
progress = tracker.update(epoch, loss, docs) progress = tracker.update(epoch, loss, docs)
if progress: if progress:
@ -297,6 +313,8 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
file_.write(model.tok2vec.to_bytes()) file_.write(model.tok2vec.to_bytes())
with (output_dir / 'log.jsonl').open('a') as file_: with (output_dir / 'log.jsonl').open('a') as file_:
file_.write(json.dumps({'nr_word': tracker.nr_word, file_.write(json.dumps({'nr_word': tracker.nr_word,
'loss': tracker.loss, 'epoch': epoch}) + '\n') 'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
'epoch': epoch}) + '\n')
tracker.epoch_loss = 0.0
if texts_loc != '-': if texts_loc != '-':
texts = load_texts(texts_loc) texts = load_texts(texts_loc)