mirror of https://github.com/explosion/spaCy.git
Fix pretrain script
commit 2ddd428834 (parent 09a0227656)
@@ -25,7 +25,7 @@ from collections import Counter

 import spacy
 from spacy.attrs import ID
-from spacy.util import minibatch_by_words, use_gpu, compounding, ensure_path
+from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
 from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
 from thinc.v2v import Affine

@@ -85,7 +85,8 @@ def get_vectors_loss(ops, docs, prediction):
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
     d_scores = (prediction - target) / prediction.shape[0]
-    loss = (d_scores**2).sum()
+    # Don't want to return a cupy object here
+    loss = float((d_scores**2).sum())
     return loss, d_scores

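A quick sketch of why the float() wrapper matters (plain numpy stands in for cupy here, which is an assumption for illustration): on GPU, (d_scores**2).sum() would be a cupy object rather than a plain number, and the tracker later serialises the loss with json.dumps, so the script converts it up front.

    import numpy

    prediction = numpy.random.uniform(-1, 1, (4, 300)).astype('f')
    target = numpy.zeros((4, 300), dtype='f')
    d_scores = (prediction - target) / prediction.shape[0]
    loss = float((d_scores ** 2).sum())  # plain Python float, safe to json.dumps
    print(type(loss), loss)
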
@@ -97,11 +98,16 @@ def create_pretraining_model(nlp, tok2vec):
     '''
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = zero_init(Affine(output_size, drop_factor=0.0))
+    # This is annoying, but the parser etc have the flatten step after
+    # the tok2vec. To load the weights in cleanly, we need to match
+    # the shape of the models' components exactly. So what we call
+    # "tok2vec" has to be the same set of processes as what the components do.
+    tok2vec = chain(tok2vec, flatten)
     model = chain(
         tok2vec,
-        flatten,
         output_layer
     )
+    model.tok2vec = tok2vec
     model.output_layer = output_layer
     model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
     return model

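The new comments explain the key constraint: pipeline components apply flatten after tok2vec, so the object saved under the name "tok2vec" has to bundle both steps for its weights to load back cleanly. A toy illustration of that composition idea, using purely hypothetical stand-in functions rather than spaCy's real layers:

    def compose(*fns):
        # Apply functions left to right, like thinc's chain().
        def composed(x):
            for fn in fns:
                x = fn(x)
            return x
        return composed

    def fake_tok2vec(texts):
        # Hypothetical stand-in: one "vector id" per whitespace token.
        return [[hash(tok) % 128 for tok in text.split()] for text in texts]

    def flatten(nested):
        return [item for row in nested for item in row]

    tok2vec = compose(fake_tok2vec, flatten)  # what the script now calls "tok2vec"
    print(tok2vec(['Give it a doc', 'to infer shapes']))
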
@@ -144,7 +150,7 @@ class ProgressTracker(object):
     nr_iter=("Number of iterations to pretrain", "option", "i", int),
 )
 def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
-             embed_rows=1000, dropout=0.2, nr_iter=1, seed=0):
+             embed_rows=1000, dropout=0.2, nr_iter=10, seed=0):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load

@@ -170,29 +176,29 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
         file_.write(json.dumps(config))
     has_gpu = prefer_gpu()
     nlp = spacy.load(vectors_model)
-    tok2vec = Tok2Vec(width, embed_rows,
+    model = create_pretraining_model(nlp,
+        Tok2Vec(width, embed_rows,
             conv_depth=depth,
             pretrained_vectors=nlp.vocab.vectors.name,
             bilstm_depth=0,  # Requires PyTorch. Experimental.
             cnn_maxout_pieces=2,  # You can try setting this higher
-            subword_features=True)  # Set to False for character models, e.g. Chinese
-    model = create_pretraining_model(nlp, tok2vec)
+            subword_features=True))  # Set to False for character models, e.g. Chinese
     optimizer = create_default_optimizer(model.ops)
     tracker = ProgressTracker()
     print('Epoch', '#Words', 'Loss', 'w/s')
     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
     for epoch in range(nr_iter):
-        for batch in minibatch_by_words(texts, tuples=False, size=50000):
+        for batch in minibatch(texts, size=64):
             docs = [nlp.make_doc(text) for text in batch]
             loss = make_update(model, docs, optimizer, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 print(*progress)
-            if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
+            if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**6:
                 break
         with model.use_params(optimizer.averages):
             with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
-                file_.write(tok2vec.to_bytes())
+                file_.write(model.tok2vec.to_bytes())
             with (output_dir / 'log.jsonl').open('a') as file_:
                 file_.write(json.dumps({'nr_word': tracker.nr_word,
                     'loss': tracker.loss, 'epoch': epoch}))
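For reference, a minimal usage sketch of the batching helper the loop now uses (assumes spaCy 2.x is installed): minibatch() groups any iterable into fixed-size lists, whereas minibatch_by_words() sizes batches by word count and expects (text, annotation) tuples by default, which is why the old call passed tuples=False.

    from spacy.util import minibatch

    texts = ['First text.', 'Second text.', 'Third text.', 'Fourth text.', 'Fifth text.']
    for batch in minibatch(texts, size=2):
        print(batch)  # lists of up to 2 texts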