Add tok2vec parameters to the train script to facilitate init_tok2vec (#5021)

This commit is contained in:
Sofie Van Landeghem 2020-02-16 17:16:41 +01:00 committed by GitHub
parent a27c77ce62
commit 2572460175
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 12 deletions

View File

@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec
vectors_model=("Name or path to spaCy model with vectors to learn from"), vectors_model=("Name or path to spaCy model with vectors to learn from"),
output_dir=("Directory to write models to on each epoch", "positional", None, str), output_dir=("Directory to write models to on each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int), width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int), conv_depth=("Depth of CNN layers", "option", "cd", int),
cnn_window=("Window size for CNN layers", "option", "cW", int), cnn_window=("Window size for CNN layers", "option", "cW", int),
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int), cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
use_chars=("Whether to use character-based embedding", "flag", "chr", bool), use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
@ -84,7 +84,7 @@ def pretrain(
vectors_model, vectors_model,
output_dir, output_dir,
width=96, width=96,
depth=4, conv_depth=4,
bilstm_depth=0, bilstm_depth=0,
cnn_pieces=3, cnn_pieces=3,
sa_depth=0, sa_depth=0,
@ -132,9 +132,15 @@ def pretrain(
msg.info("Using GPU" if has_gpu else "Not using GPU") msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir) output_dir = Path(output_dir)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
msg.warn(
"Output directory is not empty",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()
msg.good("Created output directory") msg.good("Created output directory: {}".format(output_dir))
srsly.write_json(output_dir / "config.json", config) srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json") msg.good("Saved settings to config.json")
@ -162,7 +168,7 @@ def pretrain(
Tok2Vec( Tok2Vec(
width, width,
embed_rows, embed_rows,
conv_depth=depth, conv_depth=conv_depth,
pretrained_vectors=pretrained_vectors, pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
subword_features=not use_chars, # Set to False for Chinese etc subword_features=not use_chars, # Set to False for Chinese etc

View File

@ -33,6 +33,13 @@ from .. import about
pipeline=("Comma-separated names of pipeline components", "option", "p", str), pipeline=("Comma-separated names of pipeline components", "option", "p", str),
replace_components=("Replace components from base model", "flag", "R", bool), replace_components=("Replace components from base model", "flag", "R", bool),
vectors=("Model to load vectors from", "option", "v", str), vectors=("Model to load vectors from", "option", "v", str),
width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
n_iter=("Number of iterations", "option", "n", int), n_iter=("Number of iterations", "option", "n", int),
n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
n_examples=("Number of examples", "option", "ns", int), n_examples=("Number of examples", "option", "ns", int),
@ -64,6 +71,13 @@ def train(
pipeline="tagger,parser,ner", pipeline="tagger,parser,ner",
replace_components=False, replace_components=False,
vectors=None, vectors=None,
width=96,
conv_depth=4,
cnn_window=1,
cnn_pieces=3,
use_chars=False,
bilstm_depth=0,
embed_rows=2000,
n_iter=30, n_iter=30,
n_early_stopping=None, n_early_stopping=None,
n_examples=0, n_examples=0,
@ -116,6 +130,7 @@ def train(
) )
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
msg.good("Created output directory: {}".format(output_path))
# Take dropout and batch size as generators of values -- dropout # Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore. # starts high and decays sharply, to force the optimizer to explore.
@ -250,7 +265,15 @@ def train(
optimizer = create_default_optimizer(Model.ops) optimizer = create_default_optimizer(Model.ops)
else: else:
# Start with a blank model, call begin_training # Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) cfg = {"device": use_gpu}
cfg["conv_depth"] = conv_depth
cfg["token_vector_width"] = width
cfg["bilstm_depth"] = bilstm_depth
cfg["cnn_maxout_pieces"] = cnn_pieces
cfg["embed_size"] = embed_rows
cfg["conv_window"] = cnn_window
cfg["subword_features"] = not use_chars
optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
nlp._optimizer = None nlp._optimizer = None
@ -375,13 +398,19 @@ def train(
if not batch: if not batch:
continue continue
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update( try:
docs, nlp.update(
golds, docs,
sgd=optimizer, golds,
drop=next(dropout_rates), sgd=optimizer,
losses=losses, drop=next(dropout_rates),
) losses=losses,
)
except ValueError as e:
msg.warn("Error during training")
if init_tok2vec:
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
msg.fail("Original error message: {}".format(e), exits=1)
if raw_text: if raw_text:
# If raw text is available, perform 'rehearsal' updates, # If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting. # which use unlabelled data to reduce overfitting.