mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
add tok2vec parameters to train script to facilitate init_tok2vec (#5021)
This commit is contained in:
parent
a27c77ce62
commit
2572460175
|
@@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec
|
|||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
conv_depth=("Depth of CNN layers", "option", "cd", int),
|
||||
cnn_window=("Window size for CNN layers", "option", "cW", int),
|
||||
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
|
||||
use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
|
||||
|
@@ -84,7 +84,7 @@ def pretrain(
|
|||
vectors_model,
|
||||
output_dir,
|
||||
width=96,
|
||||
depth=4,
|
||||
conv_depth=4,
|
||||
bilstm_depth=0,
|
||||
cnn_pieces=3,
|
||||
sa_depth=0,
|
||||
|
@@ -132,9 +132,15 @@ def pretrain(
|
|||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
msg.warn(
|
||||
"Output directory is not empty",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory")
|
||||
msg.good("Created output directory: {}".format(output_dir))
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
|
||||
|
@@ -162,7 +168,7 @@ def pretrain(
|
|||
Tok2Vec(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=depth,
|
||||
conv_depth=conv_depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
subword_features=not use_chars, # Set to False for Chinese etc
|
||||
|
|
|
@@ -33,6 +33,13 @@ from .. import about
|
|||
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||
replace_components=("Replace components from base model", "flag", "R", bool),
|
||||
vectors=("Model to load vectors from", "option", "v", str),
|
||||
width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
|
||||
conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
|
||||
cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
|
||||
cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
|
||||
use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
|
||||
bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
|
||||
embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
|
||||
n_iter=("Number of iterations", "option", "n", int),
|
||||
n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
|
||||
n_examples=("Number of examples", "option", "ns", int),
|
||||
|
@@ -64,6 +71,13 @@ def train(
|
|||
pipeline="tagger,parser,ner",
|
||||
replace_components=False,
|
||||
vectors=None,
|
||||
width=96,
|
||||
conv_depth=4,
|
||||
cnn_window=1,
|
||||
cnn_pieces=3,
|
||||
use_chars=False,
|
||||
bilstm_depth=0,
|
||||
embed_rows=2000,
|
||||
n_iter=30,
|
||||
n_early_stopping=None,
|
||||
n_examples=0,
|
||||
|
@@ -116,6 +130,7 @@ def train(
|
|||
)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
msg.good("Created output directory: {}".format(output_path))
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
|
@@ -250,7 +265,15 @@ def train(
|
|||
optimizer = create_default_optimizer(Model.ops)
|
||||
else:
|
||||
# Start with a blank model, call begin_training
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
cfg = {"device": use_gpu}
|
||||
cfg["conv_depth"] = conv_depth
|
||||
cfg["token_vector_width"] = width
|
||||
cfg["bilstm_depth"] = bilstm_depth
|
||||
cfg["cnn_maxout_pieces"] = cnn_pieces
|
||||
cfg["embed_size"] = embed_rows
|
||||
cfg["conv_window"] = cnn_window
|
||||
cfg["subword_features"] = not use_chars
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
|
||||
|
||||
nlp._optimizer = None
|
||||
|
||||
|
@@ -375,13 +398,19 @@ def train(
|
|||
if not batch:
|
||||
continue
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(
|
||||
docs,
|
||||
golds,
|
||||
sgd=optimizer,
|
||||
drop=next(dropout_rates),
|
||||
losses=losses,
|
||||
)
|
||||
try:
|
||||
nlp.update(
|
||||
docs,
|
||||
golds,
|
||||
sgd=optimizer,
|
||||
drop=next(dropout_rates),
|
||||
losses=losses,
|
||||
)
|
||||
except ValueError as e:
|
||||
msg.warn("Error during training")
|
||||
if init_tok2vec:
|
||||
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
|
||||
msg.fail("Original error message: {}".format(e), exits=1)
|
||||
if raw_text:
|
||||
# If raw text is available, perform 'rehearsal' updates,
|
||||
# which use unlabelled data to reduce overfitting.
|
||||
|
|
Loading…
Reference in New Issue
Block a user