add tok2vec parameters to train script to facilitate init_tok2vec (#5021)

commit 2572460175
parent a27c77ce62
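
Summary of the change, as reflected in the diff below: pretrain's `depth` argument is renamed to `conv_depth`, the Tok2Vec hyperparameters (`width`, `conv_depth`, `cnn_window`, `cnn_pieces`, `use_chars`, `bilstm_depth`, `embed_rows`) are exposed as options on the train CLI and forwarded to `begin_training`, and the `nlp.update` call is wrapped so that a ValueError raised with mismatched pretrain/train parameters produces a hint instead of a bare traceback.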
|
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
-    depth=("Depth of CNN layers", "option", "cd", int),
+    conv_depth=("Depth of CNN layers", "option", "cd", int),
     cnn_window=("Window size for CNN layers", "option", "cW", int),
     cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
     use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
@@ -84,7 +84,7 @@ def pretrain(
     vectors_model,
     output_dir,
     width=96,
-    depth=4,
+    conv_depth=4,
     bilstm_depth=0,
     cnn_pieces=3,
     sa_depth=0,
@@ -132,9 +132,15 @@ def pretrain(
     msg.info("Using GPU" if has_gpu else "Not using GPU")
 
     output_dir = Path(output_dir)
+    if output_dir.exists() and [p for p in output_dir.iterdir()]:
+        msg.warn(
+            "Output directory is not empty",
+            "It is better to use an empty directory or refer to a new output path, "
+            "then the new directory will be created for you.",
+        )
     if not output_dir.exists():
         output_dir.mkdir()
-        msg.good("Created output directory")
+        msg.good("Created output directory: {}".format(output_dir))
     srsly.write_json(output_dir / "config.json", config)
     msg.good("Saved settings to config.json")
 
@@ -162,7 +168,7 @@ def pretrain(
         Tok2Vec(
             width,
             embed_rows,
-            conv_depth=depth,
+            conv_depth=conv_depth,
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
             subword_features=not use_chars,  # Set to False for Chinese etc
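
Note: because the short flag (`-cd`) is unchanged, existing pretrain commands keep working; only the keyword name changed. A hypothetical invocation with the renamed option (all paths and the vectors model name are placeholders):

    python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain-out -cw 128 -cd 8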
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -33,6 +33,13 @@ from .. import about
     pipeline=("Comma-separated names of pipeline components", "option", "p", str),
     replace_components=("Replace components from base model", "flag", "R", bool),
     vectors=("Model to load vectors from", "option", "v", str),
+    width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
+    conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
+    cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
+    cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
+    use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
+    bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
+    embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
     n_iter=("Number of iterations", "option", "n", int),
     n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
     n_examples=("Number of examples", "option", "ns", int),
@@ -64,6 +71,13 @@ def train(
     pipeline="tagger,parser,ner",
     replace_components=False,
     vectors=None,
+    width=96,
+    conv_depth=4,
+    cnn_window=1,
+    cnn_pieces=3,
+    use_chars=False,
+    bilstm_depth=0,
+    embed_rows=2000,
     n_iter=30,
     n_early_stopping=None,
     n_examples=0,
@@ -116,6 +130,7 @@ def train(
         )
     if not output_path.exists():
         output_path.mkdir()
+        msg.good("Created output directory: {}".format(output_path))
 
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
@@ -250,7 +265,15 @@ def train(
         optimizer = create_default_optimizer(Model.ops)
     else:
         # Start with a blank model, call begin_training
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+        cfg = {"device": use_gpu}
+        cfg["conv_depth"] = conv_depth
+        cfg["token_vector_width"] = width
+        cfg["bilstm_depth"] = bilstm_depth
+        cfg["cnn_maxout_pieces"] = cnn_pieces
+        cfg["embed_size"] = embed_rows
+        cfg["conv_window"] = cnn_window
+        cfg["subword_features"] = not use_chars
+        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
 
     nlp._optimizer = None
 
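Note the internal config keys differ from the CLI names: `width` becomes `token_vector_width`, `embed_rows` becomes `embed_size`, `cnn_window` becomes `conv_window`, `cnn_pieces` becomes `cnn_maxout_pieces`, and `use_chars` is inverted into `subword_features`. For readers driving training programmatically rather than through the CLI, here is a minimal sketch under the assumption that `begin_training` forwards these keyword arguments as shown in the hunk above (spaCy 2.x; the corpus paths are placeholders):

    import spacy
    from spacy.gold import GoldCorpus

    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("tagger"))
    corpus = GoldCorpus("train.json", "dev.json")  # placeholder paths
    optimizer = nlp.begin_training(
        lambda: corpus.train_tuples,
        device=-1,               # CLI -g: GPU id, -1 for CPU
        token_vector_width=96,   # CLI -cw (width)
        conv_depth=4,            # CLI -cd (conv_depth)
        bilstm_depth=0,          # CLI -lstm (bilstm_depth)
        cnn_maxout_pieces=3,     # CLI -cP (cnn_pieces)
        embed_size=2000,         # CLI -er (embed_rows)
        conv_window=1,           # CLI -cW (cnn_window)
        subword_features=True,   # not use_chars (CLI -chr flag inverts this)
    )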
@@ -375,13 +398,19 @@ def train(
                 if not batch:
                     continue
                 docs, golds = zip(*batch)
-                nlp.update(
-                    docs,
-                    golds,
-                    sgd=optimizer,
-                    drop=next(dropout_rates),
-                    losses=losses,
-                )
+                try:
+                    nlp.update(
+                        docs,
+                        golds,
+                        sgd=optimizer,
+                        drop=next(dropout_rates),
+                        losses=losses,
+                    )
+                except ValueError as e:
+                    msg.warn("Error during training")
+                    if init_tok2vec:
+                        msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
+                    msg.fail("Original error message: {}".format(e), exits=1)
                 if raw_text:
                     # If raw text is available, perform 'rehearsal' updates,
                     # which use unlabelled data to reduce overfitting.
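
The hint in the new except block matters in practice: weights written by pretrain only load cleanly when train builds a Tok2Vec layer of identical shape. A hypothetical matched pair of commands (paths, language code, vectors model, and the chosen epoch file are placeholders; `-t2v` is train's existing init_tok2vec option):

    python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain-out -cw 128 -cd 8
    python -m spacy train en ./train-out train.json dev.json -cw 128 -cd 8 -t2v ./pretrain-out/model9.bin

Passing a different `-cw` or `-cd` to train than was used for pretrain is exactly the mismatch the new error message points at.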