from typing import Optional, Dict, Any, Tuple, Union, Callable, List
from timeit import default_timer as timer
import tqdm
from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
import random
import typer
import logging

from .init_pipeline import init_pipeline
from .init_pipeline import create_before_to_disk_callback
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..language import Language
from .. import util
from ..errors import Errors
from ..util import resolve_dot_names, registry
from ..schemas import ConfigSchemaTraining


@app.command(
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_path: Optional[Path] = Opt(None, "--init", "-i", help="Path to already initialized pipeline directory, e.g. created with 'spacy init pipeline' (will speed up training)"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
):
"""
2020-09-03 14:13:03 +03:00
Train or update a spaCy pipeline . Requires data in spaCy ' s binary format. To
2020-07-10 18:57:40 +03:00
convert data from other formats , use the ` spacy convert ` command . The
config file includes all settings and hyperparameters used during traing .
To override settings in the config , e . g . settings that point to local
paths or that you want to experiment with , you can override them as
command line options . For instance , - - training . batch_size 128 overrides
the value of " batch_size " in the block " [training] " . The - - code argument
lets you pass in a Python file that ' s imported before training. It can be
used to register custom functions and architectures that can then be
referenced in the config .
2020-09-04 13:58:50 +03:00
DOCS : https : / / nightly . spacy . io / api / cli #train
2020-01-29 19:06:46 +03:00
"""
    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
    verify_cli_args(config_path, output_path)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    if use_gpu >= 0:
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        msg.info("Using CPU")
    config = util.load_config(config_path, overrides=overrides, interpolate=False)
    msg.divider("Initializing pipeline")
    nlp = init_nlp(config, output_path, init_path)
    msg.divider("Training pipeline")
    train(nlp, output_path, use_gpu=use_gpu)


def init_nlp(
    config: Config, output_path: Optional[Path], init_path: Optional[Path]
) -> Language:
    if init_path is not None:
        nlp = util.load_model(init_path)
        if must_reinitialize(config, nlp.config):
            msg.fail(
                f"Config has changed: can't use initialized pipeline from "
                f"{init_path}. Please re-run 'spacy init nlp'.",
                exits=1,
            )
        msg.good(f"Loaded initialized pipeline from {init_path}")
        return nlp
    if output_path is not None:
        output_init_path = output_path / "model-initial"
        if not output_init_path.exists():
            msg.info(f"Initializing the pipeline in {output_init_path}")
            nlp = init_pipeline(config)
            nlp.to_disk(output_init_path)
            msg.good(f"Saved initialized pipeline to {output_init_path}")
        else:
            nlp = util.load_model(output_init_path)
            if must_reinitialize(config, nlp.config):
                msg.warn("Config has changed: need to re-initialize pipeline")
                nlp = init_pipeline(config)
                nlp.to_disk(output_init_path)
                msg.good(f"Re-initialized pipeline in {output_init_path}")
            else:
                msg.good(f"Loaded initialized pipeline from {output_init_path}")
        return nlp
    msg.warn(
        "Not saving initialized model: no output directory specified. "
        "To speed up training, spaCy can save the initialized nlp object with "
        "the vocabulary, vectors and label scheme. To take advantage of this, "
        "provide an output directory or use the 'spacy init nlp' command."
    )
    return init_pipeline(config)


def train(
    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
) -> None:
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
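    # `T` above is the resolved [training] section: it has been validated against
    # ConfigSchemaTraining, and registered functions referenced in the config
    # (optimizer, batcher, logger, corpus readers) have been replaced by the
    # objects they construct.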
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
    msg.info(f"Pipeline: {nlp.pipe_names}")
    if frozen_components:
        msg.info(f"Frozen components: {frozen_components}")
    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
    with nlp.select_pipes(disable=frozen_components):
        print_row, finalize_logger = train_logger(nlp)
    try:
        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
        progress.set_description("Epoch 1")
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    with nlp.select_pipes(disable=frozen_components):
                        update_meta(T, nlp, info)
                    with nlp.use_params(optimizer.averages):
                        nlp = before_to_disk(nlp)
                        nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
                progress.set_description(f"Epoch {info['epoch']}")
    except Exception as e:
        finalize_logger()
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a
            # specific error.
            msg.warn(
                f"Aborting and saving the final best model. "
                f"Encountered exception: {str(e)}"
            )
            nlp = before_to_disk(nlp)
            nlp.to_disk(output_path / "model-final")
        raise e
    finally:
        finalize_logger()
    if output_path is not None:
        final_model_path = output_path / "model-final"
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(final_model_path)
        else:
            nlp.to_disk(final_model_path)
        msg.good(f"Saved pipeline to output directory {final_model_path}")
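

# Note on outputs: while training, the best-scoring checkpoint is written to
# output_path/"model-best"; the last state is written to
# output_path/"model-final", both on normal completion and when an exception
# aborts training.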


def must_reinitialize(train_config: Config, init_config: Config) -> bool:
    # TODO: do this better and more fine-grained
    # Configs are compared via their interpolated string serialization, so any
    # difference between them triggers re-initialization.
    return train_config.interpolate().to_str() != init_config.interpolate().to_str()


def add_vectors(nlp: Language, vectors: str) -> None:
    title = f"Config validation error for vectors {vectors}"
    desc = (
        "This typically means that there's a problem in the config.cfg included "
        "with the packaged vectors. Make sure that the vectors package you're "
        "loading is compatible with the current version of spaCy."
    )
    with show_validation_error(
        title=title, desc=desc, hint_fill=False, show_config=False
    ):
        util.load_vectors_into_model(nlp, vectors)


def create_train_batches(iterator, batcher, max_epochs: int):
    epoch = 0
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    while max_epochs < 1 or epoch != max_epochs:
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1
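

# A minimal sketch of how the generator above is consumed (it mirrors the call
# in train(); the names are the objects resolved from the config):
#
#   batches = create_train_batches(train_corpus(nlp), batcher, max_epochs=2)
#   for epoch, batch in batches:
#       ...  # each batch is a list of Example objects
#
# With max_epochs < 1 it reshuffles and cycles over the data indefinitely, so
# the caller has to stop iterating itself (train_while_improving does this via
# `patience` and `max_steps`).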


def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate
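

# For illustration (hypothetical numbers): with weights {"dep_uas": 0.5,
# "dep_las": 0.5} and scores {"dep_uas": 0.9, "dep_las": 0.8, ...}, the main
# score is 0.5 * 0.9 + 0.5 * 0.8 = 0.85. Keys missing from either dict
# contribute 0.0 to the sum.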


def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    exclude: List[str],
):
""" Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple ` ( batch , info , is_best_checkpoint ) ` ,
where info is a dict , and is_best_checkpoint is in [ True , False , None ] - -
None indicating that the iteration was not evaluated as a checkpoint .
2020-08-18 20:15:16 +03:00
The evaluation is conducted by calling the evaluate callback .
2020-01-29 19:06:46 +03:00
Positional arguments :
nlp : The spaCy pipeline to evaluate .
2020-05-20 13:56:27 +03:00
optimizer : The optimizer callable .
2020-01-29 19:06:46 +03:00
train_data ( Iterable [ Batch ] ) : A generator of batches , with the training
data . Each batch should be a Sized [ Tuple [ Input , Annot ] ] . The training
data iterable needs to take care of iterating over the epochs and
shuffling .
evaluate ( Callable [ [ ] , Tuple [ float , Any ] ] ) : A callback to perform evaluation .
The callback should take no arguments and return a tuple
` ( main_score , other_scores ) ` . The main_score should be a float where
higher is better . other_scores can be any object .
Every iteration , the function yields out a tuple with :
2020-06-26 20:34:12 +03:00
* batch : A list of Example objects .
2020-01-29 19:06:46 +03:00
* info : A dict with various information about the last update ( see below ) .
* is_best_checkpoint : A value in None , False , True , indicating whether this
was the best evaluation so far . You should use this to save the model
checkpoints during training . If None , evaluation was not conducted on
that iteration . False means evaluation was conducted , but a previous
evaluation was better .
The info dict provides the following information :
epoch ( int ) : How many passes over the data have been completed .
step ( int ) : How many steps have been completed .
2020-08-31 15:24:41 +03:00
score ( float ) : The main score from the last evaluation .
2020-01-29 19:06:46 +03:00
other_scores : : The other scores from the last evaluation .
2020-08-28 22:44:04 +03:00
losses : The accumulated losses throughout training .
2020-01-29 19:06:46 +03:00
checkpoints : A list of previous results , where each result is a
( score , step , epoch ) tuple .
"""
    if isinstance(dropout, float):
        dropouts = thinc.schedules.constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if (
                name not in exclude
                and hasattr(proc, "model")
                and proc.model not in (True, False, None)
            ):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break


def subdivide_batch(batch, accumulate_gradient):
    batch = list(batch)
    batch.sort(key=lambda eg: len(eg.predicted))
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for i in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    subbatch = batch[start:]
    if subbatch:
        yield subbatch
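

# For illustration: a batch of 10 examples with accumulate_gradient=3 gives
# sub_len = 10 // 3 = 3, so the generator yields subbatches of sizes 3, 3, 3
# and a final remainder of 1. Gradients accumulate across subbatches (note
# sgd=False in nlp.update) and are applied once per full batch via
# finish_update in train_while_improving.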


def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
    # Make sure all files and paths exist if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None:
        if not output_path.exists():
            output_path.mkdir()
            msg.good(f"Created output directory: {output_path}")


# TODO: this is currently imported by the ray extension and not used otherwise
def load_from_paths(
    config: Config,
) -> Tuple[Optional[List[Dict[str, str]]], Dict[str, dict], Dict[str, dict], Optional[bytes]]:
    weights_data = None
    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
    if init_tok2vec is not None:
        if not init_tok2vec.exists():
            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    return None, {}, {}, weights_data