2020-06-26 20:34:12 +03:00
from typing import Optional , Dict , List , Union , Sequence
2020-04-20 23:06:28 +03:00
from timeit import default_timer as timer
2020-06-12 03:02:07 +03:00
import srsly
2020-02-27 20:42:27 +03:00
import tqdm
2020-06-26 20:34:12 +03:00
from pydantic import BaseModel , FilePath
2020-01-29 19:06:46 +03:00
from pathlib import Path
2020-02-27 20:42:27 +03:00
from wasabi import msg
2020-01-29 19:06:46 +03:00
import thinc
import thinc . schedules
2020-07-06 14:06:25 +03:00
from thinc . api import Model , use_pytorch_for_gpu_memory , require_gpu , fix_random_seed
2020-05-18 23:23:33 +03:00
import random
2020-01-29 19:06:46 +03:00
2020-06-21 14:44:00 +03:00
from . _app import app , Arg , Opt
2020-07-06 14:02:36 +03:00
from . . gold import Corpus , Example
2020-06-12 03:02:07 +03:00
from . . lookups import Lookups
2020-02-27 20:42:27 +03:00
from . . import util
2020-05-20 12:41:12 +03:00
from . . errors import Errors
2020-06-20 15:15:04 +03:00
# Don't remove - required to load the built-in architectures
from . . ml import models # noqa: F401
2020-01-29 19:06:46 +03:00
2020-06-21 14:44:00 +03:00
# from ..schemas import ConfigSchema # TODO: include?
2020-01-29 19:06:46 +03:00
registry = util . registry
CONFIG_STR = """
[ training ]
patience = 10
eval_frequency = 10
dropout = 0.2
init_tok2vec = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = false
max_length = 0
use_gpu = 0
scores = [ " ents_p " , " ents_r " , " ents_f " ]
score_weights = { " ents_f " : 1.0 }
limit = 0
[ training . batch_size ]
@schedules = " compounding.v1 "
start = 100
stop = 1000
compound = 1.001
[ optimizer ]
@optimizers = " Adam.v1 "
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[ nlp ]
lang = " en "
2020-06-12 03:02:07 +03:00
vectors = null
2020-01-29 19:06:46 +03:00
[ nlp . pipeline . tok2vec ]
factory = " tok2vec "
[ nlp . pipeline . ner ]
factory = " ner "
[ nlp . pipeline . ner . model ]
2020-02-27 20:42:27 +03:00
@architectures = " spacy.TransitionBasedParser.v1 "
2020-01-29 19:06:46 +03:00
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 3
[ nlp . pipeline . ner . model . tok2vec ]
2020-02-27 20:42:27 +03:00
@architectures = " spacy.Tok2VecTensors.v1 "
2020-01-29 19:06:46 +03:00
width = $ { nlp . pipeline . tok2vec . model : width }
[ nlp . pipeline . tok2vec . model ]
2020-02-27 20:42:27 +03:00
@architectures = " spacy.HashEmbedCNN.v1 "
2020-01-29 19:06:46 +03:00
pretrained_vectors = $ { nlp : vectors }
width = 128
depth = 4
window_size = 1
embed_size = 10000
maxout_pieces = 3
2020-02-27 20:42:27 +03:00
subword_features = true
2020-01-29 19:06:46 +03:00
"""
2020-06-04 17:07:30 +03:00
2020-06-26 20:34:12 +03:00
# Pydantic schema describing a single pipeline component entry.
class PipelineComponent(BaseModel):
    # Factory name for the component (CONFIG_STR uses e.g. "tok2vec" and "ner").
    factory: str
    # The component's model, typed as the `Model` class imported from thinc.api.
    model: Model

    class Config:
        # Presumably required because `Model` is not a type pydantic can
        # validate natively — confirm against the pydantic version in use.
        arbitrary_types_allowed = True
# Pydantic schema for a training config: an optional optimizer plus nested
# models describing the "training" and "nlp" sections.
# NOTE(review): `training` and `nlp` are declared as nested classes rather than
# annotated fields — confirm this is how they are meant to be validated.
class ConfigSchema(BaseModel):
    # NOTE(review): "Optimizer" is a string forward reference and no name
    # `Optimizer` is imported in the visible part of this file — confirm it can
    # be resolved before this schema is used for validation.
    optimizer: Optional["Optimizer"]

    # Settings of the [training] section; defaults mirror CONFIG_STR except
    # `eval_frequency` (100 here, 10 in CONFIG_STR).
    class training(BaseModel):
        patience: int = 10
        eval_frequency: int = 100
        dropout: float = 0.2
        init_tok2vec: Optional[FilePath] = None
        max_epochs: int = 100
        orth_variant_level: float = 0.0
        gold_preproc: bool = False
        max_length: int = 0
        use_gpu: int = 0
        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
        limit: int = 0
        # No default: either a fixed size or a sequence of sizes must be given.
        batch_size: Union[Sequence[int], int]

    # Settings of the [nlp] section.
    class nlp(BaseModel):
        lang: str
        vectors: Optional[str]
        # Maps component name -> component definition.
        pipeline: Optional[Dict[str, PipelineComponent]]

    class Config:
        # Accept keys that are not declared on the schema.
        extra = "allow"
2020-06-21 14:44:00 +03:00
@app.command("train")
def train_cli(
    # fmt: off
    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
    raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
    # fmt: on
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # CLI entry point: validates the arguments, loads the optional input files
    # into memory, selects the device and then delegates to `train`.
    util.set_env_log(verbose)
    # `locals()` must be taken before any other local variable is created, so
    # that it contains exactly the CLI parameters expected by verify_cli_args.
    verify_cli_args(**locals())

    # From here on `raw_text` holds the parsed JSONL records, not a path.
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
    weights_data = None
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()

    # A negative `use_gpu` (the default, -1) selects the CPU.
    if use_gpu >= 0:
        # The message needs the f-prefix, otherwise the literal text
        # "{use_gpu}" is printed instead of the device id.
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        msg.info("Using CPU")

    train(
        config_path,
        {"train": train_path, "dev": dev_path},
        output_path=output_path,
        raw_text=raw_text,
        tag_map=tag_map,
        weights_data=weights_data,
        omit_extra_lookups=omit_extra_lookups,
    )
2020-01-29 19:06:46 +03:00
2020-06-03 11:00:21 +03:00
def train(
    config_path: Path,
    data_paths: Dict[str, Path],
    raw_text: Optional[List[dict]] = None,
    output_path: Optional[Path] = None,
    tag_map: Optional[dict] = None,
    weights_data: Optional[bytes] = None,
    omit_extra_lookups: bool = False,
) -> None:
    """Run a full training session described by a config file.

    config_path: Path of the config file to load.
    data_paths: Mapping with the keys "train" and "dev" pointing at the
        training and development data.
    raw_text: Optional list of records with a "text" key, used for rehearsal
        updates during training.
    output_path: Optional directory; when given, "model-best" and
        "model-final" are written below it.
    tag_map: Optional mapping merged into the vocab's tag map.
    weights_data: Optional serialized tok2vec weights. Requires the config to
        name the target layer in [pretraining.tok2vec_model].
    omit_extra_lookups: If True, install empty extra lookup tables.
    """
    msg.info(f"Loading config from: {config_path}")
    # Read the config first without creating objects, to get to the original nlp_config
    config = util.load_config(config_path, create_objects=False)
    fix_random_seed(config["training"]["seed"])
    if config["training"].get("use_pytorch_for_gpu_memory"):
        # It feels kind of weird to not have a default for this.
        use_pytorch_for_gpu_memory()
    nlp_config = config["nlp"]
    config = util.load_config(config_path, create_objects=True)
    training = config["training"]
    msg.info("Creating nlp from config")
    nlp = util.load_model_from_config(nlp_config)
    optimizer = training["optimizer"]
    limit = training["limit"]
    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
    if "textcat" in nlp_config["pipeline"]:
        verify_textcat_config(nlp, nlp_config)
    if training.get("resume", False):
        msg.info("Resuming training")
        nlp.resume_training()
    else:
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
        train_examples = list(
            corpus.train_dataset(
                nlp,
                shuffle=False,
                gold_preproc=training["gold_preproc"],
                max_length=training["max_length"],
            )
        )
        nlp.begin_training(lambda: train_examples)

    # Update tag map with provided mapping. `tag_map` defaults to None, which
    # dict.update() would reject, so only merge when something was passed.
    if tag_map:
        nlp.vocab.morphology.tag_map.update(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
    if weights_data is not None:
        tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
        if tok2vec_path is None:
            msg.fail(
                f"To use a pretrained tok2vec model, the config needs to specify which "
                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
                exits=1,
            )
        tok2vec = config
        for subpath in tok2vec_path.split("."):
            tok2vec = tok2vec.get(subpath)
            # Check after every step: a missing intermediate section would
            # otherwise surface as an AttributeError on None in the next
            # iteration instead of this message.
            if not tok2vec:
                msg.fail(
                    f"Could not locate the tok2vec model at {tok2vec_path}.",
                    exits=1,
                )
        tok2vec.from_bytes(weights_data)

    msg.info("Loading training corpus")
    train_batches = create_train_batches(nlp, corpus, training)
    evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)

    # Create iterator, which yields out info after each optimization step.
    msg.info("Start training")
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        train_batches,
        evaluate,
        dropout=training["dropout"],
        accumulate_gradient=training["accumulate_gradient"],
        patience=training.get("patience", 0),
        max_steps=training.get("max_steps", 0),
        eval_frequency=training["eval_frequency"],
        raw_text=raw_text,
    )

    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
    print_row = setup_printer(training, nlp)

    try:
        # One progress bar per evaluation interval; it is replaced after
        # every evaluation step.
        progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    update_meta(training, nlp, info)
                    nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
    except Exception as e:
        if output_path is not None:
            # `exits=1` ends the process, but the `finally` clause below still
            # runs first and stores the final model.
            msg.warn(
                f"Aborting and saving the final best model. "
                f"Encountered exception: {str(e)}",
                exits=1,
            )
        else:
            # Nothing to save: propagate with the original traceback.
            raise
    finally:
        if output_path is not None:
            final_model_path = output_path / "model-final"
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(final_model_path)
            else:
                nlp.to_disk(final_model_path)
            msg.good(f"Saved model to output directory {final_model_path}")
2020-01-29 19:06:46 +03:00
def create_train_batches(nlp, corpus, cfg):
    """Yield `(epoch, batch)` tuples over the training data, epoch after epoch.

    nlp: The pipeline, passed through to `corpus.train_dataset`.
    corpus: Object providing `train_dataset(...)`.
    cfg: The training settings. Reads "gold_preproc", "max_length",
        "batch_size" and, optionally, "max_epochs", "batch_by_words" and
        "discard_oversize" (the latter is required when batching by words).

    The generator is endless unless `max_epochs` is at least 1. Epochs are
    numbered from 1 and the examples are reshuffled between epochs.

    Raises ValueError (E988) if there are no training examples and
    ValueError (E986) if batching produces no batch at all.
    """
    max_epochs = cfg.get("max_epochs", 0)
    train_examples = list(
        corpus.train_dataset(
            nlp,
            shuffle=True,
            gold_preproc=cfg["gold_preproc"],
            max_length=cfg["max_length"],
        )
    )
    # The list is only ever reordered, never resized, so checking once before
    # the loop is equivalent to checking at the top of every epoch.
    if not train_examples:
        raise ValueError(Errors.E988)

    epoch = 0
    while True:
        epoch += 1
        if cfg.get("batch_by_words", True):
            batches = util.minibatch_by_words(
                train_examples,
                size=cfg["batch_size"],
                discard_oversize=cfg["discard_oversize"],
            )
        else:
            batches = util.minibatch(
                train_examples,
                size=cfg["batch_size"],
            )
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
        # Only the probe itself is guarded; yielding happens outside the `try`
        # so the handler cannot catch anything but an empty batch iterator.
        try:
            first = next(batches)
        except StopIteration:
            raise ValueError(Errors.E986) from None
        yield epoch, first
        for batch in batches:
            yield epoch, batch
        if max_epochs >= 1 and epoch >= max_epochs:
            break
        random.shuffle(train_examples)
2020-01-29 19:06:46 +03:00
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
    """Build the zero-argument evaluation callback used during training.

    The returned callable scores the development data with `nlp.evaluate`
    (using the optimizer's averaged parameters when it has any) and returns
    `(weighted_score, scores)`. `weighted_score` is the sum of the scores named
    in cfg["score_weights"], each multiplied by its weight; `scores` is the
    scorer's score dict with an added "speed" entry (words per second).
    """

    def _score(examples, batch_size):
        # Evaluate under the averaged parameters if the optimizer has any.
        if not optimizer.averages:
            return nlp.evaluate(examples, batch_size=batch_size)
        with nlp.use_params(optimizer.averages):
            return nlp.evaluate(examples, batch_size=batch_size)

    def evaluate():
        examples = list(
            corpus.dev_dataset(
                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
            )
        )
        word_count = sum(len(example.predicted) for example in examples)
        eval_batch_size = cfg.get("evaluation_batch_size", 128)

        started = timer()
        scorer = _score(examples, eval_batch_size)
        finished = timer()
        words_per_second = word_count / (finished - started)

        scores = scorer.scores
        # Calculate a weighted sum based on score_weights for the main score
        weights = cfg["score_weights"]
        try:
            main_score = sum(
                scores[name] * weights.get(name, 0.0) for name in weights
            )
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict="score_weights", key=str(e), keys=list(scores.keys())
                )
            )

        scores["speed"] = words_per_second
        return main_score, scores

    return evaluate
def train_while_improving(
    nlp,
    optimizer,
    train_data,
    evaluate,
    *,
    dropout,
    eval_frequency,
    accumulate_gradient=1,
    patience=0,
    max_steps=0,
    raw_text=None,
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback, which should
    return a tuple `(main_score, other_scores)`.

    Positional arguments:
        nlp: The spaCy pipeline to evaluate.
        optimizer: The optimizer callable.
        train_data (Iterable): A generator of `(epoch, batch)` tuples with the
            training data. The training data iterable needs to take care of
            iterating over the epochs and shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
            The callback should take no arguments and return a tuple
            `(main_score, other_scores)`. The main_score should be a float where
            higher is better. other_scores can be any object.

    Keyword arguments:
        dropout: A float, or an iterator yielding one dropout rate per step.
        eval_frequency (int): Evaluate every `eval_frequency` steps (step 0
            is always evaluated).
        accumulate_gradient (int): Number of sub-batches each batch is split
            into before the optimizer is applied.
        patience (int): Stop once this many steps have passed since the best
            evaluation. 0 disables the check.
        max_steps (int): Stop once this step number is reached. 0 disables
            the check.
        raw_text (list): Optional records with a "text" key, used for
            rehearsal updates. The list is shuffled in place.

    Every iteration, the function yields out a tuple with:

    * batch: A list of Example objects.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

        epoch (int): The epoch number supplied by train_data for this batch.
        step (int): How many steps have been completed.
        score (float): The main score from the last evaluation.
        other_scores: The other scores from the last evaluation.
        losses: The losses accumulated since the last evaluation.
        checkpoints: A list of previous results, where each result is a
            (score, step) tuple.
    """
    if isinstance(dropout, float):
        dropouts = thinc.schedules.constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    # Only components that have a model take part in the updates.
    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]

    if raw_text:
        random.shuffle(raw_text)
        raw_examples = [
            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
        ]
        raw_batches = util.minibatch(raw_examples, size=8)

    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        with nlp.select_pipes(enable=to_enable):
            for subbatch in subdivide_batch(batch, accumulate_gradient):
                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
            if raw_text:
                # If raw text is available, perform 'rehearsal' updates,
                # which use unlabelled data to reduce overfitting.
                raw_batch = next(raw_batches, None)
                if raw_batch is None:
                    # The raw batches are a one-shot iterator. A bare next()
                    # on the exhausted iterator would raise StopIteration
                    # inside this generator, which Python turns into a
                    # RuntimeError. Start another shuffled pass instead.
                    random.shuffle(raw_examples)
                    raw_batches = util.minibatch(raw_examples, size=8)
                    raw_batch = next(raw_batches)
                nlp.rehearse(list(raw_batch), sgd=optimizer, losses=losses)
            for name, proc in nlp.pipeline:
                if hasattr(proc, "model"):
                    proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            # Start a fresh loss accumulator after every evaluation.
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        _, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break
2020-05-18 23:23:33 +03:00
def subdivide_batch(batch, accumulate_gradient):
    """Split a batch into sub-batches for gradient accumulation.

    The examples are ordered by the length of their `predicted` attribute and
    cut into `accumulate_gradient` consecutive slices of equal size
    (`len(batch) // accumulate_gradient`). Empty slices are skipped, and any
    examples left over after the last slice are yielded as one extra sub-batch.
    """
    ordered = sorted(batch, key=lambda eg: len(eg.predicted))
    chunk_size = len(ordered) // accumulate_gradient
    offset = 0
    for _ in range(accumulate_gradient):
        piece = ordered[offset : offset + chunk_size]
        if not piece:
            continue
        yield piece
        offset += len(piece)
    leftover = ordered[offset:]
    if leftover:
        yield leftover
2020-01-29 19:06:46 +03:00
2020-02-27 20:42:27 +03:00
def setup_printer(training, nlp):
    """Print the header of the training results table and return a row printer.

    The table has the columns epoch ("E"), step ("#"), one loss column per
    pipeline component, one column per entry of training["scores"], and the
    main score. The returned `print_row(info)` prints one right-aligned row
    from an info dict with the keys "epoch", "step", "losses", "other_scores"
    and "score"; it raises KeyError (E983) when a loss or score is missing.
    """

    def _two_decimals(value):
        return "{0:.2f}".format(float(value))

    score_cols = training["scores"]
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
    # Minimum widths: 8 for loss columns, 6 for score columns.
    loss_widths = [max(len(col), 8) for col in loss_cols]
    score_widths = [max(len(col), 6) for col in score_cols]

    table_widths = [3, 6] + loss_widths + score_widths + [6]
    table_aligns = ["r"] * len(table_widths)
    table_header = [
        col.upper() for col in ["E", "#"] + loss_cols + score_cols + ["Score"]
    ]
    msg.row(table_header, widths=table_widths)
    msg.row(["-" * width for width in table_widths])

    def print_row(info):
        try:
            loss_cells = [
                _two_decimals(info["losses"][pipe_name])
                for pipe_name in nlp.pipe_names
            ]
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict="scores (losses)",
                    key=str(e),
                    keys=list(info["losses"].keys()),
                )
            )

        try:
            score_cells = [
                _two_decimals(info["other_scores"][col]) for col in score_cols
            ]
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict="scores (other)",
                    key=str(e),
                    keys=list(info["other_scores"].keys()),
                )
            )

        row = [info["epoch"], info["step"]]
        row = row + loss_cells + score_cells + [_two_decimals(info["score"])]
        msg.row(row, widths=table_widths, aligns=table_aligns)

    return print_row
2020-06-12 03:02:07 +03:00
def update_meta(training, nlp, info):
    """Replace nlp.meta["performance"] with the results of an evaluation.

    Stores every score named in training["scores"] (taken from
    info["other_scores"]) and, for every pipeline component, its loss from
    info["losses"] under the key "<component>_loss".
    """
    score_cols = training["scores"]
    performance = {}
    nlp.meta["performance"] = performance
    performance.update(
        (metric, info["other_scores"][metric]) for metric in score_cols
    )
    performance.update(
        (f"{pipe_name}_loss", info["losses"][pipe_name])
        for pipe_name in nlp.pipe_names
    )
2020-06-26 20:34:12 +03:00
def verify_cli_args(
    train_path,
    dev_path,
    config_path,
    output_path=None,
    code_path=None,
    init_tok2vec=None,
    raw_text=None,
    verbose=False,
    use_gpu=-1,
    tag_map_path=None,
    omit_extra_lookups=False,
):
    """Validate the `train` CLI arguments before any work is done.

    Exits with an error message if the config, training or development data,
    the optional code file or the optional tok2vec weights cannot be found, or
    if the code file cannot be imported. Creates the output directory when it
    does not exist yet and warns when it already contains sub-directories.
    `raw_text`, `verbose`, `use_gpu`, `tag_map_path` and `omit_extra_lookups`
    are accepted so the caller can pass all CLI arguments, but are not checked.
    """
    # Make sure all files and paths exists if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if output_path is not None:
        if not output_path.exists():
            # Create missing parent directories as well, so that a nested
            # output path is "created for you" as the warning below promises.
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        elif any(p.is_dir() for p in output_path.iterdir()):
            msg.warn(
                "Output directory is not empty.",
                "This can lead to unintended side effects when saving the model. "
                "Please use an empty directory or a different path instead. If "
                "the specified output path doesn't exist, the directory will be "
                "created for you.",
            )
    if code_path is not None:
        if not code_path.exists():
            msg.fail("Path to Python code not found", code_path, exits=1)
        # Importing the file runs it, which is how its registered functions
        # become available.
        try:
            util.import_file("python_code", code_path)
        except Exception as e:
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
    if init_tok2vec is not None and not init_tok2vec.exists():
        msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
def verify_textcat_config(nlp, nlp_config):
    """Check the textcat component's optional 'positive_label' setting.

    Does nothing when no positive label is configured. Otherwise exits with an
    error message if the label is not among the textcat component's labels, or
    if the component does not have exactly two labels.
    """
    textcat_config = nlp_config["pipeline"]["textcat"]
    # if 'positive_label' is provided: double check whether it's in the data and
    # the task is binary
    if not textcat_config.get("positive_label", None):
        return

    textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
    pos_label = textcat_config["positive_label"]
    label_is_known = pos_label in textcat_labels
    if not label_is_known:
        msg.fail(
            f"The textcat's 'positive_label' config setting '{pos_label}' "
            f"does not match any label in the training data.",
            exits=1,
        )
    is_binary = len(textcat_labels) == 2
    if not is_binary:
        msg.fail(
            f"A textcat 'positive_label' '{pos_label}' was "
            f"provided for training data that does not appear to be a "
            f"binary classification problem with two labels.",
            exits=1,
        )