import re
from pathlib import Path
from typing import Optional

import typer
from wasabi import msg

from ..training.pretrain import pretrain
from ..util import load_config
from ._util import (
    Arg,
    Opt,
    app,
    import_code,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
)


@app.command(
    "pretrain",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
    # fmt: on
):
    # NOTE(review): the docstring below is presumably what Typer prints as the
    # command's --help text, so it is kept user-facing rather than turned into
    # a parameter reference. Parameters are documented via their help= strings.
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Two objective types
    are available, vector-based and character-based.

    In the vector-based objective, we load word vectors that have been trained
    using a word2vec-style distributional similarity algorithm, and train a
    component like a CNN, BiLSTM, etc to predict vectors which match the
    pretrained ones. The weights are saved to a directory after each epoch. You
    can then pass a path to one of these pretrained weights files to the
    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.

    DOCS: https://spacy.io/api/cli#pretrain
    """
    # Extra, unrecognised CLI arguments are collected in ctx.args (see the
    # context_settings above) and parsed into config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    # Validate the arguments before touching the GPU or the file system.
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")

    with show_validation_error(config_path):
        # Load without interpolation so the raw config can be saved verbatim
        # to the output directory further down.
        raw_config = load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    config = raw_config.interpolate()
    if not config.get("pretraining"):
        # TODO: What's the solution here? How do we handle optional blocks?
        msg.fail("The [pretraining] block in your config is empty", exits=1)
    if not output_dir.exists():
        # exist_ok guards against the directory being created by someone else
        # between the exists() check and the mkdir() call.
        output_dir.mkdir(parents=True, exist_ok=True)
        msg.good(f"Created output directory: {output_dir}")
    # Save non-interpolated config
    raw_config.to_disk(output_dir / "config.cfg")
    msg.good("Saved config file in the output directory")

    pretrain(
        config,
        output_dir,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
        skip_last=skip_last,
    )
    msg.good("Successfully finished pretrain")
2018-11-16 01:44:07 +03:00
2018-11-28 20:04:58 +03:00
2020-09-13 15:05:05 +03:00
def verify_cli_args(
    config_path: Path,
    output_dir: Path,
    resume_path: Optional[Path],
    epoch_resume: Optional[int],
) -> None:
    """Sanity-check the arguments of the pretrain command.

    Problems are reported through ``msg``: hard errors use ``msg.fail`` with
    ``exits`` set, soft issues use ``msg.warn``.

    config_path: Path to the config file. The literal "-" is accepted without
        an existence check.
    output_dir: Directory the weights are written to. A warning is emitted
        if it already exists and is not empty.
    resume_path: Optional weights file to resume from. Must not be a directory.
    epoch_resume: Optional epoch to resume counting from. Required (and must
        be >= 0) when the file name of resume_path does not contain
        "model<N>.bin".
    """
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)

    # any() stops at the first directory entry instead of listing everything.
    if output_dir.exists() and any(output_dir.iterdir()):
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty.",
                "It is better to use an empty directory or refer to a new output path, "
                "then the new directory will be created for you.",
            )

    if resume_path is None:
        return

    if resume_path.is_dir():
        # This is necessary because Windows gives a Permission Denied when we
        # try to open the directory later, which is confusing. See #7878
        msg.fail(
            f"--resume-path should be a weights file, but {resume_path} is a directory.",
            exits=True,
        )

    # A matching file name means no explicit --epoch-resume is demanded here.
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name is None:
        # Compare against None explicitly: 0 is a valid epoch and must not be
        # mistaken for "option not given".
        if epoch_resume is None:
            msg.fail(
                "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
                exits=True,
            )
        elif epoch_resume < 0:
            msg.fail(
                f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
                exits=True,
            )