2023-06-26 12:41:03 +03:00
import re
2018-11-30 22:16:14 +03:00
from pathlib import Path
2023-06-26 12:41:03 +03:00
from typing import Optional
2020-07-11 14:03:53 +03:00
import typer
2023-06-26 12:41:03 +03:00
from wasabi import msg
2018-11-16 00:17:16 +03:00
2020-09-28 16:09:59 +03:00
from . . training . pretrain import pretrain
from . . util import load_config
2023-06-26 12:41:03 +03:00
from . _util import (
Arg ,
Opt ,
app ,
Accept multiple code files in all CLI commands (#12101)
* Add support for multiple code files to all relevant commands
Prior to this, only the package command supported multiple code files.
* Update docs
* Add debug data test, plus generic fixtures
One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.
* Add debug config test and restructure
The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.
* Use a more generic, parametrized test
* Add output arg for assemble and pretrain
Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.
* Add evaluate test and some cleanup
* Mark tests as slow
* Revert argument name change
* Apply suggestions from code review
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Format API CLI docs
* isort
* Fix imports in tests
* isort
* Undo changes to package CLI help
* Fix python executable and lang code in test
* Fix executable in another test
---------
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-08-01 16:24:02 +03:00
import_code_paths ,
2023-06-26 12:41:03 +03:00
parse_config_overrides ,
setup_gpu ,
show_validation_error ,
)
2018-11-16 00:17:16 +03:00
2020-07-11 20:17:59 +03:00
@app.command(
    "pretrain",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
    # fmt: on
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Two objective types
    are available, vector-based and character-based.

    In the vector-based objective, we load word vectors that have been trained
    using a word2vec-style distributional similarity algorithm, and train a
    component like a CNN, BiLSTM, etc to predict vectors which match the
    pretrained ones. The weights are saved to a directory after each epoch. You
    can then pass a path to one of these pretrained weights files to the
    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.

    DOCS: https://spacy.io/api/cli#pretrain
    """
    # NOTE: the docstring above is kept verbatim because the CLI framework
    # surfaces it as the command's help text.

    # Extra (unknown) command-line arguments collected on the context are
    # turned into config overrides.
    overrides = parse_config_overrides(ctx.args)
    # Import user code before the config is loaded so that anything it
    # registers is available when the config is read.
    import_code_paths(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")

    with show_validation_error(config_path):
        raw_config = load_config(
            config_path, overrides=overrides, interpolate=False
        )
    # Keep the raw config around: the interpolated copy is used for
    # pretraining, the non-interpolated one is what gets written to disk.
    config = raw_config.interpolate()
    pretraining_block = config.get("pretraining")
    if not pretraining_block:
        # TODO: What's the solution here? How do we handle optional blocks?
        msg.fail("The [pretraining] block in your config is empty", exits=1)

    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        msg.good(f"Created output directory: {output_dir}")

    # Save non-interpolated config
    saved_config_path = output_dir / "config.cfg"
    raw_config.to_disk(saved_config_path)
    msg.good("Saved config file in the output directory")

    pretrain_options = {
        "resume_path": resume_path,
        "epoch_resume": epoch_resume,
        "use_gpu": use_gpu,
        "silent": False,
        "skip_last": skip_last,
    }
    pretrain(config, output_dir, **pretrain_options)
    msg.good("Successfully finished pretrain")
2018-11-16 01:44:07 +03:00
2018-11-28 20:04:58 +03:00
2020-09-13 15:05:05 +03:00
def verify_cli_args(
    config_path: Path,
    output_dir: Path,
    resume_path: Optional[Path],
    epoch_resume: Optional[int],
) -> None:
    """Sanity-check the pretrain command's arguments and report problems.

    config_path: Path to the config file. A value whose string form is "-"
        is accepted without an existence check.
    output_dir: Directory the weights are written to. A non-empty existing
        directory only triggers a warning.
    resume_path: Optional weights file to resume from. It must not be a
        directory.
    epoch_resume: Optional epoch to resume counting from. Required when the
        name of resume_path does not match the "model<N>.bin" pattern.

    Problems are reported through msg.fail(..., exits=...) and msg.warn(...);
    nothing is returned.
    """
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)

    # any() stops at the first directory entry instead of materialising the
    # whole listing just to test whether it is empty.
    if output_dir.exists() and any(output_dir.iterdir()):
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty.",
                "It is better to use an empty directory or refer to a new output path, "
                "then the new directory will be created for you.",
            )

    if resume_path is not None:
        if resume_path.is_dir():
            # This is necessary because Windows gives a Permission Denied when we
            # try to open the directory later, which is confusing. See #7878
            # The message must be an f-string, otherwise the user sees the
            # literal text "{resume_path}" instead of the offending path.
            msg.fail(
                f"--resume-path should be a weights file, but {resume_path} is a directory.",
                exits=True,
            )
        # Weight files with the default name carry their epoch number in the
        # file name; anything else needs --epoch-resume to be given.
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        # NOTE(review): `not epoch_resume` also treats 0 as "not provided",
        # although the message below calls 0 a valid value. Left unchanged
        # here; confirm against the resume logic in the training code.
        if not model_name and not epoch_resume:
            msg.fail(
                "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
                exits=True,
            )
        elif not model_name and epoch_resume < 0:
            msg.fail(
                f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
                exits=True,
            )