2020-09-21 11:59:07 +03:00
from typing import Dict , List , Union , Optional , Any , Callable , Type , Tuple
2020-08-05 17:00:59 +03:00
from typing import Iterable , TypeVar , TYPE_CHECKING
2019-12-25 14:39:49 +03:00
from enum import Enum
2020-09-29 12:52:45 +03:00
from pydantic import BaseModel , Field , ValidationError , validator , create_model
2020-07-10 14:31:27 +03:00
from pydantic import StrictStr , StrictInt , StrictFloat , StrictBool
2020-09-29 12:52:45 +03:00
from pydantic . main import ModelMetaclass
2021-06-28 12:29:29 +03:00
from thinc . api import Optimizer , ConfigValidationError , Model
2020-09-27 23:21:31 +03:00
from thinc . config import Promise
2019-12-25 14:39:49 +03:00
from collections import defaultdict
2020-09-29 12:52:45 +03:00
import inspect
2019-12-25 14:39:49 +03:00
from . attrs import NAMES
2020-09-18 16:45:55 +03:00
from . lookups import Lookups
2020-09-29 19:08:02 +03:00
from . util import is_cython_func
2019-12-25 14:39:49 +03:00
2020-08-05 17:00:59 +03:00
if TYPE_CHECKING :
# This lets us add type hints for mypy etc. without causing circular imports
from . language import Language # noqa: F401
2020-09-09 11:31:03 +03:00
from . training import Example # noqa: F401
2021-06-28 12:29:29 +03:00
from . vocab import Vocab # noqa: F401
2020-08-05 17:00:59 +03:00
2020-09-27 23:21:31 +03:00
# fmt: off
2020-08-04 16:09:37 +03:00
# Generic placeholder for the item type that a batcher groups together
ItemT = TypeVar("ItemT")
# Each alias below accepts either the callable itself or a thinc Promise
Batcher = Union[
    Callable[[Iterable[ItemT]], Iterable[List[ItemT]]],
    Promise,
]
Reader = Union[
    Callable[["Language", str], Iterable["Example"]],
    Promise,
]
Logger = Union[
    Callable[
        ["Language"],
        Tuple[Callable[[Dict[str, Any]], None], Callable],
    ],
    Promise,
]
# fmt: on
2020-08-04 16:09:37 +03:00
2019-12-25 14:39:49 +03:00
2020-07-25 16:01:15 +03:00
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
    """Check data against a pydantic schema and report problems as strings.

    obj (Dict[str, Any]): JSON-serializable data to validate.
    schema (pydantic.BaseModel): The schema to validate against.
    RETURNS (List[str]): One message per error location, formatted as
        "[location] msg, msg". Empty if the data is valid.
    """
    try:
        schema(**obj)
    except ValidationError as exc:
        # Group the messages by location so every location is reported once
        messages_by_loc = defaultdict(list)
        for err in exc.errors():
            location = " -> ".join(str(part) for part in err.get("loc", []))
            messages_by_loc[location].append(err.get("msg"))
        return [
            f"[{location}] {', '.join(messages)}"
            for location, messages in messages_by_loc.items()
        ]
    return []
2020-09-29 12:52:45 +03:00
# Initialization
class ArgSchemaConfig:
    """Pydantic config for strict argument models (see get_arg_model)."""

    # Values for fields that aren't declared on the model are an error
    extra = "forbid"
    # Annotations can be arbitrary classes, not only pydantic-native types
    arbitrary_types_allowed = True
class ArgSchemaConfigExtra:
    """Pydantic config for non-strict argument models.

    get_arg_model selects this config when the function takes variable
    keyword arguments, when strict=False, or when the signature can't be
    inspected. In all of those cases, values that aren't declared on the
    model have to be accepted and passed through instead of rejected.
    """

    # This used to be "forbid", which made the config identical to
    # ArgSchemaConfig and turned the strict / non-strict switch into a no-op
    extra = "allow"
    # Annotations can be arbitrary classes, not only pydantic-native types
    arbitrary_types_allowed = True
def get_arg_model(
    func: Callable,
    *,
    exclude: Iterable[str] = tuple(),
    name: str = "ArgModel",
    strict: bool = True,
) -> ModelMetaclass:
    """Generate a pydantic model for function arguments.

    func (Callable): The function to generate the schema for.
    exclude (Iterable[str]): Parameter names to ignore.
    name (str): Name of created model class.
    strict (bool): Don't allow extra arguments if no variable keyword arguments
        are allowed on the function.
    RETURNS (ModelMetaclass): A pydantic model.
    """
    sig_args = {}
    try:
        sig = inspect.signature(func)
    except ValueError:
        # Typically happens if the method is part of a Cython module without
        # binding=True. Here we just use an empty model with the non-strict
        # config.
        return create_model(name, __config__=ArgSchemaConfigExtra)
    has_variable = False
    for param in sig.parameters.values():
        if param.name in exclude:
            continue
        if param.kind == param.VAR_KEYWORD:
            # The function allows variable keyword arguments so we shouldn't
            # include **kwargs etc. in the schema and switch to non-strict
            # mode
            has_variable = True
            continue
        # param.empty is a sentinel, so compare by identity. Using != would
        # call the __ne__ of the annotation / default value, which can raise
        # or return a non-boolean (e.g. for array defaults).
        # If no annotation is specified assume it's anything
        annotation = param.annotation if param.annotation is not param.empty else Any
        # If no default value is specified assume that it's required. Cython
        # functions/methods will have param.empty for default value None so we
        # need to treat them differently
        default_empty = None if is_cython_func(func) else ...
        default = param.default if param.default is not param.empty else default_empty
        sig_args[param.name] = (annotation, default)
    is_strict = strict and not has_variable
    sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
    return create_model(name, **sig_args)
def validate_init_settings(
    func: Callable,
    settings: Dict[str, Any],
    *,
    section: Optional[str] = None,
    name: str = "",
    exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
    """Check initialization settings against the arguments a method expects.

    The settings are run through a pydantic model generated from the method
    signature, so values are parsed where possible (e.g. int to string) and
    the parsed settings are returned. Mismatched types or missing required
    values raise a ConfigValidationError.

    func (Callable): The initialize method of a given component etc.
    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, for error message.
    name (str): Name of the block in the section.
    exclude (Iterable[str]): Parameter names to exclude from schema.
    RETURNS (Dict[str, Any]): The validated settings.
    """
    arg_model = get_arg_model(func, exclude=exclude, name="InitArgModel")
    try:
        return arg_model(**settings).dict()
    except ValidationError as err:
        block = f"initialize.{section}" if section else "initialize"
        title = f"Error validating initialization settings in [{block}]"
        # "from None" keeps the pydantic traceback out of the error output
        raise ConfigValidationError(
            title=title, errors=err.errors(), config=settings, parent=name
        ) from None
2019-12-25 14:39:49 +03:00
# Matcher token patterns
2020-07-25 16:01:15 +03:00
def validate_token_pattern(obj: list) -> List[str]:
    """Validate a Matcher token pattern against TokenPatternSchema.

    obj (list): The pattern, typically a list of dicts keyed by attribute
        name or integer attribute ID.
    RETURNS (List[str]): A list of error messages, if available.
    """

    def get_key(key):
        # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"}).
        # The lower bound matters: a negative int would otherwise index NAMES
        # from the end and silently turn into an unrelated attribute name.
        if isinstance(key, int) and 0 <= key < len(NAMES):
            return NAMES[key]
        return key

    if isinstance(obj, list):
        # Build a new list so the caller's pattern is left untouched
        obj = [
            {get_key(k): v for k, v in pattern.items()}
            if isinstance(pattern, dict)
            else pattern
            for pattern in obj
        ]
    return validate(TokenPatternSchema, {"pattern": obj})
class TokenPatternString(BaseModel):
    # Extended pattern syntax for string-valued attributes. Every predicate is
    # optional and can be given by field name or by its lower-case alias.
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictStr]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
    INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")

    class Config:
        extra = "forbid"
        allow_population_by_field_name = True  # allow alias and field name

    # Registered for all fields, runs before pydantic's own validation
    @validator("*", pre=True, each_item=True, allow_reuse=True)
    def raise_for_none(cls, value):
        if value is not None:
            return value
        raise ValueError("None / null is not allowed")
class TokenPatternNumber(BaseModel):
    # Extended pattern syntax for numeric attributes: the set-style predicates
    # plus comparison operators, which are aliased to their symbol.
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictInt]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
    INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")

    class Config:
        extra = "forbid"
        allow_population_by_field_name = True  # allow alias and field name

    # Registered for all fields, runs before pydantic's own validation
    @validator("*", pre=True, each_item=True, allow_reuse=True)
    def raise_for_none(cls, value):
        if value is not None:
            return value
        raise ValueError("None / null is not allowed")
class TokenPatternOperator(str, Enum):
    # Allowed values for the "op" key of a token pattern. Members are also
    # plain strings because of the str mixin.
    plus: StrictStr = "+"
    start: StrictStr = "*"
    question: StrictStr = "?"
    exclamation: StrictStr = "!"
# A string attribute is either a plain string or an extended-syntax dict
StringValue = Union[TokenPatternString, StrictStr]
# A numeric attribute is either a plain number or an extended-syntax dict
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
# Values under the "_" key are validated non-strictly
UnderscoreValue = Union[
    TokenPatternString,
    TokenPatternNumber,
    str,
    int,
    float,
    list,
    bool,
]
class TokenPattern(BaseModel):
    # Schema for a single token description in a Matcher pattern.
    # String-valued attributes
    orth: Optional[StringValue] = None
    text: Optional[StringValue] = None
    lower: Optional[StringValue] = None
    pos: Optional[StringValue] = None
    tag: Optional[StringValue] = None
    morph: Optional[StringValue] = None
    dep: Optional[StringValue] = None
    lemma: Optional[StringValue] = None
    shape: Optional[StringValue] = None
    ent_type: Optional[StringValue] = None
    norm: Optional[StringValue] = None
    # Numeric attribute
    length: Optional[NumberValue] = None
    # Boolean flags
    spacy: Optional[StrictBool] = None
    is_alpha: Optional[StrictBool] = None
    is_ascii: Optional[StrictBool] = None
    is_digit: Optional[StrictBool] = None
    is_lower: Optional[StrictBool] = None
    is_upper: Optional[StrictBool] = None
    is_title: Optional[StrictBool] = None
    is_punct: Optional[StrictBool] = None
    is_space: Optional[StrictBool] = None
    is_bracket: Optional[StrictBool] = None
    is_quote: Optional[StrictBool] = None
    is_left_punct: Optional[StrictBool] = None
    is_right_punct: Optional[StrictBool] = None
    is_currency: Optional[StrictBool] = None
    is_stop: Optional[StrictBool] = None
    is_sent_start: Optional[StrictBool] = None
    sent_start: Optional[StrictBool] = None
    like_num: Optional[StrictBool] = None
    like_url: Optional[StrictBool] = None
    like_email: Optional[StrictBool] = None
    # Operator and custom extension attributes
    op: Optional[TokenPatternOperator] = None
    underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_")

    class Config:
        extra = "forbid"
        allow_population_by_field_name = True
        # Fields without an explicit alias get their upper-cased name as alias
        alias_generator = lambda value: value.upper()

    # Registered for all fields, runs before pydantic's own validation
    @validator("*", pre=True, allow_reuse=True)
    def raise_for_none(cls, value):
        if value is not None:
            return value
        raise ValueError("None / null is not allowed")
class TokenPatternSchema(BaseModel):
    # Wrapper model for a full pattern: a required list of at least one token
    pattern: List[TokenPattern] = Field(..., min_items=1)

    class Config:
        extra = "forbid"
# Model meta
class ModelMetaSchema(BaseModel):
    # Schema for a pipeline's meta data. Only lang, name and version are
    # required, everything else has a default.
    # fmt: off
    lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'")
    name: StrictStr = Field(..., title="Model name")
    version: StrictStr = Field(..., title="Model version")
    spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
    parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
    requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
    pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
    description: StrictStr = Field("", title="Model description")
    license: StrictStr = Field("", title="Model license")
    author: StrictStr = Field("", title="Model author name")
    email: StrictStr = Field("", title="Model author email")
    url: StrictStr = Field("", title="Model author URL")
    sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
    vectors: Dict[str, Any] = Field({}, title="Included word vectors")
    labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
    performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
    spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
    # fmt: on
2020-07-10 14:31:27 +03:00
# Config schema
# We're not setting any defaults here (which is too messy) and are making all
# fields required, so we can raise validation errors for missing values. To
# provide a default, we include a separate .cfg file with all values and
# check that against this schema in the test suite to make sure it's always
# up to date.
class ConfigSchemaTraining(BaseModel):
    # Schema for the [training] block. All fields are required (no defaults),
    # although the Optional ones accept None as their value.
    # fmt: off
    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
    batcher: Batcher = Field(..., title="Batcher for the training data")
    dropout: StrictFloat = Field(..., title="Dropout rate")
    patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
    max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
    eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
    seed: Optional[StrictInt] = Field(..., title="Random seed")
    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    logger: Logger = Field(..., title="The logger to track training progress")
    frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
    annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
    before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
    # fmt: on

    class Config:
        extra = "forbid"
        # Needed for non-pydantic types such as the optimizer
        arbitrary_types_allowed = True
2020-07-10 14:31:27 +03:00
class ConfigSchemaNlp(BaseModel):
    # Schema for the [nlp] block. All fields are required (no defaults),
    # although the Optional ones accept None as their value.
    # fmt: off
    lang: StrictStr = Field(..., title="The base language to use")
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
    after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
    batch_size: Optional[int] = Field(..., title="Default batch size")
    # fmt: on

    class Config:
        extra = "forbid"
        arbitrary_types_allowed = True
2020-08-24 16:56:03 +03:00
class ConfigSchemaPretrainEmpty(BaseModel):
    # Model without fields that rejects any key, i.e. it only accepts an
    # empty block
    class Config:
        extra = "forbid"
2020-07-11 14:03:53 +03:00
class ConfigSchemaPretrain(BaseModel):
    # Schema for the [pretraining] block. All fields are required (no
    # defaults), although the Optional ones accept None as their value.
    # fmt: off
    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
    dropout: StrictFloat = Field(..., title="Dropout rate")
    n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch")
    n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    corpus: StrictStr = Field(..., title="Path in the config to the training data")
    batcher: Batcher = Field(..., title="Batcher for the training data")
    component: str = Field(..., title="Component to find the layer to pretrain")
    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
    objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
    # fmt: on

    class Config:
        extra = "forbid"
        arbitrary_types_allowed = True
2020-09-29 17:47:55 +03:00
class ConfigSchemaInit(BaseModel):
    # Schema for the [initialize] block. All fields are required (no
    # defaults), although the Optional ones accept None as their value.
    # fmt: off
    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
    lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    # These two used help=..., which isn't a Field argument and only ended up
    # as an extra schema key. Use title like every other field in this file.
    tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize")
    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
    # fmt: on

    class Config:
        extra = "forbid"
        # Needed for non-pydantic types such as Lookups
        arbitrary_types_allowed = True
2020-07-10 14:31:27 +03:00
class ConfigSchema(BaseModel):
    # Top-level config schema. Only pretraining has a default, and the block
    # may be empty. Unknown top-level sections are allowed.
    training: ConfigSchemaTraining
    nlp: ConfigSchemaNlp
    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
    components: Dict[str, Dict[str, Any]]
    corpora: Dict[str, Reader]
    initialize: ConfigSchemaInit

    class Config:
        extra = "allow"
        arbitrary_types_allowed = True
2020-06-21 14:44:00 +03:00
2020-09-29 21:38:35 +03:00
# Config section name -> schema used for that section
CONFIG_SCHEMAS = dict(
    nlp=ConfigSchemaNlp,
    training=ConfigSchemaTraining,
    pretraining=ConfigSchemaPretrain,
    initialize=ConfigSchemaInit,
)
2020-09-27 23:21:31 +03:00
2020-06-21 14:44:00 +03:00
# Project config Schema
2020-08-25 12:54:53 +03:00
class ProjectConfigAssetGitItem(BaseModel):
    # Location of an asset within a Git repo. The branch defaults to "master".
    # fmt: off
    repo: StrictStr = Field(..., title="URL of Git repo to download from")
    path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
    branch: StrictStr = Field("master", title="Branch to clone from")
    # fmt: on
class ProjectConfigAssetURL(BaseModel):
    # An asset identified by its destination path and, optionally, the URL to
    # download it from.
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
    url: Optional[StrictStr] = Field(None, title="URL of asset")
    # The pattern is anchored at both ends. pydantic applies it with
    # re.match, so without the trailing $ any string that merely starts with
    # 32 hex characters was accepted as an MD5 hash.
    checksum: str = Field(None, title="MD5 hash of file", regex=r"^([a-fA-F\d]{32})$")
    description: StrictStr = Field("", title="Description of asset")
    # fmt: on
2020-06-21 14:44:00 +03:00
2020-08-25 12:54:53 +03:00
class ProjectConfigAssetGit(BaseModel):
    # An asset that is fetched from a Git repo.
    # fmt: off
    git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
    # The pattern is anchored at both ends. pydantic applies it with
    # re.match, so without the trailing $ any string that merely starts with
    # 32 hex characters was accepted as an MD5 hash.
    checksum: str = Field(None, title="MD5 hash of file", regex=r"^([a-fA-F\d]{32})$")
    description: Optional[StrictStr] = Field(None, title="Description of asset")
    # fmt: on
2020-06-21 14:44:00 +03:00
class ProjectConfigCommand(BaseModel):
    # One entry of the "commands" list in a project config. Only the name is
    # required.
    # fmt: off
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
    # fmt: on

    class Config:
        title = "A single named command specified in a project config"
        extra = "forbid"
2020-06-21 14:44:00 +03:00
class ProjectConfigSchema(BaseModel):
    # Schema for a project configuration file. Every field has a default, so
    # an empty config validates.
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    # Title typo fixed ("shortucts" -> "shortcuts")
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    title: Optional[str] = Field(None, title="Project title")
    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"
2020-08-19 14:33:15 +03:00
# Recommendations for init config workflows
class RecommendationTrfItem(BaseModel):
    # One recommended transformer: both fields are required
    name: str
    size_factor: int
class RecommendationTrf(BaseModel):
    # A transformer recommendation for each of the two optimization targets
    efficiency: RecommendationTrfItem
    accuracy: RecommendationTrfItem
class RecommendationSchema(BaseModel):
    # Recommendations for the init config workflows. Vectors and transformer
    # are optional, has_letters defaults to True.
    word_vectors: Optional[str] = None
    transformer: Optional[RecommendationTrf] = None
    has_letters: bool = True