Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
Update config
This commit is contained in:
parent 9f6ad06452
commit 1590de11b1
@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training

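For orientation, the two remaining dot names are string paths into the config's [corpora] block that get resolved to corpus readers later in initialization; training.raw_text no longer takes part. A minimal sketch of that layout, using thinc's Config directly (the section contents and reader names are illustrative, not taken from this commit):

```python
from thinc.api import Config

cfg = Config().from_str("""
[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = "train.spacy"

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = "dev.spacy"

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
""")
T = cfg["training"]
dot_names = [T["train_corpus"], T["dev_corpus"]]
print(dot_names)  # ['corpora.train', 'corpora.dev']
```
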
@@ -130,20 +131,15 @@ def init_vocab(


 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():

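The raw-text loading is gone from this helper, but the remaining init_tok2vec handling still follows the usual ensure-path-then-check pattern. A small standalone sketch of that pattern (the helper and the weights filename below are stand-ins, not spaCy's actual util.ensure_path):

```python
from pathlib import Path

def ensure_path(path):
    # Stand-in for the idea behind util.ensure_path: normalize str -> Path,
    # pass None or an existing Path through unchanged.
    return Path(path) if isinstance(path, str) else path

init_tok2vec = ensure_path("pretrain/model999.bin")  # hypothetical weights file
if init_tok2vec is not None and not init_tok2vec.exists():
    raise SystemExit(f"Can't find pretrained tok2vec: {init_tok2vec}")
```
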
@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0

 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}

@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}

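To see what the added [initialize.vocab] branch renders to, here is a minimal sketch that runs the same Jinja snippet directly (the variable values are made up for illustration):

```python
from jinja2 import Template

snippet = """[initialize.vocab]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
"""
# A CPU pipeline optimized for accuracy, with vectors available, takes the else branch:
print(Template(snippet).render(use_transformer=False, optimize="accuracy",
                               word_vectors="en_core_web_lg"))
# Prints roughly:
#   [initialize.vocab]
#   vectors = "en_core_web_lg"
```
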
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null

 [system]
 seed = 0

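A sketch of how the renamed raw_text and new vocab_data entries would typically be filled in at load time (the file names are placeholders, and this assumes thinc's Config.from_str accepts an overrides mapping):

```python
from thinc.api import Config

defaults = """
[paths]
train = ""
dev = ""
raw_text = null
init_tok2vec = null
vocab_data = null
"""
# Hypothetical project-specific values supplied as overrides:
config = Config().from_str(
    defaults,
    overrides={"paths.raw_text": "assets/raw.jsonl",
               "paths.vocab_data": "assets/vocab.jsonl"},
)
print(config["paths"]["raw_text"], config["paths"]["vocab_data"])
```
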
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0

@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}

 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}

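With the default wiring above, initialize.vocab.data now follows paths.vocab_data through variable interpolation, and the raw_text entry is gone from this block. A minimal sketch, trimmed to the relevant keys (assumes a null variable interpolates to None):

```python
from thinc.api import Config

cfg = Config().from_str("""
[paths]
vocab_data = null
init_tok2vec = null

[initialize]

[initialize.vocab]
data = ${paths.vocab_data}
init_tok2vec = ${paths.init_tok2vec}
""")
# Both values stay None until the corresponding [paths] entries are set:
print(cfg["initialize"]["vocab"]["data"], cfg["initialize"]["vocab"]["init_tok2vec"])
```
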
@@ -32,7 +32,7 @@ learn_rate = 0.001

 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0

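For reference, the JsonlReader that paths.raw_text now feeds reads newline-delimited JSON; a small sketch of producing such a file with srsly (the filename and the assumption that each record carries a "text" field are illustrative):

```python
import srsly

lines = [{"text": "Raw sentence one."}, {"text": "Raw sentence two."}]
srsly.write_jsonl("raw_text.jsonl", lines)  # hypothetical corpus file
print(next(iter(srsly.read_jsonl("raw_text.jsonl"))))  # {'text': 'Raw sentence one.'}
```
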
@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer

@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):

 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")

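One practical effect of dropping fields from a schema that forbids extra keys (the extra = "forbid" pattern visible in the last hunk below) is that configs still setting training.vectors or training.lookups should now fail validation. A toy sketch with a made-up mini schema, assuming pydantic v1 semantics:

```python
from pydantic import BaseModel, StrictStr, ValidationError

class MiniTrainingSchema(BaseModel):
    # Invented subset of ConfigSchemaTraining, just to show the behaviour.
    dev_corpus: StrictStr
    train_corpus: StrictStr

    class Config:
        extra = "forbid"

try:
    MiniTrainingSchema(dev_corpus="corpora.dev", train_corpus="corpora.train",
                       vectors="en_vectors_web_lg")
except ValidationError as err:
    print(err.errors()[0]["msg"])  # "extra fields not permitted"
```
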
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")

@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):

 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on

     class Config:

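The Optional[str] to Optional[StrictStr] change tightens validation: under pydantic v1 semantics a plain str field coerces compatible values such as bytes or numbers, while StrictStr only accepts actual strings. A minimal sketch:

```python
from typing import Optional
from pydantic import BaseModel, StrictStr, ValidationError

class LooseVocab(BaseModel):
    data: Optional[str] = None

class StrictVocab(BaseModel):
    data: Optional[StrictStr] = None

print(LooseVocab(data=b"vocab.jsonl").data)  # bytes silently coerced to "vocab.jsonl"
try:
    StrictVocab(data=b"vocab.jsonl")
except ValidationError:
    print("StrictStr rejects non-str input")
```
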
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]

     class Config:
         extra = "forbid"

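Likewise for the components mapping: with Dict[StrictStr, Any], the keys themselves must already be strings instead of being coerced. A toy sketch (the component name is invented), again assuming pydantic v1 semantics:

```python
from typing import Any, Dict
from pydantic import BaseModel, StrictStr, ValidationError

class MiniInitSchema(BaseModel):
    components: Dict[StrictStr, Any]

print(MiniInitSchema(components={"ner": {}}).components)  # {'ner': {}}
try:
    MiniInitSchema(components={1: {}})  # non-string key
except ValidationError:
    print("non-string component names are rejected")
```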