mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
rewrite train_corpus to corpus.train in config
This commit is contained in:
parent
bd87e8686e
commit
51fa929f47
|
@ -21,14 +21,16 @@ eval_frequency = 200
|
||||||
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
|
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
|
||||||
frozen_components = []
|
frozen_components = []
|
||||||
|
|
||||||
[training.train_corpus]
|
[training.corpus]
|
||||||
|
|
||||||
|
[training.corpus.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths:train}
|
path = ${paths:train}
|
||||||
gold_preproc = true
|
gold_preproc = true
|
||||||
max_length = 0
|
max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.dev_corpus]
|
[training.corpus.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths:dev}
|
path = ${paths:dev}
|
||||||
gold_preproc = ${training.read_train:gold_preproc}
|
gold_preproc = ${training.corpus.train:gold_preproc}
|
||||||
|
|
|
@ -20,14 +20,16 @@ patience = 10000
|
||||||
eval_frequency = 200
|
eval_frequency = 200
|
||||||
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
|
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
|
||||||
|
|
||||||
[training.read_train]
|
[training.corpus]
|
||||||
|
|
||||||
|
[training.corpus.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths:train}
|
path = ${paths:train}
|
||||||
gold_preproc = true
|
gold_preproc = true
|
||||||
max_length = 0
|
max_length = 0
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.read_dev]
|
[training.corpus.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths:dev}
|
path = ${paths:dev}
|
||||||
gold_preproc = ${training.read_train:gold_preproc}
|
gold_preproc = ${training.corpus.train:gold_preproc}
|
||||||
|
|
|
@ -195,12 +195,14 @@ total_steps = 20000
|
||||||
initial_rate = 5e-5
|
initial_rate = 5e-5
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
[training.train_corpus]
|
[training.corpus]
|
||||||
|
|
||||||
|
[training.corpus.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||||
|
|
||||||
[training.dev_corpus]
|
[training.corpus.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.dev}
|
path = ${paths.dev}
|
||||||
max_length = 0
|
max_length = 0
|
||||||
|
|
|
@ -92,8 +92,8 @@ def train(
|
||||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
||||||
T_cfg = config["training"]
|
T_cfg = config["training"]
|
||||||
optimizer = T_cfg["optimizer"]
|
optimizer = T_cfg["optimizer"]
|
||||||
train_corpus = T_cfg["train_corpus"]
|
train_corpus = T_cfg["corpus"]["train"]
|
||||||
dev_corpus = T_cfg["dev_corpus"]
|
dev_corpus = T_cfg["corpus"]["dev"]
|
||||||
batcher = T_cfg["batcher"]
|
batcher = T_cfg["batcher"]
|
||||||
train_logger = T_cfg["logger"]
|
train_logger = T_cfg["logger"]
|
||||||
# Components that shouldn't be updated during training
|
# Components that shouldn't be updated during training
|
||||||
|
|
|
@ -44,7 +44,9 @@ frozen_components = []
|
||||||
[training.logger]
|
[training.logger]
|
||||||
@loggers = "spacy.ConsoleLogger.v1"
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
|
||||||
[training.train_corpus]
|
[training.corpus]
|
||||||
|
|
||||||
|
[training.corpus.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||||
|
@ -56,7 +58,7 @@ max_length = 0
|
||||||
# Limitation on number of training examples
|
# Limitation on number of training examples
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.dev_corpus]
|
[training.corpus.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.dev}
|
path = ${paths.dev}
|
||||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||||
|
|
|
@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel):
|
||||||
class ConfigSchemaTraining(BaseModel):
|
class ConfigSchemaTraining(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||||
train_corpus: Reader = Field(..., title="Reader for the training data")
|
corpus: Reader = Field(..., title="Reader for the training and dev data")
|
||||||
dev_corpus: Reader = Field(..., title="Reader for the dev data")
|
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||||
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
|
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
|
||||||
|
|
|
@ -19,11 +19,13 @@ dev = ""
|
||||||
|
|
||||||
[training]
|
[training]
|
||||||
|
|
||||||
[training.train_corpus]
|
[training.corpus]
|
||||||
|
|
||||||
|
[training.corpus.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
|
|
||||||
[training.dev_corpus]
|
[training.corpus.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.dev}
|
path = ${paths.dev}
|
||||||
|
|
||||||
|
@ -300,20 +302,20 @@ def test_config_overrides():
|
||||||
|
|
||||||
def test_config_interpolation():
|
def test_config_interpolation():
|
||||||
config = Config().from_str(nlp_config_string, interpolate=False)
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
|
assert config["training"]["corpus"]["train"]["path"] == "${paths.train}"
|
||||||
interpolated = config.interpolate()
|
interpolated = config.interpolate()
|
||||||
assert interpolated["training"]["train_corpus"]["path"] == ""
|
assert interpolated["training"]["corpus"]["train"]["path"] == ""
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
|
assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}"
|
||||||
# Ensure that variables are preserved in nlp config
|
# Ensure that variables are preserved in nlp config
|
||||||
width = "${components.tok2vec.model.width}"
|
width = "${components.tok2vec.model.width}"
|
||||||
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
interpolated2 = nlp.config.interpolate()
|
interpolated2 = nlp.config.interpolate()
|
||||||
assert interpolated2["training"]["train_corpus"]["path"] == ""
|
assert interpolated2["training"]["corpus"]["train"]["path"] == ""
|
||||||
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
nlp2 = English.from_config(interpolated)
|
nlp2 = English.from_config(interpolated)
|
||||||
assert nlp2.config["training"]["train_corpus"]["path"] == ""
|
assert nlp2.config["training"]["corpus"]["train"]["path"] == ""
|
||||||
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ streaming.
|
||||||
> [paths]
|
> [paths]
|
||||||
> train = "corpus/train.spacy"
|
> train = "corpus/train.spacy"
|
||||||
>
|
>
|
||||||
> [training.train_corpus]
|
> [training.corpus.train]
|
||||||
> @readers = "spacy.Corpus.v1"
|
> @readers = "spacy.Corpus.v1"
|
||||||
> path = ${paths.train}
|
> path = ${paths.train}
|
||||||
> gold_preproc = false
|
> gold_preproc = false
|
||||||
|
|
|
@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
|
||||||
This section defines settings and controls for the training and evaluation
|
This section defines settings and controls for the training and evaluation
|
||||||
process that are used when you run [`spacy train`](/api/cli#train).
|
process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
||||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Both default to [`Corpus`](/api/top-level#Corpus). ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||||
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
||||||
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
|
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
|
||||||
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
|
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
|
||||||
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
|
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
|
||||||
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
||||||
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
|
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
|
||||||
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||||
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
|
||||||
|
|
|
@ -448,7 +448,7 @@ remain in the config file stored on your local system.
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.WandbLogger.v1"
|
> @loggers = "spacy.WandbLogger.v1"
|
||||||
> project_name = "monitor_spacy_training"
|
> project_name = "monitor_spacy_training"
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class.
|
||||||
> [paths]
|
> [paths]
|
||||||
> train = "corpus/train.spacy"
|
> train = "corpus/train.spacy"
|
||||||
>
|
>
|
||||||
> [training.train_corpus]
|
> [training.corpus.train]
|
||||||
> @readers = "spacy.Corpus.v1"
|
> @readers = "spacy.Corpus.v1"
|
||||||
> path = ${paths.train}
|
> path = ${paths.train}
|
||||||
> gold_preproc = false
|
> gold_preproc = false
|
||||||
|
|
|
@ -969,7 +969,7 @@ your results.
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.WandbLogger.v1"
|
> @loggers = "spacy.WandbLogger.v1"
|
||||||
> project_name = "monitor_spacy_training"
|
> project_name = "monitor_spacy_training"
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
||||||
|
|
|
@ -746,7 +746,7 @@ as **config settings** – in this case, `source`.
|
||||||
> #### config.cfg
|
> #### config.cfg
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.train_corpus]
|
> [training.corpus.train]
|
||||||
> @readers = "corpus_variants.v1"
|
> @readers = "corpus_variants.v1"
|
||||||
> source = "s3://your_bucket/path/data.csv"
|
> source = "s3://your_bucket/path/data.csv"
|
||||||
> ```
|
> ```
|
||||||
|
|
Loading…
Reference in New Issue
Block a user