From bd87e8686e05487116c3a0c631bcb789059b2636 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:40:38 +0200 Subject: [PATCH 01/13] move tests to correct subdir --- spacy/tests/{ => pipeline}/test_tok2vec.py | 2 +- spacy/tests/training/__init__.py | 0 spacy/tests/{ => training}/test_training.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename spacy/tests/{ => pipeline}/test_tok2vec.py (99%) create mode 100644 spacy/tests/training/__init__.py rename spacy/tests/{ => training}/test_training.py (99%) diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py similarity index 99% rename from spacy/tests/test_tok2vec.py rename to spacy/tests/pipeline/test_tok2vec.py index fb30c6ae5..0365554bc 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English -from .util import get_batch +from ..util import get_batch from thinc.api import Config diff --git a/spacy/tests/training/__init__.py b/spacy/tests/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/test_training.py b/spacy/tests/training/test_training.py similarity index 99% rename from spacy/tests/test_training.py rename to spacy/tests/training/test_training.py index 1926aca1f..67cc37b1c 100644 --- a/spacy/tests/test_training.py +++ b/spacy/tests/training/test_training.py @@ -12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from .util import make_tempdir +from ..util import make_tempdir @pytest.fixture From 51fa929f47120272bd6b8dfbba1f000833446f0f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:58:04 +0200 Subject: [PATCH 02/13] rewrite train_corpus to corpus.train in config --- extra/experiments/onto-joint/defaults.cfg | 6 ++-- .../ptb-joint-pos-dep/defaults.cfg | 6 ++-- spacy/cli/templates/quickstart_training.jinja | 6 ++-- 
spacy/cli/train.py | 4 +-- spacy/default_config.cfg | 6 ++-- spacy/schemas.py | 3 +- .../tests/serialize/test_serialize_config.py | 16 +++++---- website/docs/api/corpus.md | 2 +- website/docs/api/data-formats.md | 35 +++++++++---------- website/docs/api/top-level.md | 4 +-- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 12 files changed, 50 insertions(+), 42 deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 7954b57b5..97eebe6b4 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -21,14 +21,16 @@ eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 8f9c5666e..03e2f5bd7 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -20,14 +20,16 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.read_train] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.read_dev] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 199aae217..39d4d875d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ 
-195,12 +195,14 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} max_length = 0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ae4a8455e..2c2eeb88b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,8 +92,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["train_corpus"] - dev_corpus = T_cfg["dev_corpus"] + train_corpus = T_cfg["corpus"]["train"] + dev_corpus = T_cfg["corpus"]["dev"] batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7cd71453f..61f3dfe25 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -44,7 +44,9 @@ frozen_components = [] [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries @@ -56,7 +58,7 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries diff --git a/spacy/schemas.py b/spacy/schemas.py index 0dd2b9204..d8bcf3c1d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - train_corpus: Reader = Field(..., title="Reader for the training data") 
- dev_corpus: Reader = Field(..., title="Reader for the dev data") + corpus: Reader = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 0ab212fda..d113ac2a5 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -19,11 +19,13 @@ dev = "" [training] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} @@ -300,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths.train}" + assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["train_corpus"]["path"] == "" + assert interpolated["training"]["corpus"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" + assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["train_corpus"]["path"] == "" + assert interpolated2["training"]["corpus"]["train"]["path"] == "" assert 
interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["train_corpus"]["path"] == "" + assert nlp2.config["training"]["corpus"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 0f49b02e3..c25ce1651 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..74d612862 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. 
~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. 
~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. 
~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..be7994d5d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 9776dab1b..3a6bd4551 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. 
> [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 65cfb563b..bba2e2853 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. > #### config.cfg > > ```ini -> [training.train_corpus] +> [training.corpus.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 733665766205f350398d3216e94ab8a5ac6c3751 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:07:16 +0200 Subject: [PATCH 03/13] corpus is a Dict --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index d8bcf3c1d..2030048d8 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Reader = Field(..., title="Reader for the training and dev data") + corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") From 55f8d5478ecb5fd913a3a5fe7c469e8bc8a4f038 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:09:30 +0200 Subject: [PATCH 04/13] fix example output --- website/docs/api/cli.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 
deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..7dd6e6184 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -272,7 +272,7 @@ training -> dropout field required training -> optimizer field required training -> optimize extra fields not permitted -{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} +{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}} If your config contains missing values, you can run the 'init fill-config' command to fill in all the defaults, if possible: @@ -370,7 +370,12 @@ Registry @schedules Name 
compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.dev_corpus] +ℹ [training.corpus.dev] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) +ℹ [training.corpus.train] Registry @readers Name spacy.Corpus.v1 Module spacy.training.corpus @@ -385,11 +390,6 @@ Registry @schedules Name warmup_linear.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 91) -ℹ [training.train_corpus] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ``` From 714a5a05c65e28b5264d16e7dba202126de2cbfb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:39:55 +0200 Subject: [PATCH 05/13] test for custom readers with ml_datasets >= 0.2 --- spacy/pipeline/textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 3f6250680..e7cb62a0d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -181,9 +181,9 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#predict """ - tensors = [doc.tensor for doc in docs] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
+ tensors = [doc.tensor for doc in docs] xp = get_array_module(tensors) scores = xp.zeros((len(docs), len(self.labels))) return scores From 1040e250d8f740db7d0a6b012962b25ce7f95ffb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:41:28 +0200 Subject: [PATCH 06/13] actual commit with test for custom readers with ml_datasets >= 0.2 --- requirements.txt | 2 +- spacy/tests/training/test_readers.py | 58 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/training/test_readers.py diff --git a/requirements.txt b/requirements.txt index db6eae2ef..a67ade640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.1.1 +ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py new file mode 100644 index 000000000..c81ec0897 --- /dev/null +++ b/spacy/tests/training/test_readers.py @@ -0,0 +1,58 @@ +import pytest +from thinc.api import Config +from spacy.util import load_model_from_config + + +@pytest.mark.slow +@pytest.mark.parametrize( + "reader,additional_config", + [ + ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), + ], +) +def test_cat_readers(reader, additional_config): + nlp_config_string = """ + [training] + + [training.corpus] + @readers = "PLACEHOLDER" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + config = Config().from_str(nlp_config_string) + config["training"]["corpus"]["@readers"] = reader + config["training"]["corpus"].update(additional_config) + nlp, resolved = 
load_model_from_config(config, auto_fill=True) + + train_corpus = resolved["training"]["corpus"]["train"] + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + assert example.y.cats + # this shouldn't fail if each training example has at least one positive label + assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + nlp.update([example], sgd=optimizer) + # simulate performance benchmark on dev corpus + dev_corpus = resolved["training"]["corpus"]["dev"] + dev_examples = list(dev_corpus(nlp)) + for example in dev_examples: + # this shouldn't fail if each dev example has at least one positive label + assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + scores = nlp.evaluate(dev_examples) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats From 0dc914b667706b4e598b61e3cfff0a85e820118f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:42:58 +0200 Subject: [PATCH 07/13] bump thinc to 8.0.0a33 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e610e603e..a413a099c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a31,<8.0.0a40", + "thinc>=8.0.0a33,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index a67ade640..69477c2d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a31,<8.0.0a40 +thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 10a8972b0..359e63172 100644 --- a/setup.cfg +++ b/setup.cfg @@ 
-34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 21dcf92964c6a2c4218d5ffc44a164dead641c44 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 09:21:36 +0200 Subject: [PATCH 08/13] Update website/docs/api/data-formats.md Co-authored-by: Matthew Honnibal --- website/docs/api/data-formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 74d612862..cf091e16c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -130,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. 
Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | From 0c35885751f2ad83098f54103de33b987b4a199e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:38:59 +0200 Subject: [PATCH 09/13] generalize corpora, dot notation for dev and train corpus --- extra/experiments/onto-joint/defaults.cfg | 34 +++--- .../ptb-joint-pos-dep/defaults.cfg | 32 +++--- spacy/cli/pretrain.py | 3 +- spacy/cli/templates/quickstart_training.jinja | 27 ++--- spacy/cli/train.py | 5 +- spacy/default_config.cfg | 56 +++++---- spacy/default_config_pretraining.cfg | 17 +-- spacy/schemas.py | 6 +- .../tests/serialize/test_serialize_config.py | 20 ++-- spacy/tests/training/test_readers.py | 63 ++++++++++- website/docs/api/cli.md | 20 ++-- website/docs/api/corpus.md | 4 +- website/docs/api/data-formats.md | 107 +++++++++++++----- website/docs/api/top-level.md | 6 +- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 16 files changed, 261 insertions(+), 143 deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 97eebe6b4..90101281c 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = 
${system:seed} dropout = 0.1 @@ -20,22 +36,8 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] - -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 03e2f5bd7..55fb52b99 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = ${system:seed} dropout = 0.2 @@ -20,22 +36,6 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 - [training.batcher] @batchers = "spacy.batch_by_words.v1" discard_oversize = false diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 70858123d..3567e7339 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -20,6 +20,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model from 
..tokens import Doc from ..attrs import ID from .. import util +from ..util import dot_to_object @app.command( @@ -106,7 +107,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = P_cfg["corpus"] + corpus = dot_to_object(config, config["pretraining"]["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 39d4d875d..00b77af4d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -173,6 +173,18 @@ factory = "{{ pipe }}" {% endif %} {% endfor %} +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = {{ 500 if hardware == "gpu" else 2000 }} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 + [training] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = null @@ -182,11 +194,12 @@ vectors = "{{ word_vectors }}" {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} {% endif %} +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.optimizer] @optimizers = "Adam.v1" - {% if use_transformer -%} [training.optimizer.learn_rate] @schedules = "warmup_linear.v1" @@ -195,18 +208,6 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 - {% if use_transformer %} [training.batcher] @batchers = "spacy.batch_by_padded.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2c2eeb88b..15c745b69 100644 --- a/spacy/cli/train.py +++ 
b/spacy/cli/train.py @@ -18,6 +18,7 @@ from ..language import Language from .. import util from ..training.example import Example from ..errors import Errors +from ..util import dot_to_object @app.command( @@ -92,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["corpus"]["train"] - dev_corpus = T_cfg["corpus"]["dev"] + train_corpus = dot_to_object(config, config["training"]["train_corpus"]) + dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 61f3dfe25..c7c9593d7 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -22,6 +22,33 @@ after_pipeline_creation = null [components] +# Readers for corpora like dev and train. +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + # Training hyper-parameters and additional features. 
[training] seed = ${system.seed} @@ -40,35 +67,14 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Location in the config where the dev corpus is defined +dev_corpus = "corpora.dev" +# Location in the config where the train corpus is defined +train_corpus = "corpora.train" [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. 
-gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 9120db338..bbd595308 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -4,6 +4,7 @@ dropout = 0.2 n_save_every = null component = "tok2vec" layer = "" +corpus = "corpora.pretrain" [pretraining.batcher] @batchers = "spacy.batch_by_words.v1" @@ -12,13 +13,6 @@ discard_oversize = false tolerance = 0.2 get_length = null -[pretraining.corpus] -@readers = "spacy.JsonlReader.v1" -path = ${paths.raw} -min_length = 5 -max_length = 500 -limit = 0 - [pretraining.objective] type = "characters" n_characters = 4 @@ -33,3 +27,12 @@ grad_clip = 1.0 use_averages = true eps = 1e-8 learn_rate = 0.001 + +[corpora] + +[corpora.pretrain] +@readers = "spacy.JsonlReader.v1" +path = ${paths.raw} +min_length = 5 +max_length = 500 +limit = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 2030048d8..a530db3d0 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,8 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") + dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") + train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") @@ -248,7 +249,7 @@ class ConfigSchemaPretrain(BaseModel): dropout: StrictFloat = Field(..., title="Dropout rate") n_save_every: Optional[StrictInt] 
= Field(..., title="Saving frequency") optimizer: Optimizer = Field(..., title="The optimizer to use") - corpus: Reader = Field(..., title="Reader for the training data") + corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") @@ -267,6 +268,7 @@ class ConfigSchema(BaseModel): nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] + corpora: Dict[str, Reader] @root_validator(allow_reuse=True) def validate_config(cls, values): diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index d113ac2a5..1e17b3212 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -17,18 +17,18 @@ nlp_config_string = """ train = "" dev = "" -[training] +[corpora] -[training.corpus] - -[training.corpus.train] +[corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.corpus.dev] +[corpora.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} +[training] + [training.batcher] @batchers = "spacy.batch_by_words.v1" size = 666 @@ -302,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["corpus"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" 
# Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["corpus"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["corpus"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c81ec0897..52a4abecc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,6 +1,57 @@ +from typing import Dict, Iterable, Callable import pytest from thinc.api import Config -from spacy.util import load_model_from_config + +from spacy import Language +from spacy.util import load_model_from_config, registry, dot_to_object +from spacy.training import Example + + +def test_readers(): + config_string = """ + [training] + + [corpora] + @readers = "myreader.v1" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + @registry.readers.register("myreader.v1") + def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: + annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): + doc = nlp.make_doc(f"This is an example") + return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} + + config = Config().from_str(config_string) + nlp, resolved = load_model_from_config(config, auto_fill=True) + + 
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + assert isinstance(train_corpus, Callable) + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + nlp.update([example], sgd=optimizer) + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) + scores = nlp.evaluate(list(dev_corpus(nlp))) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats + extra_corpus = resolved["corpora"]["extra"] + assert isinstance(extra_corpus, Callable) @pytest.mark.slow @@ -16,7 +67,7 @@ def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - [training.corpus] + [corpora] @readers = "PLACEHOLDER" [nlp] @@ -32,11 +83,11 @@ def test_cat_readers(reader, additional_config): factory = "textcat" """ config = Config().from_str(nlp_config_string) - config["training"]["corpus"]["@readers"] = reader - config["training"]["corpus"].update(additional_config) + config["corpora"]["@readers"] = reader + config["corpora"].update(additional_config) nlp, resolved = load_model_from_config(config, auto_fill=True) - train_corpus = resolved["training"]["corpus"]["train"] + train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) optimizer = resolved["training"]["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) @@ -46,7 +97,7 @@ def test_cat_readers(reader, additional_config): assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark on dev corpus - dev_corpus = resolved["training"]["corpus"]["dev"] + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git 
a/website/docs/api/cli.md b/website/docs/api/cli.md index 7dd6e6184..5c5eb6486 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -355,6 +355,16 @@ Registry @architectures Name spacy.MaxoutWindowEncoder.v1 Module spacy.ml.models.tok2vec File /path/to/spacy/ml/models/tok2vec.py (line 207) +ℹ [corpora.dev] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) +ℹ [corpora.train] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) ℹ [training.logger] Registry @loggers Name spacy.ConsoleLogger.v1 @@ -370,16 +380,6 @@ Registry @schedules Name compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.corpus.dev] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) -ℹ [training.corpus.train] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers Name Adam.v1 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index c25ce1651..2b308d618 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -135,7 +135,7 @@ Initialize the reader. > > ```ini > ### Example config -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = "corpus/raw_text.jsonl" > min_length = 0 diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index cf091e16c..f868233c7 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -121,28 +121,78 @@ that you don't want to hard-code in your config file. 
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ``` +### corpora {#config-corpora tag="section"} + +This section defines a dictionary mapping of string keys to `Callable` +functions. Each callable takes an `nlp` object and yields +[`Example`](/api/example) objects. By default, the two keys `train` and `dev` +are specified and each refers to a [`Corpus`](/api/top-level#Corpus). When +pretraining, an additional `pretrain` section is added that defaults to a +[`JsonlReader`](/api/top-level#JsonlReader). + +These subsections can be expanded with additional subsections, each referring to +a callback of type `Callable[[Language], Iterator[Example]]`: + +> #### Example +> +> ```ini +> [corpora] +> [corpora.train] +> @readers = "spacy.Corpus.v1" +> path = ${paths.train} +> +> [corpora.dev] +> @readers = "spacy.Corpus.v1" +> path = ${paths.dev} +> +> [corpora.pretrain] +> @readers = "spacy.JsonlReader.v1" +> path = ${paths.raw} +> min_length = 5 +> max_length = 500 +> +> [corpora.mydata] +> @readers = "my_reader.v1" +> shuffle = true +> ``` + +Alternatively, the `corpora` block could refer to one function with return type +`Dict[str, Callable[[Language], Iterator[Example]]]`: + +> #### Example +> +> ```ini +> [corpora] +> @readers = "my_dict_reader.v1" +> train_path = ${paths.train} +> dev_path = ${paths.dev} +> shuffle = true +> +> ``` + ### training {#config-training tag="section"} This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`.
~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. 
~~int~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. 
~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -150,17 +200,18 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/embeddings-transformers#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | -| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | -| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | -| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | -| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ | -| `batcher` | Batcher for the training data. 
~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | -| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------ | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | +| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| | ## Training data {#training} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index be7994d5d..72b79de48 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. 
> [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -506,7 +506,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. > [paths] > pretrain = "corpus/raw_text.jsonl" > -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = ${paths.pretrain} > min_length = 0 diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 3a6bd4551..665caa15b 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index bba2e2853..c0f4caad7 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. 
> #### config.cfg > > ```ini -> [training.corpus.train] +> [corpora.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 427dbecdd63706f9c6c55875d46ed570f5a6a48b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:48:04 +0200 Subject: [PATCH 10/13] cleanup and formatting --- spacy/cli/pretrain.py | 14 +++++--------- spacy/cli/train.py | 4 ++-- spacy/schemas.py | 2 +- spacy/tests/training/test_readers.py | 3 +++ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 3567e7339..aec077eb7 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -71,9 +71,7 @@ def pretrain_cli( with show_validation_error(config_path): config = util.load_config( - config_path, - overrides=config_overrides, - interpolate=True + config_path, overrides=config_overrides, interpolate=True ) if not config.get("pretraining"): # TODO: What's the solution here? How do we handle optional blocks? 
@@ -84,7 +82,7 @@ def pretrain_cli( config.to_disk(output_dir / "config.cfg") msg.good("Saved config file in the output directory") - + pretrain( config, output_dir, @@ -99,7 +97,7 @@ def pretrain( output_dir: Path, resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, - use_gpu: int=-1 + use_gpu: int = -1, ): if config["system"].get("seed") is not None: fix_random_seed(config["system"]["seed"]) @@ -107,7 +105,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = dot_to_object(config, config["pretraining"]["corpus"]) + corpus = dot_to_object(config, P_cfg["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] @@ -148,9 +146,7 @@ def pretrain( progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and ( - batch_id % P_cfg["n_save_every"] == 0 - ): + if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 15c745b69..50306b350 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -93,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) + train_corpus = dot_to_object(config, T_cfg["train_corpus"]) + dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/schemas.py b/spacy/schemas.py index a530db3d0..06bc4beed 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -104,7 +104,7 @@ class 
TokenPatternOperator(str, Enum): StringValue = Union[TokenPatternString, StrictStr] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ - TokenPatternString, TokenPatternNumber, str, int, float, list, bool, + TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 52a4abecc..898746c2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -26,12 +26,15 @@ def test_readers(): [components.textcat] factory = "textcat" """ + @registry.readers.register("myreader.v1") def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): doc = nlp.make_doc(f"This is an example") return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} config = Config().from_str(config_string) From 130ffa5fbf8751de4eeb4bfd2463f46242ecc50d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 14:59:41 +0200 Subject: [PATCH 11/13] fix typos in docs --- website/docs/api/data-formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index f868233c7..b9e185d9c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -191,7 +191,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. 
~~int~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -207,7 +207,7 @@ used when you run [`spacy pretrain`](/api/cli#pretrain). | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ | | `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | | `layer` | The layer to pretrain. If empty, the whole component model will be used. 
~~str~~ | From 3a3110ef6040e6cd9a745676586954f7508c6a6c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 15:44:11 +0200 Subject: [PATCH 12/13] remove empty files --- extra/experiments/onto-joint/defaults.cfg | 0 extra/experiments/ptb-joint-pos-dep/defaults.cfg | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 extra/experiments/onto-joint/defaults.cfg delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 From ed0fb034cb487a1fcc206e250ca34c8a38b7e0de Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 18:11:10 +0200 Subject: [PATCH 13/13] ml_datasets v0.2.0a0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 69477c2d3..55fe627b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.2.0 +ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0