From ba6cf9821f0ba4174fe91a840688785fbaa5ed98 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:28:28 +0200 Subject: [PATCH 01/46] Replace docs analytics [ci skip] --- website/gatsby-config.js | 9 --------- website/meta/site.json | 1 - website/package.json | 1 - 3 files changed, 11 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 2a5f957f4..144b8e93e 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -131,15 +131,6 @@ module.exports = { icon: `src/images/icon.png`, }, }, - { - resolve: `gatsby-plugin-google-analytics`, - options: { - trackingId: site.analytics, - head: false, - anonymize: true, - respectDNT: true, - }, - }, { resolve: `gatsby-plugin-plausible`, options: { diff --git a/website/meta/site.json b/website/meta/site.json index 4d12a4c46..31f2f2f68 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -14,7 +14,6 @@ "github": "explosion" }, "theme": "#09a3d5", - "analytics": "UA-58931649-1", "newsletter": { "user": "spacy.us12", "id": "83b0498b1e7fa3c91ce68c3f1", diff --git a/website/package.json b/website/package.json index a59bc9bdc..8d8ba6408 100644 --- a/website/package.json +++ b/website/package.json @@ -20,7 +20,6 @@ "gatsby-image": "^2.0.29", "gatsby-mdx": "^0.3.6", "gatsby-plugin-catch-links": "^2.0.11", - "gatsby-plugin-google-analytics": "^2.0.14", "gatsby-plugin-manifest": "^2.0.17", "gatsby-plugin-offline": "^2.0.24", "gatsby-plugin-plausible": "0.0.6", From 33d9c649771cf03122ccb9fe7544e8c14ed788fa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:44:38 +0200 Subject: [PATCH 02/46] Fix outbound link and update package lock [ci skip] --- website/package-lock.json | 8 -------- website/src/components/link.js | 11 ++--------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index dded33fb0..63e67ebd2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -7441,14 +7441,6 @@ "escape-string-regexp": "^1.0.5" } }, - "gatsby-plugin-google-analytics": { - "version": "2.0.14", - "resolved": "https://registry.npmjs.org/gatsby-plugin-google-analytics/-/gatsby-plugin-google-analytics-2.0.14.tgz", - "integrity": "sha512-sFD73d9isJQknnDAAkDidaybHJx6VIaLfy3nO3DwbFaitvZ08RimbynYOkcWAeA0zwwix2RgAvbq/9pAmtTb/A==", - "requires": { - "@babel/runtime": "^7.0.0" - } - }, "gatsby-plugin-manifest": { "version": "2.0.17", "resolved": "https://registry.npmjs.org/gatsby-plugin-manifest/-/gatsby-plugin-manifest-2.0.17.tgz", diff --git a/website/src/components/link.js b/website/src/components/link.js index 4c4aa9492..dc0cfda8e 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -1,7 +1,6 @@ import React, { Fragment } from 'react' import PropTypes from 'prop-types' import { Link as GatsbyLink } from 'gatsby' -import { OutboundLink } from 'gatsby-plugin-google-analytics' import classNames from 'classnames' import Icon from './icon' @@ -74,15 +73,9 @@ const Link = ({ const rel = isInternal ? 
null : 'noopener nofollow noreferrer' return ( - + {content} - + ) } From a26f864ed3c227fab1d2a506e27cb4b5b5d831d2 Mon Sep 17 00:00:00 2001 From: Marek Grzenkowicz Date: Tue, 8 Sep 2020 21:13:50 +0200 Subject: [PATCH 03/46] Clarify how to choose pretrained weights files (closes #6027) [ci skip] (#6039) --- website/docs/api/cli.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 779fa7695..b97308aab 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -445,7 +445,8 @@ an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pretrained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pretrained weights files to the -`spacy train` command. +`spacy train` command. You can try to use a few with low `Loss` values reported +in the output. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the From bd87e8686e05487116c3a0c631bcb789059b2636 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:40:38 +0200 Subject: [PATCH 04/46] move tests to correct subdir --- spacy/tests/{ => pipeline}/test_tok2vec.py | 2 +- spacy/tests/training/__init__.py | 0 spacy/tests/{ => training}/test_training.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename spacy/tests/{ => pipeline}/test_tok2vec.py (99%) create mode 100644 spacy/tests/training/__init__.py rename spacy/tests/{ => training}/test_training.py (99%) diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py similarity index 99% rename from spacy/tests/test_tok2vec.py rename to spacy/tests/pipeline/test_tok2vec.py index fb30c6ae5..0365554bc 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English -from .util import get_batch +from ..util import get_batch from thinc.api import Config diff --git a/spacy/tests/training/__init__.py b/spacy/tests/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/test_training.py b/spacy/tests/training/test_training.py similarity index 99% rename from spacy/tests/test_training.py rename to spacy/tests/training/test_training.py index 1926aca1f..67cc37b1c 100644 --- a/spacy/tests/test_training.py +++ b/spacy/tests/training/test_training.py @@ -12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from .util import make_tempdir +from ..util import make_tempdir @pytest.fixture From 51fa929f47120272bd6b8dfbba1f000833446f0f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:58:04 +0200 Subject: [PATCH 05/46] rewrite train_corpus to corpus.train in config --- extra/experiments/onto-joint/defaults.cfg | 6 ++-- .../ptb-joint-pos-dep/defaults.cfg | 6 ++-- spacy/cli/templates/quickstart_training.jinja | 6 ++-- spacy/cli/train.py | 4 +-- spacy/default_config.cfg | 6 ++-- spacy/schemas.py | 3 +- .../tests/serialize/test_serialize_config.py | 16 +++++---- website/docs/api/corpus.md | 2 +- website/docs/api/data-formats.md | 35 +++++++++---------- website/docs/api/top-level.md | 4 +-- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 12 files changed, 50 insertions(+), 42 deletions(-) 
diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 7954b57b5..97eebe6b4 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -21,14 +21,16 @@ eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 8f9c5666e..03e2f5bd7 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -20,14 +20,16 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.read_train] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.read_dev] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 199aae217..39d4d875d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -195,12 +195,14 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} max_length = 0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ae4a8455e..2c2eeb88b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,8 +92,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["train_corpus"] - dev_corpus = T_cfg["dev_corpus"] + train_corpus = T_cfg["corpus"]["train"] + dev_corpus = T_cfg["corpus"]["dev"] batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7cd71453f..61f3dfe25 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -44,7 +44,9 @@ frozen_components = [] [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries @@ -56,7 +58,7 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries diff --git a/spacy/schemas.py b/spacy/schemas.py index 0dd2b9204..d8bcf3c1d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - train_corpus: Reader = Field(..., 
title="Reader for the training data") - dev_corpus: Reader = Field(..., title="Reader for the dev data") + corpus: Reader = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 0ab212fda..d113ac2a5 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -19,11 +19,13 @@ dev = "" [training] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} @@ -300,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths.train}" + assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["train_corpus"]["path"] == "" + assert interpolated["training"]["corpus"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" + assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["train_corpus"]["path"] == "" + assert interpolated2["training"]["corpus"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["train_corpus"]["path"] == "" + assert nlp2.config["training"]["corpus"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 0f49b02e3..c25ce1651 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..74d612862 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. 
~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. 
~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..be7994d5d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 9776dab1b..3a6bd4551 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 65cfb563b..bba2e2853 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. 
> #### config.cfg > > ```ini -> [training.train_corpus] +> [training.corpus.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 733665766205f350398d3216e94ab8a5ac6c3751 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:07:16 +0200 Subject: [PATCH 06/46] corpus is a Dict --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index d8bcf3c1d..2030048d8 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Reader = Field(..., title="Reader for the training and dev data") + corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") From 55f8d5478ecb5fd913a3a5fe7c469e8bc8a4f038 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:09:30 +0200 Subject: [PATCH 07/46] fix example output --- website/docs/api/cli.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..7dd6e6184 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -272,7 +272,7 @@ training -> dropout field required training -> optimizer field required training -> optimize extra fields not permitted -{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} +{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}} If your config contains missing values, you can run the 'init fill-config' command to fill in all the defaults, if possible: @@ -370,7 +370,12 @@ Registry @schedules Name compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.dev_corpus] +ℹ [training.corpus.dev] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus 
+File /path/to/spacy/training/corpus.py (line 18) +ℹ [training.corpus.train] Registry @readers Name spacy.Corpus.v1 Module spacy.training.corpus @@ -385,11 +390,6 @@ Registry @schedules Name warmup_linear.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 91) -ℹ [training.train_corpus] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ``` From 714a5a05c65e28b5264d16e7dba202126de2cbfb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:39:55 +0200 Subject: [PATCH 08/46] test for custom readers with ml_datasets >= 0.2 --- spacy/pipeline/textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 3f6250680..e7cb62a0d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -181,9 +181,9 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#predict """ - tensors = [doc.tensor for doc in docs] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. + tensors = [doc.tensor for doc in docs] xp = get_array_module(tensors) scores = xp.zeros((len(docs), len(self.labels))) return scores From 1040e250d8f740db7d0a6b012962b25ce7f95ffb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:41:28 +0200 Subject: [PATCH 09/46] actual commit with test for custom readers with ml_datasets >= 0.2 --- requirements.txt | 2 +- spacy/tests/training/test_readers.py | 58 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/training/test_readers.py diff --git a/requirements.txt b/requirements.txt index db6eae2ef..a67ade640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.1.1 +ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py new file mode 100644 index 000000000..c81ec0897 --- /dev/null +++ b/spacy/tests/training/test_readers.py @@ -0,0 +1,58 @@ +import pytest +from thinc.api import Config +from spacy.util import load_model_from_config + + +@pytest.mark.slow +@pytest.mark.parametrize( + "reader,additional_config", + [ + ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), + ], +) +def test_cat_readers(reader, additional_config): + nlp_config_string = """ + [training] + + [training.corpus] + @readers = "PLACEHOLDER" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + config = Config().from_str(nlp_config_string) + config["training"]["corpus"]["@readers"] = reader + config["training"]["corpus"].update(additional_config) + nlp, resolved = load_model_from_config(config, auto_fill=True) + + train_corpus = resolved["training"]["corpus"]["train"] + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + assert example.y.cats + # this shouldn't fail if each training example has at least one positive label + assert 
sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + nlp.update([example], sgd=optimizer) + # simulate performance benchmark on dev corpus + dev_corpus = resolved["training"]["corpus"]["dev"] + dev_examples = list(dev_corpus(nlp)) + for example in dev_examples: + # this shouldn't fail if each dev example has at least one positive label + assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + scores = nlp.evaluate(dev_examples) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats From 0dc914b667706b4e598b61e3cfff0a85e820118f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:42:58 +0200 Subject: [PATCH 10/46] bump thinc to 8.0.0a33 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e610e603e..a413a099c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a31,<8.0.0a40", + "thinc>=8.0.0a33,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index a67ade640..69477c2d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a31,<8.0.0a40 +thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 10a8972b0..359e63172 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 21dcf92964c6a2c4218d5ffc44a164dead641c44 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 09:21:36 +0200 Subject: [PATCH 11/46] Update website/docs/api/data-formats.md Co-authored-by: Matthew Honnibal --- website/docs/api/data-formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 74d612862..cf091e16c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -130,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). 
~~Callable[[Language], Iterator[Example]]~~ | +| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | From 0c35885751f2ad83098f54103de33b987b4a199e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:38:59 +0200 Subject: [PATCH 12/46] generalize corpora, dot notation for dev and train corpus --- extra/experiments/onto-joint/defaults.cfg | 34 +++--- .../ptb-joint-pos-dep/defaults.cfg | 32 +++--- spacy/cli/pretrain.py | 3 +- spacy/cli/templates/quickstart_training.jinja | 27 ++--- spacy/cli/train.py | 5 +- spacy/default_config.cfg | 56 +++++---- spacy/default_config_pretraining.cfg | 17 +-- spacy/schemas.py | 6 +- .../tests/serialize/test_serialize_config.py | 20 ++-- spacy/tests/training/test_readers.py | 63 ++++++++++- website/docs/api/cli.md | 20 ++-- website/docs/api/corpus.md | 4 +- website/docs/api/data-formats.md | 107 +++++++++++++----- website/docs/api/top-level.md | 6 +- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 16 files changed, 261 insertions(+), 143 deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 97eebe6b4..90101281c 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = ${system:seed} dropout = 0.1 @@ -20,22 +36,8 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] - -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 03e2f5bd7..55fb52b99 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = ${system:seed} dropout = 0.2 @@ -20,22 +36,6 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, 
"tag_acc": 0.2} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 - [training.batcher] @batchers = "spacy.batch_by_words.v1" discard_oversize = false diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 70858123d..3567e7339 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -20,6 +20,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..tokens import Doc from ..attrs import ID from .. import util +from ..util import dot_to_object @app.command( @@ -106,7 +107,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = P_cfg["corpus"] + corpus = dot_to_object(config, config["pretraining"]["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 39d4d875d..00b77af4d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -173,6 +173,18 @@ factory = "{{ pipe }}" {% endif %} {% endfor %} +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = {{ 500 if hardware == "gpu" else 2000 }} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 + [training] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = null @@ -182,11 +194,12 @@ vectors = "{{ word_vectors }}" {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} {% endif %} +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.optimizer] @optimizers = "Adam.v1" - {% if use_transformer -%} [training.optimizer.learn_rate] @schedules = "warmup_linear.v1" @@ -195,18 +208,6 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 - {% if use_transformer %} [training.batcher] @batchers = "spacy.batch_by_padded.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2c2eeb88b..15c745b69 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -18,6 +18,7 @@ from ..language import Language from .. 
import util from ..training.example import Example from ..errors import Errors +from ..util import dot_to_object @app.command( @@ -92,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["corpus"]["train"] - dev_corpus = T_cfg["corpus"]["dev"] + train_corpus = dot_to_object(config, config["training"]["train_corpus"]) + dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 61f3dfe25..c7c9593d7 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -22,6 +22,33 @@ after_pipeline_creation = null [components] +# Readers for corpora like dev and train. +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + # Training hyper-parameters and additional features. [training] seed = ${system.seed} @@ -40,35 +67,14 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Location in the config where the dev corpus is defined +dev_corpus = "corpora.dev" +# Location in the config where the train corpus is defined +train_corpus = "corpora.train" [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. 
-gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 9120db338..bbd595308 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -4,6 +4,7 @@ dropout = 0.2 n_save_every = null component = "tok2vec" layer = "" +corpus = "corpora.pretrain" [pretraining.batcher] @batchers = "spacy.batch_by_words.v1" @@ -12,13 +13,6 @@ discard_oversize = false tolerance = 0.2 get_length = null -[pretraining.corpus] -@readers = "spacy.JsonlReader.v1" -path = ${paths.raw} -min_length = 5 -max_length = 500 -limit = 0 - [pretraining.objective] type = "characters" n_characters = 4 @@ -33,3 +27,12 @@ grad_clip = 1.0 use_averages = true eps = 1e-8 learn_rate = 0.001 + +[corpora] + +[corpora.pretrain] +@readers = "spacy.JsonlReader.v1" +path = ${paths.raw} +min_length = 5 +max_length = 500 +limit = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 2030048d8..a530db3d0 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,8 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") + dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") + train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") @@ -248,7 +249,7 @@ class ConfigSchemaPretrain(BaseModel): dropout: StrictFloat = Field(..., title="Dropout rate") n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") optimizer: Optimizer = Field(..., title="The optimizer to use") - corpus: Reader = Field(..., title="Reader for the training data") + corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. 
Whole model if empty.") @@ -267,6 +268,7 @@ class ConfigSchema(BaseModel): nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] + corpora: Dict[str, Reader] @root_validator(allow_reuse=True) def validate_config(cls, values): diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index d113ac2a5..1e17b3212 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -17,18 +17,18 @@ nlp_config_string = """ train = "" dev = "" -[training] +[corpora] -[training.corpus] - -[training.corpus.train] +[corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.corpus.dev] +[corpora.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} +[training] + [training.batcher] @batchers = "spacy.batch_by_words.v1" size = 666 @@ -302,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["corpus"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["corpus"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["corpus"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c81ec0897..52a4abecc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,6 +1,57 @@ +from typing import Dict, Iterable, Callable import pytest from thinc.api import Config -from spacy.util import load_model_from_config + +from spacy import Language +from spacy.util import load_model_from_config, registry, dot_to_object +from spacy.training import Example + + +def test_readers(): + config_string = """ + [training] + + [corpora] + @readers = "myreader.v1" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + @registry.readers.register("myreader.v1") + def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: + annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): + doc = nlp.make_doc(f"This is an example") + return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} + + config = Config().from_str(config_string) + nlp, resolved = load_model_from_config(config, auto_fill=True) + + 
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + assert isinstance(train_corpus, Callable) + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + nlp.update([example], sgd=optimizer) + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) + scores = nlp.evaluate(list(dev_corpus(nlp))) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats + extra_corpus = resolved["corpora"]["extra"] + assert isinstance(extra_corpus, Callable) @pytest.mark.slow @@ -16,7 +67,7 @@ def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - [training.corpus] + [corpora] @readers = "PLACEHOLDER" [nlp] @@ -32,11 +83,11 @@ def test_cat_readers(reader, additional_config): factory = "textcat" """ config = Config().from_str(nlp_config_string) - config["training"]["corpus"]["@readers"] = reader - config["training"]["corpus"].update(additional_config) + config["corpora"]["@readers"] = reader + config["corpora"].update(additional_config) nlp, resolved = load_model_from_config(config, auto_fill=True) - train_corpus = resolved["training"]["corpus"]["train"] + train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) optimizer = resolved["training"]["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) @@ -46,7 +97,7 @@ def test_cat_readers(reader, additional_config): assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark on dev corpus - dev_corpus = resolved["training"]["corpus"]["dev"] + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7dd6e6184..5c5eb6486 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -355,6 +355,16 @@ Registry @architectures Name spacy.MaxoutWindowEncoder.v1 Module spacy.ml.models.tok2vec File /path/to/spacy/ml/models/tok2vec.py (line 207) +ℹ [corpora.dev] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) +ℹ [corpora.train] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) ℹ [training.logger] Registry @loggers Name spacy.ConsoleLogger.v1 @@ -370,16 +380,6 @@ Registry @schedules Name compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.corpus.dev] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) -ℹ [training.corpus.train] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers Name Adam.v1 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index c25ce1651..2b308d618 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -135,7 +135,7 @@ Initialize the reader. 
> > ```ini > ### Example config -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = "corpus/raw_text.jsonl" > min_length = 0 diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index cf091e16c..f868233c7 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -121,28 +121,78 @@ that you don't want to hard-code in your config file. $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ``` +### corpora {#config-corpora tag="section"} + +This section defines a dictionary mapping of string keys to `Callable` +functions. Each callable takes an `nlp` object and yields +[`Example`](/api/example) objects. By default, the two keys `train` and `dev` +are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When +pretraining, an additional pretrain section is added that defaults to a +[`JsonlReader`](/api/top-level#JsonlReader). + +These subsections can be expanded with additional subsections, each referring to +a callback of type `Callable[[Language], Iterator[Example]]`: + +> #### Example +> +> ```ini +> [corpora] +> [corpora.train] +> @readers = "spacy.Corpus.v1" +> path = ${paths:train} +> +> [corpora.dev] +> @readers = "spacy.Corpus.v1" +> path = ${paths:dev} +> +> [corpora.pretrain] +> @readers = "spacy.JsonlReader.v1" +> path = ${paths.raw} +> min_length = 5 +> max_length = 500 +> +> [corpora.mydata] +> @readers = "my_reader.v1" +> shuffle = true +> ``` + +Alternatively, the `corpora` block could refer to one function with return type +`Dict[str, Callable[[Language], Iterator[Example]]]`: + +> #### Example +> +> ```ini +> [corpora] +> @readers = "my_dict_reader.v1" +> train_path = ${paths:train} +> dev_path = ${paths:dev} +> shuffle = true +> +> ``` + ### training {#config-training tag="section"} This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. 
~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. 
~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -150,17 +200,18 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/embeddings-transformers#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | -| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | -| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | -| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | -| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ | -| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | -| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------ | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | +| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| | ## Training data {#training} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index be7994d5d..72b79de48 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -506,7 +506,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. 
> [paths] > pretrain = "corpus/raw_text.jsonl" > -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = ${paths.pretrain} > min_length = 0 diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 3a6bd4551..665caa15b 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index bba2e2853..c0f4caad7 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. > #### config.cfg > > ```ini -> [training.corpus.train] +> [corpora.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 427dbecdd63706f9c6c55875d46ed570f5a6a48b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:48:04 +0200 Subject: [PATCH 13/46] cleanup and formatting --- spacy/cli/pretrain.py | 14 +++++--------- spacy/cli/train.py | 4 ++-- spacy/schemas.py | 2 +- spacy/tests/training/test_readers.py | 3 +++ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 3567e7339..aec077eb7 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -71,9 +71,7 @@ def pretrain_cli( with show_validation_error(config_path): config = util.load_config( - config_path, - overrides=config_overrides, - interpolate=True + config_path, overrides=config_overrides, interpolate=True ) if not config.get("pretraining"): # TODO: What's the solution here? How do we handle optional blocks? 
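To make the `corpus_variants.v1` reader referenced in the config snippet above more concrete, a minimal sketch of how such a custom corpus reader could be registered is shown below. The function body, example texts and the exact signature are illustrative assumptions only, not code from this patch series; only the registered name and the `source` setting come from the config example above.

```python
from typing import Callable, Iterable
from spacy.language import Language
from spacy.training import Example
from spacy.util import registry


@registry.readers("corpus_variants.v1")
def stream_data(source: str) -> Callable[[Language], Iterable[Example]]:
    # "source" is filled in from the [corpora.train] block in the config
    def generate_stream(nlp: Language) -> Iterable[Example]:
        # Illustrative only: a real reader would fetch and parse `source`
        for text in ("This is one example.", "This is another one."):
            doc = nlp.make_doc(text)
            yield Example.from_dict(doc, {})

    return generate_stream
```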
@@ -84,7 +82,7 @@ def pretrain_cli( config.to_disk(output_dir / "config.cfg") msg.good("Saved config file in the output directory") - + pretrain( config, output_dir, @@ -99,7 +97,7 @@ def pretrain( output_dir: Path, resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, - use_gpu: int=-1 + use_gpu: int = -1, ): if config["system"].get("seed") is not None: fix_random_seed(config["system"]["seed"]) @@ -107,7 +105,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = dot_to_object(config, config["pretraining"]["corpus"]) + corpus = dot_to_object(config, P_cfg["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] @@ -148,9 +146,7 @@ def pretrain( progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and ( - batch_id % P_cfg["n_save_every"] == 0 - ): + if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 15c745b69..50306b350 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -93,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) + train_corpus = dot_to_object(config, T_cfg["train_corpus"]) + dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/schemas.py b/spacy/schemas.py index a530db3d0..06bc4beed 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -104,7 +104,7 @@ class TokenPatternOperator(str, Enum): StringValue = Union[TokenPatternString, StrictStr] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ - TokenPatternString, TokenPatternNumber, str, int, float, list, bool, + TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 52a4abecc..898746c2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -26,12 +26,15 @@ def test_readers(): [components.textcat] factory = "textcat" """ + @registry.readers.register("myreader.v1") def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): doc = nlp.make_doc(f"This is an example") return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} config = Config().from_str(config_string) From 130ffa5fbf8751de4eeb4bfd2463f46242ecc50d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 14:59:41 +0200 Subject: [PATCH 14/46] fix typos in docs --- website/docs/api/data-formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index f868233c7..b9e185d9c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -191,7 +191,7 @@ process that are used when you run [`spacy train`](/api/cli#train). 
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -207,7 +207,7 @@ used when you run [`spacy pretrain`](/api/cli#pretrain). | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ | | `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | | `layer` | The layer to pretrain. If empty, the whole component model will be used. 
~~str~~ | From 3a3110ef6040e6cd9a745676586954f7508c6a6c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 15:44:11 +0200 Subject: [PATCH 15/46] remove empty files --- extra/experiments/onto-joint/defaults.cfg | 0 extra/experiments/ptb-joint-pos-dep/defaults.cfg | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 extra/experiments/onto-joint/defaults.cfg delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 From ddfc1fc146ec35dab19f835602345de91342eeee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:05:40 +0200 Subject: [PATCH 16/46] add pretraining option to init config --- spacy/cli/init_config.py | 12 +++++++++--- website/docs/api/cli.md | 34 ++++++++++++++++++---------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index ec65b0e0a..60ea1b640 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -30,6 +30,7 @@ def init_config_cli( pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), + pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), # fmt: on ): """ @@ -43,7 +44,7 @@ def init_config_cli( if isinstance(optimize, Optimizations): # instance of enum from the CLI optimize = optimize.value pipeline = string_to_list(pipeline) - init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu) + init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu, pretraining=pretraining) @init_cli.command("fill-config") @@ -109,7 +110,7 @@ def fill_config( def init_config( - output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool + output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False, ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) @@ -156,8 +157,13 @@ def init_config( with show_validation_error(hint_fill=False): config = util.load_config_from_str(base_template) nlp, _ = util.load_model_from_config(config, auto_fill=True) + config = nlp.config + if pretraining: + validate_config_for_pretrain(config, msg) + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + config = pretrain_config.merge(config) msg.good("Auto-filled config with all values") - save_config(nlp.config, output_file, is_stdout=is_stdout) + save_config(config, output_file, is_stdout=is_stdout) def save_config( diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..7ba451c2f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -124,15 +124,16 @@ customize those settings in your config file later. $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] ``` -| Name | Description | -| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | -| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | -| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | The config file for training. 
| +| Name | Description | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | +| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | +| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | +| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The config file for training. | ### init fill-config {#init-fill-config new="3"} @@ -160,13 +161,14 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. 
| ### init vocab {#init-vocab new="3" tag="command"} From 5fade4feb7fbd3d579a6b9a2d696a470456a997f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:15:20 +0200 Subject: [PATCH 17/46] fix cli abbrev --- website/docs/api/cli.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7ba451c2f..8edee6b29 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -124,16 +124,16 @@ customize those settings in your config file later. $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] ``` -| Name | Description | -| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | -| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | -| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | -| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | The config file for training. | +| Name | Description | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | +| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). 
This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | +| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The config file for training. | ### init fill-config {#init-fill-config new="3"} From ec751068f328e47ae7fa8ca1745a1dd8ac00529d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 16:42:53 +0200 Subject: [PATCH 18/46] Draft text for static vectors intro --- website/docs/usage/embeddings-transformers.md | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 8dd104ead..6a239cb1e 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using -The key difference between [word vectors](#word-vectors) and contextual language -models such as [transformers](#transformers) is that word vectors model -**lexical types**, rather than _tokens_. If you have a list of terms with no -context around them, a transformer model like BERT can't really help you. BERT -is designed to understand language **in context**, which isn't what you have. A -word vectors table will be a much better fit for your task. However, if you do -have words in context — whole sentences or paragraphs of running text — word -vectors will only provide a very rough approximation of what the text is about. +[Transformers](#transformers) are large and powerful neural networks that give +you better accuracy, but are harder to deploy in production, as they require a GPU to run +effectively. [Word vectors](#word-vectors) are a slightly older technique that +can give your models a smaller improvement in accuracy, and can also provide +some additional capabilities. + +The key difference between word-vectors and contextual language +models such as transformers is that word vectors model **lexical types**, rather +than _tokens_. If you have a list of terms with no context around them, a transformer +model like BERT can't really help you. BERT is designed to understand language +**in context**, which isn't what you have. A word vectors table will be a much +better fit for your task. However, if you do have words in context — whole sentences +or paragraphs of running text — word vectors will only provide a very rough +approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a vector with a single indexing operation. Word vectors are therefore useful as a @@ -478,7 +484,28 @@ training. ## Static vectors {#static-vectors} - +If your pipeline includes a word vectors table, you'll be able to use the +`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. +You'll also be able to access the vectors using the `.vector` attribute, or you +can look up one or more vectors directly using the `Vocab` object. Pipelines +with word vectors can also use the vectors as features for the statistical +models, which can improve the accuracy of your components. 
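As a quick sketch of how those attributes are typically used, assuming a pipeline such as `en_core_web_md` that ships with a vectors table (the pipeline name and the example sentences are only placeholders):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # any pipeline that includes word vectors
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity here is computed from the static word vectors
print(doc1.similarity(doc2))
# Vectors are exposed on tokens, spans and lexemes, and via the vocab
print(doc1[3].vector.shape)
print(nlp.vocab["hamburgers"].vector.shape)
```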
+ +Word vectors in spaCy are "static" in the sense that they are not learned +parameters of the statistical models, and spaCy itself does not feature any +algorithms for learning word vector tables. You can train a word vectors table +using tools such as Gensim, word2vec, FastText or GloVe. There are also many +word vector tables available for download. Once you have a word vectors table +you want to use, you can convert it for use with spaCy using the `spacy init vocab` +command, which will give you a directory you can load or refer to in your training +configs. + +When converting the vectors, there are two ways you can trim them down to make +your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` +option, which will remove entries for rarer words from the table. Alternatively, +you can use the `--prune-vectors` option to remap rarer words to the closest vector +that remains in the table. This allows the vectors table to return meaningful +(albeit imperfect) results for more words than you have rows in the table. ### Using word vectors in your models {#word-vectors-models} From 127ce0c574da23f2e17c824dcebec6f229d4561f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 16:55:53 +0200 Subject: [PATCH 19/46] Update website/docs/api/cli.md Co-authored-by: Ines Montani --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8edee6b29..5f3a06c36 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -121,7 +121,7 @@ customize those settings in your config file later. > ``` ```cli -$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] +$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] [--pretraining] ``` | Name | Description | From e5ceec5df0cf7d279d6f2bac716a30f4edb71fc8 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 16:56:20 +0200 Subject: [PATCH 20/46] Update website/docs/api/cli.md Co-authored-by: Ines Montani --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 5f3a06c36..f5ac943e2 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -129,7 +129,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [ | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | | `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. 
~~str (option)~~ | | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | From 38652143434207531c2779ab6905331269f072ca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:57:02 +0200 Subject: [PATCH 21/46] Use consistent shortcut --- spacy/cli/init_config.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 60ea1b640..e70195e15 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -44,7 +44,14 @@ def init_config_cli( if isinstance(optimize, Optimizations): # instance of enum from the CLI optimize = optimize.value pipeline = string_to_list(pipeline) - init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu, pretraining=pretraining) + init_config( + output_file, + lang=lang, + pipeline=pipeline, + optimize=optimize, + cpu=cpu, + pretraining=pretraining, + ) @init_cli.command("fill-config") @@ -52,7 +59,7 @@ def init_fill_config_cli( # fmt: off base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), - pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"), + pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes") # fmt: on ): @@ -110,7 +117,13 @@ def fill_config( def init_config( - output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False, + output_file: Path, + *, + lang: str, + pipeline: List[str], + optimize: str, + cpu: bool, + pretraining: bool = False, ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) From c4b414b2825021410c8f8e80304b83eac3847bf1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:58:09 +0200 Subject: [PATCH 22/46] Update website/docs/api/cli.md --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f5ac943e2..9d0b872c3 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -165,7 +165,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | | `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | | `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Complete and auto-filled config file for training. 
| From 3d8e010655e7180eb875fe784f2c8f098a332388 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:58:46 +0200 Subject: [PATCH 23/46] Change order --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f5ac943e2..f9a192000 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -129,9 +129,9 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [ | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | | `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | The config file for training. | From a2c8cda26ffbc6ba0e15b0872b8691ee4f366994 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 17:12:51 +0200 Subject: [PATCH 24/46] Update docs [ci skip] --- website/docs/usage/embeddings-transformers.md | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 6a239cb1e..9f73661c3 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -31,18 +31,18 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using [Transformers](#transformers) are large and powerful neural networks that give -you better accuracy, but are harder to deploy in production, as they require a GPU to run -effectively. [Word vectors](#word-vectors) are a slightly older technique that -can give your models a smaller improvement in accuracy, and can also provide -some additional capabilities. +you better accuracy, but are harder to deploy in production, as they require a +GPU to run effectively. [Word vectors](#word-vectors) are a slightly older +technique that can give your models a smaller improvement in accuracy, and can +also provide some additional capabilities. -The key difference between word-vectors and contextual language -models such as transformers is that word vectors model **lexical types**, rather -than _tokens_. If you have a list of terms with no context around them, a transformer -model like BERT can't really help you. 
BERT is designed to understand language -**in context**, which isn't what you have. A word vectors table will be a much -better fit for your task. However, if you do have words in context — whole sentences -or paragraphs of running text — word vectors will only provide a very rough +The key difference between word-vectors and contextual language models such as +transformers is that word vectors model **lexical types**, rather than _tokens_. +If you have a list of terms with no context around them, a transformer model +like BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a @@ -484,28 +484,32 @@ training. ## Static vectors {#static-vectors} -If your pipeline includes a word vectors table, you'll be able to use the -`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. -You'll also be able to access the vectors using the `.vector` attribute, or you -can look up one or more vectors directly using the `Vocab` object. Pipelines -with word vectors can also use the vectors as features for the statistical -models, which can improve the accuracy of your components. +If your pipeline includes a **word vectors table**, you'll be able to use the +`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span), +[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able +to access the vectors using the `.vector` attribute, or you can look up one or +more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with +word vectors can also **use the vectors as features** for the statistical +models, which can **improve the accuracy** of your components. Word vectors in spaCy are "static" in the sense that they are not learned parameters of the statistical models, and spaCy itself does not feature any algorithms for learning word vector tables. You can train a word vectors table -using tools such as Gensim, word2vec, FastText or GloVe. There are also many -word vector tables available for download. Once you have a word vectors table -you want to use, you can convert it for use with spaCy using the `spacy init vocab` -command, which will give you a directory you can load or refer to in your training -configs. +using tools such as [Gensim](https://radimrehurek.com/gensim/), +[FastText](https://fasttext.cc/) or +[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing +pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you +convert vectors for use with spaCy and will give you a directory you can load or +refer to in your [training configs](/usage/training#config). -When converting the vectors, there are two ways you can trim them down to make -your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` -option, which will remove entries for rarer words from the table. Alternatively, -you can use the `--prune-vectors` option to remap rarer words to the closest vector -that remains in the table. This allows the vectors table to return meaningful -(albeit imperfect) results for more words than you have rows in the table. 
+ + +For more details on loading word vectors into spaCy, using them for similarity +and improving word vector coverage by truncating and pruning the vectors, see +the usage guide on +[word vectors and similarity](/usage/linguistic-features#vectors-similarity). + + ### Using word vectors in your models {#word-vectors-models} From ed0fb034cb487a1fcc206e250ca34c8a38b7e0de Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 18:11:10 +0200 Subject: [PATCH 25/46] ml_datasets v0.2.0a0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 69477c2d3..55fe627b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.2.0 +ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 6efb7688a65faae489de33073c1c40b11ec4f432 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 18:17:03 +0200 Subject: [PATCH 26/46] Draft pretrain usage --- website/docs/usage/embeddings-transformers.md | 86 ++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 9f73661c3..678237dc2 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -610,17 +610,83 @@ def MyCustomVectors( ## Pretraining {#pretraining} - - +The `spacy pretrain` command lets you initialize your models with information +from raw text. Without pretraining, the models for your components will usually +be initialized randomly. The idea behind pretraining is simple: random probably +isn't optimal, so if we have some text to learn from, we can probably find +a way to get the model off to a better start. The impact of `spacy pretrain` varies, +but it will usually be worth trying if you're not using a transformer model and +you have relatively little training data (for instance, fewer than 5,000 sentence). +A good rule of thumb is that pretraining will generally give you a similar accuracy +improvement to using word vectors in your model. If word vectors have given you +a 10% error reduction, the `spacy pretrain` command might give you another 10%, +for a 20% error reduction in total. - +The `spacy pretrain` command will take a specific subnetwork within one of your +components, and add additional layers to build a network for a temporary task, +that forces the model to learn something about sentence structure and word +cooccurrence statistics. Pretraining produces a binary weights file that can be +loaded back in at the start of training. The weights file specifies an initial +set of weights. Training then proceeds as normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork +must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). +The most common workflow is to use the `Tok2Vec` component to create a shared +token-to-vector layer for several components of your pipeline, and apply +pretraining to its whole model. + +The `spacy pretrain` command is configured using the `[pretraining]` section of +your config file. The `pretraining.component` and `pretraining.layer` settings +tell spaCy how to find the subnetwork to pretrain. 
The `pretraining.layer` +setting should be either the empty string (to use the whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's +built-in model architectures have a reference named `"tok2vec"` that will refer +to the right layer. + +```ini +# Pretrain nlp.get_pipe("tok2vec").model +[pretraining] +component = "tok2vec" +layer = "" + +[pretraining] +# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") +component = "textcat" +layer = "tok2vec" +``` + +two pretraining objectives are available, both of which are variants of the cloze +task Devlin et al (2018) introduced for BERT. + +* The *characters* objective asks the model to predict some number of leading and + trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the + model will try to predict the first two and last two characters of the word. + +* The *vectors* objective asks the model to predict the word's vector, from + a static embeddings table. This requires a word vectors model to be trained + and loaded. The vectors objective can optimize either a cosine or an L2 loss. + We've generally found cosine loss to perform better. + +These pretraining objectives use a trick that we term _language modelling with +approximate outputs (LMAO)_. The motivation for the trick is that predicting +an exact word ID introduces a lot of incidental complexity. You need a large +output layer, and even then, the vocabulary is too large, which motivates +tokenization schemes that do not align to actual word boundaries. At the end of +training, the output layer will be thrown away regardless: we just want a task +that forces the network to model something about word cooccurrence statistics. +Predicting leading and trailing characters does that more than adequately, as +the exact word sequence could be recovered with high accuracy if the initial +and trailing characters are predicted accurately. With the vectors objective, +the pretraining is use the embedding space learned by an algorithm such as +GloVe or word2vec, allowing the model to focus on the contextual +modelling we actual care about. + +The `[pretraining]` section has several configuration subsections that are +familiar from the training block: the `[pretraining.batcher]`, +[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +expect the same types of objects, although for pretraining your corpus does not +need to have any annotations, so you will often use a different reader, such as +`spacy.training.JsonlReader1`. > #### Raw text format > From a0b4389a3845a1692b934a6ca79caf54bb29b1a3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 19:24:48 +0200 Subject: [PATCH 27/46] Update docs [ci skip] --- website/docs/usage/embeddings-transformers.md | 200 +++++++++++------- 1 file changed, 121 insertions(+), 79 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 678237dc2..4adcd927c 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -610,99 +610,141 @@ def MyCustomVectors( ## Pretraining {#pretraining} -The `spacy pretrain` command lets you initialize your models with information -from raw text. Without pretraining, the models for your components will usually -be initialized randomly. 
The idea behind pretraining is simple: random probably -isn't optimal, so if we have some text to learn from, we can probably find -a way to get the model off to a better start. The impact of `spacy pretrain` varies, -but it will usually be worth trying if you're not using a transformer model and -you have relatively little training data (for instance, fewer than 5,000 sentence). -A good rule of thumb is that pretraining will generally give you a similar accuracy -improvement to using word vectors in your model. If word vectors have given you -a 10% error reduction, the `spacy pretrain` command might give you another 10%, -for a 20% error reduction in total. +The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your +models with **information from raw text**. Without pretraining, the models for +your components will usually be initialized randomly. The idea behind +pretraining is simple: random probably isn't optimal, so if we have some text to +learn from, we can probably find a way to get the model off to a better start. -The `spacy pretrain` command will take a specific subnetwork within one of your -components, and add additional layers to build a network for a temporary task, -that forces the model to learn something about sentence structure and word -cooccurrence statistics. Pretraining produces a binary weights file that can be -loaded back in at the start of training. The weights file specifies an initial -set of weights. Training then proceeds as normal. - -You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork -must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). -The most common workflow is to use the `Tok2Vec` component to create a shared -token-to-vector layer for several components of your pipeline, and apply -pretraining to its whole model. - -The `spacy pretrain` command is configured using the `[pretraining]` section of -your config file. The `pretraining.component` and `pretraining.layer` settings -tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer` -setting should be either the empty string (to use the whole model), or a -[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's -built-in model architectures have a reference named `"tok2vec"` that will refer -to the right layer. - -```ini -# Pretrain nlp.get_pipe("tok2vec").model -[pretraining] -component = "tok2vec" -layer = "" - -[pretraining] -# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") -component = "textcat" -layer = "tok2vec" -``` - -two pretraining objectives are available, both of which are variants of the cloze -task Devlin et al (2018) introduced for BERT. - -* The *characters* objective asks the model to predict some number of leading and - trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the - model will try to predict the first two and last two characters of the word. - -* The *vectors* objective asks the model to predict the word's vector, from - a static embeddings table. This requires a word vectors model to be trained - and loaded. The vectors objective can optimize either a cosine or an L2 loss. - We've generally found cosine loss to perform better. - -These pretraining objectives use a trick that we term _language modelling with -approximate outputs (LMAO)_. The motivation for the trick is that predicting -an exact word ID introduces a lot of incidental complexity. 
You need a large -output layer, and even then, the vocabulary is too large, which motivates -tokenization schemes that do not align to actual word boundaries. At the end of -training, the output layer will be thrown away regardless: we just want a task -that forces the network to model something about word cooccurrence statistics. -Predicting leading and trailing characters does that more than adequately, as -the exact word sequence could be recovered with high accuracy if the initial -and trailing characters are predicted accurately. With the vectors objective, -the pretraining is use the embedding space learned by an algorithm such as -GloVe or word2vec, allowing the model to focus on the contextual -modelling we actual care about. - -The `[pretraining]` section has several configuration subsections that are -familiar from the training block: the `[pretraining.batcher]`, -[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +Pretraining uses the same [`config.cfg`](/usage/training#config) file as the +regular training, which helps keep the settings and hyperparameters consistent. +The additional `[pretraining]` section has several configuration subsections +that are familiar from the training block: the `[pretraining.batcher]`, +`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and expect the same types of objects, although for pretraining your corpus does not -need to have any annotations, so you will often use a different reader, such as -`spacy.training.JsonlReader1`. +need to have any annotations, so you will often use a different reader, such as +the [`JsonlReader`](/api/toplevel#jsonlreader). > #### Raw text format > -> The raw text can be provided as JSONL (newline-delimited JSON) with a key -> `"text"` per entry. This allows the data to be read in line by line, while -> also allowing you to include newlines in the texts. +> The raw text can be provided in spaCy's +> [binary `.spacy` format](/api/data-formats#training) consisting of serialized +> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per +> entry. This allows the data to be read in line by line, while also allowing +> you to include newlines in the texts. > > ```json > {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} > {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} > ``` +> +> You can also use your own custom corpus loader instead. + +You can add a `[pretraining]` block to your config by setting the +`--pretraining` flag on [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config): ```cli $ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining ``` +You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config +and pass in optional config overrides, like the path to the raw text file: + ```cli -$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` + +### How pretraining works {#pretraining-details} + +The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually +be worth trying if you're **not using a transformer** model and you have +**relatively little training data** (for instance, fewer than 5,000 sentences). +A good rule of thumb is that pretraining will generally give you a similar +accuracy improvement to using word vectors in your model. 
If word vectors have +given you a 10% error reduction, pretraining with spaCy might give you another +10%, for a 20% error reduction in total. + +The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific +subnetwork** within one of your components, and add additional layers to build a +network for a temporary task, that forces the model to learn something about +sentence structure and word cooccurrence statistics. Pretraining produces a +**binary weights file** that can be loaded back in at the start of training. The +weights file specifies an initial set of weights. Training then proceeds as +normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the +subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be +a "tok2vec" layer). The most common workflow is to use the +[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for +several components of your pipeline, and apply pretraining to its whole model. + +#### Configuring the pretraining {#pretraining-configure} + +The [`spacy pretrain`](/api/cli#pretrain) command is configured using the +`[pretraining]` section of your [config file](/usage/training#config). The +`component` and `layer` settings tell spaCy how to **find the subnetwork** to +pretrain. The `layer` setting should be either the empty string (to use the +whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of +spaCy's built-in model architectures have a reference named `"tok2vec"` that +will refer to the right layer. + +```ini +### config.cfg +# 1. Use the whole model of the "tok2vec" component +[pretraining] +component = "tok2vec" +layer = "" + +# 2. Pretrain the "tok2vec" node of the "textcat" component +[pretraining] +component = "textcat" +layer = "tok2vec" +``` + +#### Pretraining objectives {#pretraining-details} + +Two pretraining objectives are available, both of which are variants of the +cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced +for BERT. The objective can be defined and configured via the +`[pretraining.objective]` config block. + +> ```ini +> ### Characters objective +> [pretraining.objective] +> type = "characters" +> n_characters = 4 +> ``` +> +> ```ini +> ### Vectors objective +> [pretraining.objective] +> type = "vectors" +> loss = "cosine" +> ``` + +- **Characters:** The `"characters"` objective asks the model to predict some + number of leading and trailing UTF-8 bytes for the words. For instance, + setting `n_characters = 2`, the model will try to predict the first two and + last two characters of the word. + +- **Vectors:** The `"vectors"` objective asks the model to predict the word's + vector, from a static embeddings table. This requires a word vectors model to + be trained and loaded. The vectors objective can optimize either a cosine or + an L2 loss. We've generally found cosine loss to perform better. + +These pretraining objectives use a trick that we term **language modelling with +approximate outputs (LMAO)**. The motivation for the trick is that predicting an +exact word ID introduces a lot of incidental complexity. You need a large output +layer, and even then, the vocabulary is too large, which motivates tokenization +schemes that do not align to actual word boundaries. At the end of training, the +output layer will be thrown away regardless: we just want a task that forces the +network to model something about word cooccurrence statistics. 
Predicting +leading and trailing characters does that more than adequately, as the exact +word sequence could be recovered with high accuracy if the initial and trailing +characters are predicted accurately. With the vectors objective, the pretraining +is use the embedding space learned by an algorithm such as +[GloVe](https://nlp.stanford.edu/projects/glove/) or +[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to +focus on the contextual modelling we actual care about. From a88106e852b08bcbbe607d5bb83929e5a13120f4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 03:01:29 +0200 Subject: [PATCH 28/46] Remove W106: HEAD and SENT_START in doc.from_array (#6086) * Remove W106: HEAD and SENT_START in doc.from_array This warning was hacky and being triggered too often. * Fix test --- spacy/errors.py | 3 --- spacy/tests/doc/test_doc_api.py | 5 ++--- spacy/tokens/doc.pyx | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..81e3616be 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,9 +119,6 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") - W106 = ("Both HEAD and SENT_START are included as attributes in " - "doc.from_array(). The parse trees based on the HEAD attribute " - "will override the values in SENT_START.") W107 = ("The property Doc.{prop} is deprecated. Use " "Doc.has_annotation(\"{attr}\") instead.") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..c979931b1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - # HEAD overrides SENT_START with warning + # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.warns(UserWarning): - new_doc.from_array(attrs, arr) + new_doc.from_array(attrs, arr) # no warning using default attrs attrs = doc._get_array_attrs() diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..2d9de278b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -817,8 +817,6 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: - warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id From d32ce121beb38d05e1e926053f1fdf9cce8d2aa6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 18 Sep 2020 13:41:12 +0200 Subject: [PATCH 29/46] Fix docs [ci skip] --- website/docs/api/top-level.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..a37f24213 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -84,7 +84,7 @@ Create a blank pipeline of a given language class. This function is the twin of | _keyword-only_ | | | `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. 
`"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | +| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ | ### spacy.info {#spacy.info tag="function"} From bbdb5f62b70e9e12c6d4a8d9581e064ce846d19c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 18 Sep 2020 14:26:42 +0200 Subject: [PATCH 30/46] Temporary work-around for scoring a subset of components (#6090) * Try hacking the scorer to work around sentence boundaries * Upd scorer * Set dev version * Upd scorer hack * Fix version * Improve comment on hack --- spacy/scorer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7f7418237..da22d59d4 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -270,6 +270,18 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference + # TODO + # This is a temporary hack to work around the problem that the scorer + # fails if you have examples that are not fully annotated for all + # the tasks in your pipeline. For instance, you might have a corpus + # of NER annotations that does not set sentence boundaries, but the + # pipeline includes a parser or senter, and then the score_weights + # are used to evaluate that component. When the scorer attempts + # to read the sentences from the gold document, it fails. + try: + list(getter(gold_doc, attr)) + except ValueError: + continue # Find all labels in gold and doc labels = set( [k.label_ for k in getter(gold_doc, attr)] From 0406200a1ea1c960cf6d07c11f91f3b4d7f2d551 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 18 Sep 2020 15:13:13 +0200 Subject: [PATCH 31/46] Update docs [ci skip] --- website/docs/api/data-formats.md | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index b9e185d9c..3ed846b9e 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -123,20 +123,11 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ### corpora {#config-corpora tag="section"} -This section defines a dictionary mapping of string keys to `Callable` -functions. Each callable takes an `nlp` object and yields -[`Example`](/api/example) objects. By default, the two keys `train` and `dev` -are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When -pretraining, an additional pretrain section is added that defaults to a -[`JsonlReader`](/api/top-level#JsonlReader). 
- -These subsections can be expanded with additional subsections, each referring to -a callback of type `Callable[[Language], Iterator[Example]]`: - > #### Example > > ```ini > [corpora] +> > [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths:train} @@ -148,28 +139,44 @@ a callback of type `Callable[[Language], Iterator[Example]]`: > [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = ${paths.raw} -> min_length = 5 -> max_length = 500 > -> [corpora.mydata] -> @readers = "my_reader.v1" -> shuffle = true +> [corpora.my_custom_data] +> @readers = "my_custom_reader.v1" > ``` -Alternatively, the `corpora` block could refer to one function with return type -`Dict[str, Callable[[Language], Iterator[Example]]]`: +This section defines a **dictionary** mapping of string keys to functions. Each +function takes an `nlp` object and yields [`Example`](/api/example) objects. By +default, the two keys `train` and `dev` are specified and each refer to a +[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain` +section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader). +You can also register custom functions that return a callable. + +| Name | Description | +| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `train` | Training data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ | +| `dev` | Development data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ | +| `pretrain` | Raw text for [pretraining](/usage/embeddings-transformers#pretraining), typically used in `[pretraining]` block (if available). ~~Callable[[Language], Iterator[Example]]~~ | +| ... | Any custom or alternative corpora. ~~Callable[[Language], Iterator[Example]]~~ | + +Alternatively, the `[corpora]` block can refer to **one function** that returns +a dictionary keyed by the corpus names. This can be useful if you want to load a +single corpus once and then divide it up into `train` and `dev` partitions. > #### Example > > ```ini > [corpora] -> @readers = "my_dict_reader.v1" +> @readers = "my_custom_reader.v1" > train_path = ${paths:train} > dev_path = ${paths:dev} > shuffle = true > > ``` +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `corpora` | A dictionary keyed by string names, mapped to corpus functions that receive the current `nlp` object and return an iterator of [`Example`](/api/example) objects. ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ | + ### training {#config-training tag="section"} This section defines settings and controls for the training and evaluation From eed4b785f51fcff2783e06306441f55437fc95fb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 15:45:55 +0200 Subject: [PATCH 32/46] Load vocab lookups tables at beginning of training Similar to how vectors are handled, move the vocab lookups to be loaded at the start of training rather than when the vocab is initialized, since the vocab doesn't have access to the full config when it's created. The option moves from `nlp.load_vocab_data` to `training.lookups`. 
Typically these tables will come from `spacy-lookups-data`, but any `Lookups` object can be provided. The loading from `spacy-lookups-data` is now strict, so configs for each language should specify the exact tables required. This also makes it easier to control whether the larger clusters and probs tables are included. To load `lexeme_norm` from `spacy-lookups-data`: ``` [training.lookups] @misc = "spacy.LoadLookupsData.v1" lang = ${nlp.lang} tables = ["lexeme_norm"] ``` --- spacy/cli/train.py | 1 + spacy/default_config.cfg | 2 +- spacy/language.py | 8 +++++++- spacy/schemas.py | 3 ++- spacy/tests/test_util.py | 7 ++----- spacy/util.py | 8 ++++++++ spacy/vocab.pyx | 17 +++++++++-------- 7 files changed, 30 insertions(+), 16 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 50306b350..c6b39c289 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -88,6 +88,7 @@ def train( sourced_components = get_sourced_components(config) with show_validation_error(config_path): nlp, config = util.load_model_from_config(config) + util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) if config["training"]["vectors"] is not None: util.load_vectors_into_model(nlp, config["training"]["vectors"]) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index c7c9593d7..1517421f0 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false lang = null pipeline = [] disabled = [] -load_vocab_data = true before_creation = null after_creation = null after_pipeline_creation = null @@ -58,6 +57,7 @@ accumulate_gradient = 1 init_tok2vec = ${paths.init_tok2vec} raw_text = ${paths.raw} vectors = null +lookups = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 max_epochs = 0 diff --git a/spacy/language.py b/spacy/language.py index d530e6b92..1d0990c55 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,6 +31,7 @@ from .schemas import ConfigSchema from .git_info import GIT_VERSION from . import util from . import about +from .lookups import load_lookups # This is the base config will all settings (training etc.) @@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory +@registry.misc("spacy.LoadLookupsData.v1") +def load_lookups_data(lang, tables): + lookups = load_lookups(lang=lang, tables=tables) + return lookups + + class Language: """A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your application. @@ -152,7 +159,6 @@ class Language: self.lang, self.Defaults, vectors_name=vectors_name, - load_data=self._config["nlp"]["load_vocab_data"], ) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): diff --git a/spacy/schemas.py b/spacy/schemas.py index 06bc4beed..c72b5ca8b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -8,6 +8,7 @@ from collections import defaultdict from thinc.api import Optimizer from .attrs import NAMES +from .lookups import Lookups if TYPE_CHECKING: # This lets us add type hints for mypy etc. 
without causing circular imports @@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") + lookups: Optional[Lookups] = Field(..., title="Vocab lookups") dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") @@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel): pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default") tokenizer: Callable = Field(..., title="The tokenizer to use") - load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 1f073ab32..8c931d31e 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -69,7 +69,6 @@ def test_util_dot_section(): [nlp] lang = "en" pipeline = ["textcat"] - load_vocab_data = false [components] @@ -95,15 +94,13 @@ def test_util_dot_section(): # not exclusive_classes assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten - assert not en_config["nlp"]["load_vocab_data"] - assert nl_config["nlp"]["load_vocab_data"] # default value True + assert en_config["nlp"]["pipeline"] == ["textcat"] + assert nl_config["nlp"]["pipeline"] == [] # default value [] # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): dot_to_object(en_config, "nlp.pipeline.tagger") with pytest.raises(KeyError): dot_to_object(en_config, "nlp.unknownattribute") - assert not dot_to_object(en_config, "nlp.load_vocab_data") - assert dot_to_object(nl_config, "nlp.load_vocab_data") assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) diff --git a/spacy/util.py b/spacy/util.py index 18b34e4d6..2e285a128 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -253,6 +253,14 @@ def load_vectors_into_model( nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) +def load_vocab_data_into_model( + nlp: "Language", *, lookups: Optional["Lookups"]=None +) -> None: + """Load vocab data.""" + if lookups: + nlp.vocab.load_lookups(lookups) + + def load_model( name: Union[str, Path], *, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ef0847e54..94289036a 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, vectors_name=None, load_data=True): +def create_vocab(lang, defaults, vectors_name=None): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available - if load_data: - tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"] - lookups = 
load_lookups(lang, tables=tables, strict=False) - else: - lookups = Lookups() lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} # This is messy, but it's the minimal working fix to Issue #639. lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words) @@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True): lex_attrs[NORM] = util.add_lookups( lex_attrs.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, - lookups.get_table("lexeme_norm", {}), ) return Vocab( lex_attr_getters=lex_attrs, - lookups=lookups, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), vectors_name=vectors_name, @@ -424,6 +417,14 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors + def load_lookups(self, lookups): + self.lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters[NORM], + lookups.get_table("lexeme_norm"), + ) + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. From 47080fba98bf7efd7432a0ac831d5715fad91a59 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 19:43:19 +0200 Subject: [PATCH 33/46] Minor renaming / refactoring * Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message * Make `Vocab.lookups` a property --- spacy/language.py | 3 ++- spacy/util.py | 2 +- spacy/vocab.pxd | 2 +- spacy/vocab.pyx | 19 ++++++++++++------- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 1d0990c55..7d463731a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -87,8 +87,9 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory -@registry.misc("spacy.LoadLookupsData.v1") +@registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): + util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") lookups = load_lookups(lang=lang, tables=tables) return lookups diff --git a/spacy/util.py b/spacy/util.py index 2e285a128..88162b23a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -258,7 +258,7 @@ def load_vocab_data_into_model( ) -> None: """Load vocab data.""" if lookups: - nlp.vocab.load_lookups(lookups) + nlp.vocab.lookups = lookups def load_model( diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 69cec7d3d..7d8dfd5d6 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -28,7 +28,7 @@ cdef class Vocab: cpdef readonly StringStore strings cpdef public Morphology morphology cpdef public object vectors - cpdef public object lookups + cpdef public object _lookups cpdef public object writing_system cpdef public object get_noun_chunks cdef readonly int length diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 94289036a..ce104d9db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -417,13 +417,18 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def load_lookups(self, lookups): - self.lookups = lookups - if lookups.has_table("lexeme_norm"): - self.lex_attr_getters[NORM] = util.add_lookups( - self.lex_attr_getters[NORM], - lookups.get_table("lexeme_norm"), - ) + property lookups: + def __get__(self): + return self._lookups + + def __set__(self, lookups): + self._lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + self.lookups.get_table("lexeme_norm"), + ) + def to_disk(self, path, *, exclude=tuple()): """Save 
the current state to a directory. From 39872de1f6e49c4b59ed747a2f15ca448a52f7db Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 19 Sep 2020 01:17:02 +0200 Subject: [PATCH 34/46] Introducing the gpu_allocator (#6091) * rename 'use_pytorch_for_gpu_memory' to 'gpu_allocator' * --code instead of --code-path * update documentation * avoid querying the "system" section directly * add explanation of gpu_allocator to TF/PyTorch section in docs * fix typo * fix typo 2 * use set_gpu_allocator from thinc 8.0.0a34 * default null instead of empty string --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/cli/debug_model.py | 9 +++++++-- spacy/cli/pretrain.py | 17 +++++++++-------- spacy/cli/templates/quickstart_training.jinja | 2 +- spacy/cli/train.py | 13 ++++++------- spacy/default_config.cfg | 4 ++-- spacy/schemas.py | 1 + website/docs/api/cli.md | 4 +++- website/docs/api/data-formats.md | 1 + website/docs/api/top-level.md | 14 ++++++++------ website/docs/usage/layers-architectures.md | 12 ++++++++++++ 13 files changed, 54 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a413a099c..5290660aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a33,<8.0.0a40", + "thinc>=8.0.0a34,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 55fe627b8..4d6c1dfd0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a33,<8.0.0a40 +thinc>=8.0.0a34,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 359e63172..dd0975800 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a33,<8.0.0a40 + thinc>=8.0.0a34,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a33,<8.0.0a40 + thinc>=8.0.0a34,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index a4899a458..349849f58 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,7 +2,7 @@ from typing import Dict, Any, Optional from pathlib import Path from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam -from thinc.api import Model, data_validation +from thinc.api import Model, data_validation, set_gpu_allocator import typer from ._util import Arg, Opt, debug_cli, show_validation_error @@ -53,7 +53,12 @@ def debug_model_cli( } config_overrides = parse_config_overrides(ctx.args) with show_validation_error(config_path): - config = util.load_config(config_path, overrides=config_overrides) + config = util.load_config( + config_path, overrides=config_overrides, interpolate=True + ) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) nlp, config = util.load_model_from_config(config_path) seed = config["training"]["seed"] if seed is not None: diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index aec077eb7..9e913396e 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -4,10 +4,9 @@ import time import re from collections import Counter from pathlib import Path -from thinc.api import 
Config -from thinc.api import use_pytorch_for_gpu_memory, require_gpu +from thinc.api import require_gpu, set_gpu_allocator from thinc.api import set_dropout_rate, to_categorical, fix_random_seed -from thinc.api import CosineDistance, L2Distance +from thinc.api import Config, CosineDistance, L2Distance from wasabi import msg import srsly from functools import partial @@ -32,7 +31,7 @@ def pretrain_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), @@ -99,10 +98,12 @@ def pretrain( epoch_resume: Optional[int] = None, use_gpu: int = -1, ): - if config["system"].get("seed") is not None: - fix_random_seed(config["system"]["seed"]) - if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"): - use_pytorch_for_gpu_memory() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] corpus = dot_to_object(config, P_cfg["corpus"]) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 00b77af4d..ef608e5e8 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -8,7 +8,7 @@ train = "" dev = "" [system] -use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }} +gpu_allocator = {{ "pytorch" if use_transformer else "" }} [nlp] lang = "{{ lang }}" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 50306b350..debecd0b1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -6,8 +6,7 @@ from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed -from thinc.api import Config, Optimizer +from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator import random import typer import logging @@ -29,7 +28,7 @@ def train_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", 
help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), resume: bool = Opt(False, "--resume", "-R", help="Resume training"), @@ -79,11 +78,11 @@ def train( config = util.load_config( config_path, overrides=config_overrides, interpolate=True ) - if config.get("training", {}).get("seed") is not None: + if config["training"]["seed"] is not None: fix_random_seed(config["training"]["seed"]) - if config.get("system", {}).get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. - use_pytorch_for_gpu_memory() + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) with show_validation_error(config_path): diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index c7c9593d7..f4a453f2a 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -6,7 +6,7 @@ init_tok2vec = null [system] seed = 0 -use_pytorch_for_gpu_memory = false +gpu_allocator = null [nlp] lang = null @@ -52,6 +52,7 @@ limit = 0 # Training hyper-parameters and additional features. [training] seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 # Extra resources for transfer-learning or pseudo-rehearsal @@ -75,7 +76,6 @@ train_corpus = "corpora.train" [training.logger] @loggers = "spacy.ConsoleLogger.v1" - [training.batcher] @batchers = "spacy.batch_by_words.v1" discard_oversize = false diff --git a/spacy/schemas.py b/spacy/schemas.py index 06bc4beed..db71af9ca 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -207,6 +207,7 @@ class ConfigSchemaTraining(BaseModel): max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") seed: Optional[StrictInt] = Field(..., title="Random seed") + gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index bd65a1516..7374e1e3f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. 
~~Any (option/flag)~~ | | **CREATES** | The final trained pipeline and the best trained pipeline. | @@ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | Name | Description | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3ed846b9e..6e80bb409 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -189,6 +189,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 5d850be01..3f51d21aa 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -145,9 +145,10 @@ pipelines. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Description | -| ----------- | --------------------------------------- | -| **RETURNS** | Whether the GPU was activated. ~~bool~~ | +| Name | Description | +| ----------- | ------------------------------------------------ | +| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ | +| **RETURNS** | Whether the GPU was activated. 
~~bool~~ | ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} @@ -164,9 +165,10 @@ and _before_ loading any pipelines. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Description | -| ----------- | --------------- | -| **RETURNS** | `True` ~~bool~~ | +| Name | Description | +| ----------- | ------------------------------------------------ | +| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ | +| **RETURNS** | `True` ~~bool~~ | ## displaCy {#displacy source="spacy/displacy"} diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index aefc64ece..f9787d815 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible. +Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU +memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or +"tensorflow" in the training config, cupy will allocate memory via those respective libraries, +preventing OOM errors when there's available memory sitting in the other +library's pool. + +```ini +### config.cfg (excerpt) +[training] +gpu_allocator = "pytorch" +``` + ## Custom models with Thinc {#thinc} Of course it's also possible to define the `Model` from the previous section From 554c9a24978d968113da02783c7257b5133ec5e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 12:30:53 +0200 Subject: [PATCH 35/46] Update docs [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 6 +++++- website/docs/api/data-formats.md | 7 +++---- website/docs/api/top-level.md | 10 ++++++++++ website/docs/usage/embeddings-transformers.md | 10 ++++++++++ website/docs/usage/projects.md | 10 ++++------ 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index ef608e5e8..0db4c8a59 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -8,7 +8,11 @@ train = "" dev = "" [system] -gpu_allocator = {{ "pytorch" if use_transformer else "" }} +{% if use_transformer -%} +gpu_allocator = "pytorch" +{% else -%} +gpu_allocator = null +{% endif %} [nlp] lang = "{{ lang }}" diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6e80bb409..3a214428b 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -60,7 +60,6 @@ your config and check that it's valid, you can run the > [nlp] > lang = "en" > pipeline = ["tagger", "parser", "ner"] -> load_vocab_data = true > before_creation = null > after_creation = null > after_pipeline_creation = null @@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | | `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). 
~~List[str]~~ | -| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | @@ -189,9 +187,10 @@ process that are used when you run [`spacy train`](/api/cli#train). | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | @@ -476,7 +475,7 @@ lexical data. Here's an example of the 20 most frequent lexemes in the English training data: ```json -%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl +%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl ``` ## Pipeline meta {#meta} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 3f51d21aa..7afe02403 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -458,6 +458,16 @@ remain in the config file stored on your local system. | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | + + +Get started with tracking your spaCy training runs in Weights & Biases using our +project template. 
It trains on the IMDB Movie Review Dataset and includes a +simple config with the built-in `WandbLogger`, as well as a custom example of +creating variants of the config for a simple hyperparameter grid search and +logging the results. + + + ## Readers {#readers source="spacy/training/corpus.py" new="3"} Corpus readers are registered functions that load data and return a function diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 4adcd927c..c6c703842 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -655,6 +655,16 @@ and pass in optional config overrides, like the path to the raw text file: $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` +The following defaults are used for the `[pretraining]` block and merged into +your existing config when you run [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, +you can [configure](#pretraining-configure) the settings and hyperparameters or +change the [objective](#pretraining-details). + +```ini +%%GITHUB_SPACY/spacy/default_config_pretraining.cfg +``` + ### How pretraining works {#pretraining-details} The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 665caa15b..08bfb9da2 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -976,14 +976,12 @@ your results. ![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') - From 889128e5c586f39eb6f18ae6a6b6fbe1505f4080 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:20:57 +0200 Subject: [PATCH 36/46] Improve error handling in run_command --- spacy/util.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 88162b23a..6e7b28fec 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -659,8 +659,8 @@ def join_command(command: List[str]) -> str: def run_command( command: Union[str, List[str]], *, - capture: bool = False, stdin: Optional[Any] = None, + capture: bool=False, ) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -668,33 +668,46 @@ def run_command( command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. stdin (Optional[Any]): stdin to read from or None. - capture (bool): Whether to capture the output. + capture (bool): Whether to capture the output and errors. If False, + the stdout and stderr will not be redirected, and if there's an error, + sys.exit will be called with the returncode. You should use capture=False + when you want to turn over execution to the command, and capture=True + when you want to run the command more like a function. RETURNS (Optional[CompletedProcess]): The process object. 
""" if isinstance(command, str): - command = split_command(command) + cmd_list = split_command(command) + cmd_str = command + else: + cmd_list = command + cmd_str = " ".join(command) try: ret = subprocess.run( - command, + cmd_list, env=os.environ.copy(), input=stdin, encoding="utf8", - check=True, + check=False, stdout=subprocess.PIPE if capture else None, - stderr=subprocess.PIPE if capture else None, + stderr=subprocess.STDOUT if capture else None, ) except FileNotFoundError: + # Indicates the *command* wasn't found, it's an error before the command + # is run. raise FileNotFoundError( - Errors.E970.format(str_command=" ".join(command), tool=command[0]) + Errors.E970.format(str_command=cmd_str, tool=cmd_list[0]) ) from None - except subprocess.CalledProcessError as e: - # We don't want a duplicate traceback here so we're making sure the - # CalledProcessError isn't re-raised. We also print both the string - # message and the stderr, in case the error only has one of them. - print(e.stderr) - print(e) - sys.exit(1) - if ret.returncode != 0: + if ret.returncode != 0 and capture: + message = f"Error running command:\n\n{cmd_str}\n\n" + message += f"Subprocess exited with status {ret.returncode}" + if ret.stdout is not None: + message += f"\n\nProcess log (stdout and stderr):\n\n" + message += ret.stdout + error = subprocess.SubprocessError(message) + error.ret = ret + error.command = cmd_str + raise error + elif ret.returncode != 0: sys.exit(ret.returncode) return ret From 2c24d633d0f81e17dca2158b5185f316ae910130 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:21:43 +0200 Subject: [PATCH 37/46] Use updated run_command --- spacy/cli/package.py | 2 +- spacy/cli/project/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8d6cd84c1..49a0ab75d 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -110,7 +110,7 @@ def package( msg.good(f"Successfully created package '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): - util.run_command([sys.executable, "setup.py", "sdist"]) + util.run_command([sys.executable, "setup.py", "sdist"], capture=False) zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" msg.good(f"Successfully created zipped Python package", zip_file) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index eb7b8cc5b..13c28f1da 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -144,7 +144,7 @@ def run_commands( if not silent: print(f"Running command: {join_command(command)}") if not dry: - run_command(command) + run_command(command, capture=False) def validate_subcommand( From a0fb5e50dbb1e24901f7b1470ee53cc6bce7a4d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:22:04 +0200 Subject: [PATCH 38/46] Use simple git clone call if not sparse --- spacy/cli/_util.py | 77 ++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index e8f3be995..6675f4d50 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -308,6 +308,31 @@ def git_checkout( msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): raise IOError("Parent of destination of checkout must exist") + + if sparse and git_version >= (2, 22): + return git_sparse_checkout(repo, subpath, dest, branch) + elif sparse: + # Only show warnings if the user explicitly wants sparse checkout but + # the Git version 
doesn't support it + err_old = ( + f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " + f"that doesn't fully support sparse checkout yet." + ) + err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." + msg.warn( + f"{err_unk if git_version == (0, 0) else err_old} " + f"This means that more files than necessary may be downloaded " + f"temporarily. To only download the files needed, make sure " + f"you're using Git v2.22 or above." + ) + with make_tempdir() as tmp_dir: + cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" + ret = run_command(cmd, capture=True) + # We need Path(name) to make sure we also support subdirectories + shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) + + +def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git, partial clone and sparse checkout to # only clone the files we need # This ends up being RIDICULOUS. omg. @@ -324,47 +349,28 @@ def git_checkout( # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: - supports_sparse = git_version >= (2, 22) - use_sparse = supports_sparse and sparse # This is the "clone, but don't download anything" part. - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " - if use_sparse: - cmd += f"--filter=blob:none" # <-- The key bit - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - elif sparse: - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - try_run_command(cmd) + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none" + run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" - ret = try_run_command(cmd) + ret = run_command(cmd, capture=True) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if use_sparse and not missings: + if not missings: err = ( f"Could not find any relevant files for '{subpath}'. " f"Did you specify a correct and complete path within repo '{repo}' " f"and branch {branch}?" ) msg.fail(err, exits=1) - if use_sparse: - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - try_run_command(cmd) + cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" + run_command(cmd, capture=True) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - try_run_command(cmd) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) @@ -378,7 +384,7 @@ def get_git_version( RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. 
""" - ret = try_run_command(["git", "--version"], error=error) + ret = run_command("git --version", capture=True) stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): return (0, 0) @@ -386,23 +392,6 @@ def get_git_version( return (int(version[0]), int(version[1])) -def try_run_command( - cmd: Union[str, List[str]], error: str = "Could not run command" -) -> subprocess.CompletedProcess: - """Try running a command and raise an error if it fails. - - cmd (Union[str, List[str]]): The command to run. - error (str): The error message. - RETURNS (CompletedProcess): The completed process if the command ran. - """ - try: - return run_command(cmd, capture=True) - except subprocess.CalledProcessError as e: - msg.fail(error) - print(cmd) - sys.exit(1) - - def _from_http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") From dc22771f879455a81d8338588aa726a58b08bf50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:30:05 +0200 Subject: [PATCH 39/46] Fix sparse checkout --- spacy/cli/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 6675f4d50..cc7be1144 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -354,7 +354,7 @@ def git_sparse_checkout(repo, subpath, dest, branch): run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" + cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals From 8fb59d958c9676f32d84227c0b042a26b088da35 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:31:48 +0200 Subject: [PATCH 40/46] Format --- spacy/cli/_util.py | 5 ++++- spacy/util.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index cc7be1144..c67863ef1 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -350,7 +350,10 @@ def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: # This is the "clone, but don't download anything" part. - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none" + cmd = ( + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + f"-b {branch} --filter=blob:none" + ) run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. diff --git a/spacy/util.py b/spacy/util.py index 6e7b28fec..93000ea27 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -254,7 +254,7 @@ def load_vectors_into_model( def load_vocab_data_into_model( - nlp: "Language", *, lookups: Optional["Lookups"]=None + nlp: "Language", *, lookups: Optional["Lookups"] = None ) -> None: """Load vocab data.""" if lookups: @@ -660,7 +660,7 @@ def run_command( command: Union[str, List[str]], *, stdin: Optional[Any] = None, - capture: bool=False, + capture: bool = False, ) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. 
From 744f259b9c93858d97937157414cb67641d4c846 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 16:37:23 +0200 Subject: [PATCH 41/46] Update landing [ci skip] --- website/src/widgets/landing.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 77fcdfd81..41b009010 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md' const CODE_EXAMPLE = `# pip install spacy # python -m spacy download en_core_web_sm - import spacy # Load English tokenizer, tagger, parser and NER @@ -120,7 +119,7 @@ const Landing = ({ data }) => {
  • ✅ Components for named entity recognition, - part-of-speech-tagging, dependency parsing, sentence segmentation,{' '} + part-of-speech tagging, dependency parsing, sentence segmentation,{' '} text classification, lemmatization, morphological analysis, entity linking and more
  • From b2302c0a1ce7bacafdde22039cbd8da9782a3f27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:44:51 +0200 Subject: [PATCH 42/46] Improve error for missing dependency --- spacy/cli/project/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 13c28f1da..d7e1075f3 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -59,8 +59,9 @@ def project_run( for dep in cmd.get("deps", []): if not (project_dir / dep).exists(): err = f"Missing dependency specified by command '{subcommand}': {dep}" + err_help = "Maybe you forgot to run the 'project assets' command?" err_kwargs = {"exits": 1} if not dry else {} - msg.fail(err, **err_kwargs) + msg.fail(err, err_help, **err_kwargs) with working_dir(project_dir) as current_dir: rerun = check_rerun(current_dir, cmd) if not rerun and not force: From 012b3a709636224534e44720bca00cb0cc6e3f92 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:44:58 +0200 Subject: [PATCH 43/46] Update docs [ci skip] --- website/docs/api/cli.md | 4 +-- website/docs/usage/embeddings-transformers.md | 5 ++- website/docs/usage/facts-figures.md | 6 ++-- website/docs/usage/layers-architectures.md | 12 +++---- website/docs/usage/models.md | 2 -- website/docs/usage/projects.md | 18 ++++------ website/docs/usage/saving-loading.md | 13 +++++-- website/docs/usage/training.md | 11 +++++- website/docs/usage/v3.md | 34 +++++++++++-------- website/meta/site.json | 1 + website/src/components/tag.js | 2 +- website/src/components/util.js | 1 + website/src/widgets/landing.js | 9 ++--- website/src/widgets/project.js | 18 ++++++---- 14 files changed, 77 insertions(+), 59 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7374e1e3f..53cd954be 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -895,8 +895,6 @@ what you need. By default, spaCy's can provide any other repo (public or private) that you have access to using the `--repo` option. - - ```cli $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] ``` @@ -904,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] > #### Example > > ```cli -> $ python -m spacy project clone some_example +> $ python -m spacy project clone pipelines/ner_wikiner > ``` > > Clone from custom repo: diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index c6c703842..a855d703c 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register yourself. For details on how to get started with training your own model, check out the [training quickstart](/usage/training#quickstart). - + > #### Evaluation details > @@ -68,6 +68,6 @@ our project template. - +--> diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index f9787d815..a58ba2ba9 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -356,11 +356,11 @@ that training configs are complete and experiments fully reproducible. -Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU -memory allocator accordingly. 
When `gpu_allocator` is set to "pytorch" or -"tensorflow" in the training config, cupy will allocate memory via those respective libraries, -preventing OOM errors when there's available memory sitting in the other -library's pool. +Note that when using a PyTorch or Tensorflow model, it is recommended to set the +GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or +"tensorflow" in the training config, cupy will allocate memory via those +respective libraries, preventing OOM errors when there's available memory +sitting in the other library's pool. ```ini ### config.cfg (excerpt) @@ -489,7 +489,7 @@ with Model.define_operators({">>": chain}): - - ### Downloading and requiring package dependencies {#models-download} spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 08bfb9da2..f8d5a3761 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new ![Illustration of project workflow and commands](../images/projects.svg) - spaCy projects make it easy to integrate with many other **awesome tools** in the data science and machine learning ecosystem to track and manage your data @@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the project, e.g. to train a pipeline and edit the commands and scripts to build fully custom workflows. - - ```cli -python -m spacy project clone some_example_project +python -m spacy project clone pipelines/tagger_parser_ud ``` By default, the project will be cloned into the current working directory. You @@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up a quick web demo. It looks pretty similar to a config file used to define CI pipelines. - - ```yaml -https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml +https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml ``` | Section | Description | diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index c0fe1323c..3a95bf6aa 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data, meta and configuration will be written out. To make the pipeline more convenient to deploy, we recommend wrapping it as a [Python package](/api/cli#package). - + When you save a pipeline in spaCy v3.0+, two files will be exported: a [`config.cfg`](/api/data-formats#config) based on @@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta). + + +The easiest way to get started with an end-to-end workflow is to clone a +[project template](/usage/projects) and run it – for example, this template that +lets you train a **part-of-speech tagger** and **dependency parser** on a +Universal Dependencies treebank and generates an installable Python package. + + + ### Generating a pipeline package {#models-generating} @@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead. 
```python nlp = spacy.blank("en").from_disk("/path/to/data") ``` - - diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index c0f4caad7..6e9de62c5 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy ``` - + The recommended config settings generated by the quickstart widget and the [`init config`](/api/cli#init-config) command are based on some general **best @@ -112,6 +112,15 @@ as we run more experiments. + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. + + + ## Training config {#config} Training config files include all **settings and hyperparameters** for training diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 24babc9bd..5abeb5707 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model. ### Manage end-to-end workflows with projects {#features-projects} - - > #### Example > > ```cli > # Clone a project template -> $ python -m spacy project clone example -> $ cd example +> $ python -m spacy project clone pipelines/tagger_parser_ud +> $ cd tagger_parser_ud > # Download data assets > $ python -m spacy project assets > # Run a workflow -> $ python -m spacy project run train +> $ python -m spacy project run all > ``` spaCy projects let you manage and share **end-to-end spaCy workflows** for @@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps, [Ray](/usage/projects#ray) for parallel training, [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! - - - **Usage:** [spaCy projects](/usage/projects), @@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline. + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. + + + ### Parallel and distributed training with Ray {#features-parallel-training} > #### Example @@ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training: + python -m spacy train ./config.cfg --output ./output ``` - + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. 
+ + #### Training via the Python API {#migrating-training-python} diff --git a/website/meta/site.json b/website/meta/site.json index 1955932b9..1a96ca660 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -12,6 +12,7 @@ "companyUrl": "https://explosion.ai", "repo": "explosion/spaCy", "modelsRepo": "explosion/spacy-models", + "projectsRepo": "explosion/projects/tree/v3", "social": { "twitter": "spacy_io", "github": "explosion" diff --git a/website/src/components/tag.js b/website/src/components/tag.js index 3f2b4e994..b406e771e 100644 --- a/website/src/components/tag.js +++ b/website/src/components/tag.js @@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) { const isValid = isString(children) && !isNaN(children) const version = isValid ? Number(children).toFixed(1) : children const tooltipText = `This feature is new and was introduced in spaCy v${version}` - // TODO: we probably want to handle this more elegantly, but the idea is + // We probably want to handle this more elegantly, but the idea is // that we can hide tags referring to old versions const major = isString(version) ? Number(version.split('.')[0]) : version return major < MIN_VERSION ? null : ( diff --git a/website/src/components/util.js b/website/src/components/util.js index 3d86cf37e..be55f0bb3 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser() const DEFAULT_BRANCH = 'develop' export const repo = siteMetadata.repo export const modelsRepo = siteMetadata.modelsRepo +export const projectsRepo = siteMetadata.projectsRepo /** * This is used to provide selectors for headings so they can be crawled by diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 41b009010..2e75c893a 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -222,10 +222,11 @@ const Landing = ({ data }) => {


    - {/** TODO: update with actual example */} - - Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum - sodales lectus. + + The easiest way to get started is to clone a project template and run it + – for example, this template for training a{' '} + part-of-speech tagger and{' '} + dependency parser on a Universal Dependencies treebank. diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index 0bd74bc90..8d309394d 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -4,25 +4,29 @@ import CopyInput from '../components/copy' import Infobox from '../components/infobox' import Link from '../components/link' import { InlineCode } from '../components/code' +import { projectsRepo } from '../components/util' -// TODO: move to meta? -const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3' const COMMAND = 'python -m spacy project clone' -export default function Project({ id, repo, children }) { +export default function Project({ + title = 'Get started with a project template', + id, + repo, + children, +}) { const repoArg = repo ? ` --repo ${repo}` : '' const text = `${COMMAND} ${id}${repoArg}` - const url = `${repo || DEFAULT_REPO}/${id}` - const title = ( + const url = `${repo || projectsRepo}/${id}` + const header = ( <> - Get started with a project template:{' '} + {title}:{' '} {id} ) return ( - + {children} From b9d2b29684c051f956ec808705a2e7288ccf27dd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:49:09 +0200 Subject: [PATCH 44/46] Update docs [ci skip] --- website/src/styles/copy.module.sass | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/website/src/styles/copy.module.sass b/website/src/styles/copy.module.sass index c6d2f68cb..3a942552d 100644 --- a/website/src/styles/copy.module.sass +++ b/website/src/styles/copy.module.sass @@ -15,6 +15,10 @@ background: transparent resize: none font: inherit + overflow: hidden + white-space: nowrap + text-overflow: ellipsis + margin-right: 1rem .prefix margin-right: 0.75em From 9d32cac736da47351e3f38f961aae2fc9e591401 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 10:55:36 +0200 Subject: [PATCH 45/46] Update docs [ci skip] --- website/docs/usage/projects.md | 12 ++++++++---- website/docs/usage/training.md | 8 ++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index f8d5a3761..95e20525a 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -921,6 +921,14 @@ package is installed in the same environment as spaCy, it will automatically add [parallel training](/usage/training#parallel-training) for more details on how it works under the hood. + + +Get started with parallel training using our project template. It trains a +simple model on a Universal Dependencies Treebank and lets you parallelize the +training with Ray. + + + You can integrate [`spacy ray train`](/api/cli#ray-train) into your `project.yml` just like the regular training command and pass it the config, and optional output directory or remote storage URL and config overrides if needed. @@ -940,10 +948,6 @@ commands: - "training/model-best" ``` - - --- ### Weights & Biases {#wandb} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 6e9de62c5..071434162 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -895,9 +895,13 @@ cluster. If it's not set, Ray will run locally. 
python -m spacy ray train config.cfg --n-workers 2 ``` - +Get started with parallel training using our project template. It trains a +simple model on a Universal Dependencies Treebank and lets you parallelize the +training with Ray. + + ### How parallel training works {#parallel-training-details} From 1114219ae3034a9bec070967cdbf03001ea747d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 10:59:07 +0200 Subject: [PATCH 46/46] Tidy up and auto-format --- spacy/cli/_util.py | 3 +-- spacy/language.py | 8 ++------ spacy/ml/models/tok2vec.py | 16 +++++++++++----- spacy/schemas.py | 4 ++-- spacy/tests/doc/test_span.py | 7 ++++++- spacy/tests/parser/test_parse_navigate.py | 7 ++++++- spacy/tests/pipeline/test_pipe_factories.py | 15 +++------------ spacy/tests/regression/test_issue1501-2000.py | 12 ++++++++++-- .../tests/serialize/test_serialize_pipeline.py | 8 +++++++- spacy/tests/test_cli.py | 1 - spacy/tests/test_language.py | 3 +-- spacy/tests/test_util.py | 2 +- spacy/tests/training/test_readers.py | 17 ++++++++--------- spacy/tests/training/test_training.py | 12 +++++++++++- 14 files changed, 69 insertions(+), 46 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c67863ef1..040434c05 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -6,7 +6,6 @@ from wasabi import msg import srsly import hashlib import typer -import subprocess from click import NoSuchOption from typer.main import get_command from contextlib import contextmanager @@ -327,7 +326,7 @@ def git_checkout( ) with make_tempdir() as tmp_dir: cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" - ret = run_command(cmd, capture=True) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) diff --git a/spacy/language.py b/spacy/language.py index 7d463731a..4dffd9679 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -156,11 +156,7 @@ class Language: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - vocab = create_vocab( - self.lang, - self.Defaults, - vectors_name=vectors_name, - ) + vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1462,7 +1458,7 @@ class Language: # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i+1:]: + for name2, proc2 in self.pipeline[i + 1 :]: if isinstance(getattr(proc2, "model", None), Model): proc1.find_listeners(proc2.model) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 7ced4bd04..fec478e21 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -164,7 +164,9 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): +def CharacterEmbed( + width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool +): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. 
Padding is @@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect ), StaticVectors(width, dropout=0.0), ), - with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) else: model = chain( concatenate( @@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), ), ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) return model diff --git a/spacy/schemas.py b/spacy/schemas.py index 60655da8c..b0f26dcd7 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple +from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator @@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel): batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") - + # TODO: use a more detailed schema for this? objective: Dict[str, Any] = Field(..., title="Pretraining objective") # fmt: on diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index ad4f49042..0c538a0eb 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[2, 1, 1, 0], + deps=["dep"] * 4, + ) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index db1e98ba0..f181a799a 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) lefts = {} rights = {} diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 1cf06d97f..881460704 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -345,10 +345,7 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ( - [{"a": 0.5, "b": 0.5}, {"b": 1.0}], - {"a": 0.25, "b": 0.75}, - ), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), ], ) def test_language_factories_combine_score_weights(weights, 
expected): @@ -363,16 +360,10 @@ def test_language_factories_scores(): weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} Language.factory( - f"{name}1", - scores=list(weights1), - default_score_weights=weights1, - func=func, + f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, ) Language.factory( - f"{name}2", - scores=list(weights2), - default_score_weights=weights2, - func=func, + f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, ) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index e226c8524..71ed2ea03 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -212,9 +212,17 @@ def test_issue1834(): heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], deps=["dep"] * len(words), ) - print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) + print( + doc.has_annotation("DEP"), + [t.head.i for t in doc], + [t.is_sent_start for t in doc], + ) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + print( + new_doc.has_annotation("DEP"), + [t.head.i for t in new_doc], + [t.is_sent_start for t in new_doc], + ) assert new_doc[6].sent_start assert new_doc.has_annotation("DEP") assert new_doc.has_annotation("TAG") diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eedad31e0..d1c4553be 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None) + textcat = TextCategorizer( + en_vocab, + model, + labels=["ENTITY", "ACTION", "MODIFIER"], + threshold=0.5, + positive_label=None, + ) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0a2300455..422ae74b4 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -3,7 +3,6 @@ from click import NoSuchOption from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs -from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 2a24d368a..da46ad424 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -291,8 +291,7 @@ def test_spacy_blank(): @pytest.mark.parametrize( - "value", - [False, None, ["x", "y"], Language, Vocab], + "value", [False, None, ["x", "y"], Language, Vocab], ) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 8c931d31e..1668991cd 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -95,7 +95,7 @@ def test_util_dot_section(): assert 
en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten assert en_config["nlp"]["pipeline"] == ["textcat"] - assert nl_config["nlp"]["pipeline"] == [] # default value [] + assert nl_config["nlp"]["pipeline"] == [] # default value [] # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): dot_to_object(en_config, "nlp.pipeline.tagger") diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 898746c2a..d20a032e8 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,7 +1,6 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config - from spacy import Language from spacy.util import load_model_from_config, registry, dot_to_object from spacy.training import Example @@ -10,19 +9,19 @@ from spacy.training import Example def test_readers(): config_string = """ [training] - + [corpora] @readers = "myreader.v1" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ @@ -69,19 +68,19 @@ def test_readers(): def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - + [corpora] @readers = "PLACEHOLDER" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 1d3c72a8b..b09487965 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -34,7 +34,17 @@ def doc(): # fmt: on nlp = English() words = [t.text for t in nlp.make_doc(text)] - doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) + doc = get_doc( + nlp.vocab, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + heads=heads, + deps=deps, + lemmas=lemmas, + ents=ents, + ) doc.cats = cats return doc
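
The `layers-architectures.md` hunk in PATCH 43/46 above describes routing cupy's GPU allocations through PyTorch or TensorFlow so both libraries draw from a single memory pool. For reference, the snippet below is a minimal sketch of setting this up programmatically rather than through the training config; it assumes Thinc 8's `set_gpu_allocator` and `require_gpu` helpers are available, and `en_core_web_trf` is only a placeholder name for any transformer-based pipeline.

```python
import spacy
from thinc.api import set_gpu_allocator, require_gpu

# Make cupy allocate GPU memory via PyTorch's allocator so spaCy/Thinc and
# PyTorch share one pool instead of competing for device memory.
set_gpu_allocator("pytorch")
require_gpu(0)  # run spaCy on GPU 0

nlp = spacy.load("en_core_web_trf")  # placeholder transformer pipeline
doc = nlp("Routing allocations through one pool helps avoid OOM errors.")
```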