diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 0d4ce9a30..d16ecc1e6 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -65,7 +65,7 @@ def test_issue4590(en_vocab):


 def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
+    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
     the method from_disk when the EntityRuler argument phrase_matcher_attr is
     specified.
     """
@@ -87,7 +87,7 @@ def test_issue4651_with_phrase_matcher_attr():


 def test_issue4651_without_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
+    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
     the method from_disk when the EntityRuler argument phrase_matcher_attr is
     not specified.
     """
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d37423e2f..cd080bf35 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1193,8 +1193,7 @@ cdef class Doc:
                 retokenizer.merge(span, attributes[i])

     def to_json(self, underscore=None):
-        """Convert a Doc to JSON. The format it produces will be the new format
-        for the `spacy train` command (not implemented yet).
+        """Convert a Doc to JSON.

         underscore (list): Optional list of string names of custom doc._.
         attributes. Attribute values need to be JSON-serializable. Values will
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 8b67aa263..727c0f35c 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -127,26 +127,24 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy

 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).

-
-
 | Name                  | Description |
 | --------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
+| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ |
+| `dev_corpus`          | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
+| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~ |
+| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
 | `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
-| `vectors`             | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
-| `train_corpus`        | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
-| `dev_corpus`          | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
+| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
+| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
+| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `train_corpus`        | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
+| `vectors`             | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |

 ### pretraining {#config-pretraining tag="section,optional"}
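Reviewer note on the reordered `[training]` table: pulled together, the settings above correspond to a config block roughly like the sketch below. This is an illustrative excerpt assembled from the defaults listed in the table, not output of this PR; the `[training.optimizer]` and `[training.batcher]` blocks and their registry names (`Adam.v1`, `spacy.batch_by_words.v1`) are assumptions about the surrounding API.

```ini
# Illustrative [training] excerpt based on the defaults in the table above
[training]
seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200

# Function-valued settings become their own blocks with an @ registry entry
[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
```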
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 9c65b2982..797fa0191 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -299,20 +299,20 @@ factories.

 | Registry name     | Description |
 | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
-| `factories`       | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points). |
-| `tokenizers`      | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
-| `languages`       | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
-| `lookups`         | Registry for large lookup tables available via `vocab.lookups`. |
-| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `assets`          | Registry for data assets, knowledge bases etc. |
-| `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
-| `readers`         | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
 | `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
-| `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
-| `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
-| `layers`          | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
-| `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
+| `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
+| `displacy_colors` | Registry for custom color schemes for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
+| `factories`       | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points). |
 | `initializers`    | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
+| `languages`       | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
+| `layers`          | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
+| `lookups`         | Registry for large lookup tables available via `vocab.lookups`. |
+| `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
+| `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
+| `readers`         | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
+| `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
+| `tokenizers`      | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |

 ### spacy-transformers registry {#registry-transformers}
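Reviewer note on the registry table: all of these registries are used the same way, so one hedged sketch covers them. Assuming the v3 `spacy.registry` API, a custom schedule could be registered under a hypothetical name and then referenced from a config block:

```python
import spacy


@spacy.registry.schedules("my_schedule.v1")  # hypothetical name
def create_schedule(start: float, factor: float):
    """Yield the series start, start * factor, start * factor**2, ..."""
    def generate():
        value = start
        while True:
            yield value
            value *= factor
    return generate()
```

In a config, this would then be referenced as `@schedules = "my_schedule.v1"` with `start` and `factor` passed as keyword arguments, mirroring the `compounding.v1` example in the training docs below.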
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index a863c6c32..614f113b3 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -265,7 +265,7 @@ for doc in nlp.pipe(texts, disable=["tagger", "parser"]):

 If you need to **execute more code** with components disabled – e.g. to reset
 the weights or update only some components during training – you can use the
-[`nlp.select_pipes`](/api/language#select_pipes) contextmanager. At the end of
+[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of
 the `with` block, the disabled pipeline components will be restored
 automatically. Alternatively, `select_pipes` returns an object that lets you
 call its `restore()` method to restore the disabled components when needed. This
@@ -274,7 +274,7 @@ blocks.

 ```python
 ### Disable for block
-# 1. Use as a contextmanager
+# 1. Use as a context manager
 with nlp.select_pipes(disable=["tagger", "parser"]):
     doc = nlp("I won't be tagged and parsed")
 doc = nlp("I will be tagged and parsed")
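Reviewer note on the `select_pipes` passage above: the `restore()` alternative it mentions looks roughly like this. A sketch assuming a loaded pipeline, mirroring the context-manager example in the docs:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# 2. Use the returned object and restore the components explicitly
disabled = nlp.select_pipes(disable=["tagger", "parser"])
doc = nlp("I won't be tagged and parsed")
disabled.restore()
doc = nlp("I will be tagged and parsed")
```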
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 00eb2b882..c04d3ca77 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -35,8 +35,8 @@ ready-to-use spaCy models.
 The recommended way to train your spaCy models is via the
 [`spacy train`](/api/cli#train) command on the command line. It only needs a
 single [`config.cfg`](#config) **configuration file** that includes all settings
-and hyperparameters. You can optionally [overwritten](#config-overrides)
-settings on the command line, and load in a Python file to register
+and hyperparameters. You can optionally [overwrite](#config-overrides) settings
+on the command line, and load in a Python file to register
 [custom functions](#custom-code) and architectures. This quickstart widget
 helps you generate a starter config with the **recommended settings** for your
 specific use case. It's also available in spaCy as the
@@ -82,7 +82,7 @@ $ python -m spacy init fill-config base_config.cfg config.cfg

 Instead of exporting your starter config from the quickstart widget and
 auto-filling it, you can also use the [`init config`](/api/cli#init-config)
-command and specify your requirement and settings and CLI arguments. You can now
+command and specify your requirements and settings as CLI arguments. You can now
 add your data and run [`train`](/api/cli#train) with your config. See the
 [`convert`](/api/cli#convert) command for details on how to convert your data to
 spaCy's binary `.spacy` format. You can either include the data paths in the
@@ -104,11 +104,6 @@ workflows, from data preprocessing to training and packaging your model.

 ## Training config {#config}

-
 Training config files include all **settings and hyperparameters** for training
 your model. Instead of providing lots of arguments on the command line, you
 only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under
@@ -126,9 +121,10 @@ Some of the main advantages and features of spaCy's training config are:
   functions like [model architectures](/api/architectures),
   [optimizers](https://thinc.ai/docs/api-optimizers) or
   [schedules](https://thinc.ai/docs/api-schedules) and define arguments that are
-  passed into them. You can also register your own functions to define
-  [custom architectures](#custom-functions), reference them in your config and
-  tweak their parameters.
+  passed into them. You can also
+  [register your own functions](#custom-functions) to define custom
+  architectures or methods, reference them in your config and tweak their
+  parameters.
 - **Interpolation.** If you have hyperparameters or other settings used by
   multiple components, define them once and reference them as
   [variables](#config-interpolation).
@@ -226,21 +222,21 @@ passed to the component factory as arguments. This lets you configure the model
 settings and hyperparameters. If a component block defines a `source`, the
 component will be copied over from an existing pretrained model, with its
 existing weights. This lets you include an already trained component in your
-model pipeline, or update a pretrained components with more data specific to
-your use case.
+model pipeline, or update a pretrained component with more data specific to your
+use case.

 ```ini
 ### config.cfg (excerpt)
 [components]

-# "parser" and "ner" are sourced from pretrained model
+# "parser" and "ner" are sourced from a pretrained model
 [components.parser]
 source = "en_core_web_sm"

 [components.ner]
 source = "en_core_web_sm"

-# "textcat" and "custom" are created blank from built-in / custom factory
+# "textcat" and "custom" are created blank from a built-in / custom factory
 [components.textcat]
 factory = "textcat"
@@ -294,11 +290,11 @@ batch_size = 128
 ```

 To refer to a function instead, you can make `[training.batch_size]` its own
-section and use the `@` syntax specify the function and its arguments – in this
-case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding) defined
-in the [function registry](/api/top-level#registry). All other values defined in
-the block are passed to the function as keyword arguments when it's initialized.
-You can also use this mechanism to register
+section and use the `@` syntax to specify the function and its arguments – in
+this case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding)
+defined in the [function registry](/api/top-level#registry). All other values
+defined in the block are passed to the function as keyword arguments when it's
+initialized. You can also use this mechanism to register
 [custom implementations and architectures](#custom-functions) and reference
 them from your configs.
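Reviewer note on the `@` syntax hunk above: the block form of `batch_size` that the rewritten sentence describes would look something like the following excerpt. The parameter values are illustrative examples, not recommendations:

```ini
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
```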
@@ -726,9 +722,9 @@ a stream of items into a stream of batches. spaCy has several useful built-in
 [batching strategies](/api/top-level#batchers) with customizable sizes, but it's
 also easy to implement your own. For instance, the following function takes the
 stream of generated [`Example`](/api/example) objects, and removes those which
-have the exact same underlying raw text, to avoid duplicates within each batch.
-Note that in a more realistic implementation, you'd also want to check whether
-the annotations are exactly the same.
+have the same underlying raw text, to avoid duplicates within each batch. Note
+that in a more realistic implementation, you'd also want to check whether the
+annotations are the same.

 > #### config.cfg
 >
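Reviewer note on the custom-batcher passage above: the function it refers to isn't part of this hunk's context, so here is one way such a filtering batcher could look. A sketch under assumptions: the `filtering_batch.v1` name is hypothetical, and it presumes the v3 API where `Example` lives in `spacy.training` and exposes the gold-standard `Doc` as `Example.reference`:

```python
from typing import Callable, Iterable, Iterator, List

import spacy
from spacy.training import Example


@spacy.registry.batchers("filtering_batch.v1")  # hypothetical name
def configure_filtering_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]:
    """Create a batcher that drops Examples whose raw text is already in the batch."""
    def filter_batch(examples: Iterable[Example]) -> Iterator[List[Example]]:
        batch: List[Example] = []
        seen = set()
        for eg in examples:
            # Skip duplicates based on the raw text of the reference Doc. A more
            # realistic implementation would also compare the annotations.
            if eg.reference.text in seen:
                continue
            seen.add(eg.reference.text)
            batch.append(eg)
            if len(batch) == size:
                yield batch
                batch, seen = [], set()
        if batch:
            yield batch

    return filter_batch
```

It could then be wired up in the config via `@batchers = "filtering_batch.v1"` in a `[training.batcher]` block.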
@@ -843,8 +839,8 @@ called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
 gold-standard annotations. It also includes the **alignment** between those two
 documents if they differ in tokenization. The `Example` class ensures that spaCy
-can rely on one **standardized format** that's passed through the pipeline.
-Here's an example of a simple `Example` for part-of-speech tags:
+can rely on one **standardized format** that's passed through the pipeline. For
+instance, let's say we want to define gold-standard part-of-speech tags:

 ```python
 words = ["I", "like", "stuff"]
@@ -856,9 +852,10 @@ reference = Doc(vocab, words=words).from_array("TAG", numpy.array(tag_ids, dtype
 example = Example(predicted, reference)
 ```

-Alternatively, the `reference` `Doc` with the gold-standard annotations can be
-created from a dictionary with keyword arguments specifying the annotations,
-like `tags` or `entities`. Using the `Example` object and its gold-standard
+As this is quite verbose, there's an alternative way to create the reference
+`Doc` with the gold-standard annotations. The function `Example.from_dict` takes
+a dictionary with keyword arguments specifying the annotations, like `tags` or
+`entities`. Using the resulting `Example` object and its gold-standard
 annotations, the model can be updated to learn a sentence of three words with
 their assigned part-of-speech tags.
@@ -883,7 +880,7 @@ example = Example.from_dict(predicted, {"tags": tags})

 Here's another example that shows how to define gold-standard named entities.
 The letters added before the labels refer to the tags of the
 [BILUO scheme](/usage/linguistic-features#updating-biluo) – `O` is a token
-outside an entity, `U` an single entity unit, `B` the beginning of an entity,
+outside an entity, `U` a single entity unit, `B` the beginning of an entity,
 `I` a token inside an entity and `L` the last token of an entity.

 ```python
@@ -958,7 +955,7 @@ dictionary of annotations:
 ```diff
 text = "Facebook released React in 2014"
 annotations = {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]}
-+ example = Example.from_dict(nlp.make_doc(text), {"entities": entities})
++ example = Example.from_dict(nlp.make_doc(text), annotations)
 - nlp.update([text], [annotations])
 + nlp.update([example])
 ```
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index b017dcdab..aea1d892c 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -249,7 +249,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
 | [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
-| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
+| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
 | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
 | [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
@@ -338,13 +338,13 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [training config](/usage/training#config).
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
-- **Custom pipeline components** now needs to be decorated with the
+- **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
 - [`Language.update`](/api/language#update) now takes a batch of
   [`Example`](/api/example) objects instead of raw texts and annotations, or
   `Doc` and `GoldParse` objects.
-- The `Language.disable_pipes` contextmanager has been replaced by
+- The `Language.disable_pipes` context manager has been replaced by
   [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
   disable or enable components.
 - The [`Language.update`](/api/language#update),
@@ -364,16 +364,16 @@ Note that spaCy v3.0 now requires **Python 3.6+**.

 ### Removed or renamed API {#incompat-removed}

-| Removed                                                   | Replacement                                                                                 |
-| --------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| `Language.disable_pipes`                                  | [`Language.select_pipes`](/api/language#select_pipes)                                       |
-| `GoldParse`                                               | [`Example`](/api/example)                                                                   |
-| `GoldCorpus`                                              | [`Corpus`](/api/corpus)                                                                     |
-| `KnowledgeBase.load_bulk` `KnowledgeBase.dump`            | [`KnowledgeBase.from_disk`](/api/kb#from_disk) [`KnowledgeBase.to_disk`](/api/kb#to_disk)   |
-| `spacy init-model`                                        | [`spacy init model`](/api/cli#init-model)                                                   |
-| `spacy debug-data`                                        | [`spacy debug data`](/api/cli#debug-data)                                                   |
-| `spacy profile`                                           | [`spacy debug profile`](/api/cli#debug-profile)                                             |
-| `spacy link`, `util.set_data_path`, `util.get_data_path`  | not needed, model symlinks are deprecated                                                   |
+| Removed                                                   | Replacement                                                                                  |
+| --------------------------------------------------------- | -------------------------------------------------------------------------------------------- |
+| `Language.disable_pipes`                                  | [`Language.select_pipes`](/api/language#select_pipes)                                        |
+| `GoldParse`                                               | [`Example`](/api/example)                                                                    |
+| `GoldCorpus`                                              | [`Corpus`](/api/corpus)                                                                      |
+| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`           | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)   |
+| `spacy init-model`                                        | [`spacy init model`](/api/cli#init-model)                                                    |
+| `spacy debug-data`                                        | [`spacy debug data`](/api/cli#debug-data)                                                    |
+| `spacy profile`                                           | [`spacy debug profile`](/api/cli#debug-profile)                                              |
+| `spacy link`, `util.set_data_path`, `util.get_data_path`  | not needed, model symlinks are deprecated                                                    |

 The following deprecated methods, attributes and arguments were removed in
 v3.0. Most of them have been **deprecated for a while** and many would previously
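Reviewer note on the removed/renamed table: in migration terms, the method-level renames map onto code roughly as follows. A hedged sketch in the docs' own `diff` style; `kb` stands for an existing `KnowledgeBase` instance and the path is a placeholder:

```diff
- nlp.disable_pipes("tagger", "parser")
+ nlp.select_pipes(disable=["tagger", "parser"])

- kb.dump("/path/to/kb")
- kb.load_bulk("/path/to/kb")
+ kb.to_disk("/path/to/kb")
+ kb.from_disk("/path/to/kb")
```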