From f2627157c85aeda99969df8b1bf9730539d43c1f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 1 Oct 2020 17:38:17 +0200 Subject: [PATCH] Update docs [ci skip] --- spacy/cli/init_pipeline.py | 2 +- spacy/pipeline/tagger.pyx | 3 ++ spacy/pipeline/textcat.py | 3 ++ spacy/training/corpus.py | 2 +- website/docs/api/cli.md | 58 +++++++++++++++++++++------- website/docs/api/data-formats.md | 25 ++++++------ website/docs/api/dependencyparser.md | 32 ++++++++++----- website/docs/api/doc.md | 34 ++++++++-------- website/docs/api/entitylinker.md | 2 +- website/docs/api/entityrecognizer.md | 29 ++++++++++---- website/docs/api/language.md | 2 +- website/docs/api/morphologizer.md | 25 +++++++++--- website/docs/api/pipe.md | 2 +- website/docs/api/tagger.md | 27 +++++++++---- website/docs/api/textcategorizer.md | 27 +++++++++---- website/docs/api/top-level.md | 26 +++++++++++++ website/docs/usage/training.md | 20 +++++++++- 17 files changed, 235 insertions(+), 84 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 62d9096d9..1c0233539 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -95,7 +95,7 @@ def init_labels_cli( use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): - """Generate a JSON file for labels in the data. This helps speed up the + """Generate JSON files for the labels in the data. This helps speed up the training process, since spaCy won't have to preprocess the data to extract the labels.""" util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a4f9d395f..37ad42b88 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -268,6 +268,9 @@ class Tagger(Pipe): get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects.. nlp (Language): The current nlp object the component is part of. 
+ labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. DOCS: https://nightly.spacy.io/api/tagger#initialize """ diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index c5b8b615b..a092d960f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -355,6 +355,9 @@ class TextCategorizer(Pipe): get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. + labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 9d8e4ff5c..57787cf76 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -46,7 +46,7 @@ def create_jsonl_reader( @util.registry.readers("spacy.read_labels.v1") -def read_labels(path: Path, *, require: bool=False): +def read_labels(path: Path, *, require: bool = False): # I decided not to give this a generic name, because I don't want people to # use it for arbitrary stuff, as I want this require arg with default False. if not require and not path.exists(): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 347ce1683..436582780 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -186,21 +186,53 @@ This functionality was previously available as part of the command `init-model`. 
```cli -$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose] +$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | -| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | -| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | -| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ | -| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | -| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | -| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | -| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. 
| +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | +| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | +| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | +| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | +| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | +| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | +| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | + +### init labels {#init-labels new="3" tag="command"} + +Generate JSON files for the labels in the data. This helps speed up the training +process, since spaCy won't have to preprocess the data to extract the labels. +After generating the labels, you can provide them to components that accept a +`labels` argument on initialization via the +[`[initialize]`](/api/data-formats#config-initialize) block of your config. 
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | The label files, one JSON file per component. |
 
 ## convert {#convert tag="command"}
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 825d95def..22a0076cd 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in. 
> data_path = "/path/to/component_data" > ``` - - | Name | Description | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | @@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict) ## Lexical data for vocabulary {#vocab-jsonl new="2"} -To populate a pipeline's vocabulary, you can use the -[`spacy init vectors`](/api/cli#init-vectors) command and load in a -[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one -lexical entry per line via the `--jsonl-loc` option. The first line defines the -language and vocabulary settings. All other lines are expected to be JSON -objects describing an individual lexeme. The lexical attributes will be then set -as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab` -command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the -lexical data. +This data file can be provided via the `vocab_data` setting in the +`[initialize]` block of the training config to pre-define the lexical data to +initialize the `nlp` object's vocabulary with. The file should contain one +lexical entry per line. The first line defines the language and vocabulary +settings. All other lines are expected to be JSON objects describing an +individual lexeme. 
The lexical attributes will be then set as attributes on +spaCy's [`Lexeme`](/api/lexeme#attributes) object. + +> #### Example config +> +> ```ini +> [initialize] +> vocab_data = "/path/to/vocab-data.jsonl" +> ``` ```python ### First line diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index ad627b006..ea4b779c7 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -21,8 +21,9 @@ non-projective parses. The parser is trained using an **imitation learning objective**. It follows the actions predicted by the current weights, and at each state, determines which actions are compatible with the optimal parse that could be reached from the -current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note -that more than one action may be optimal for a given state. +current state. The weights are updated such that the scores assigned to the set +of optimal actions is increased, while scores assigned to other actions are +decreased. Note that more than one action may be optimal for a given state. ## Config and implementation {#config} @@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## DependencyParser.initialize {#initialize tag="method"} +## DependencyParser.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. 
This method is typically called -by [`Language.initialize`](/api/language#initialize). +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. @@ -162,12 +166,22 @@ This method was previously called `begin_training`. > parser = nlp.add_pipe("parser") > parser.initialize(lambda: [], nlp=nlp) > ``` +> +> ```ini +> ### config.cfg +> [initialize.components.parser] +> +> [initialize.components.parser.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/parser.json +> ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. 
~~Optional[dict]~~ | ## DependencyParser.predict {#predict tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 43d968c3a..d511dc889 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | -| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. 
~~Optional[List[int]]~~ | -| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | -| `ents` 3 | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | +| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | +| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | +| `ents` 3 | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work. Mark a span for merging. The `attrs` will be applied to the resulting token (if they're context-dependent token attributes like `LEMMA` or `DEP`) or to the underlying lexeme (if they're context-independent lexical attributes like -`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values. +`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided +using the `"_"` key and specifying a dictionary that maps attribute names to +values. > #### Example > diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b3c3f20f5..169a175e2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. 
~~Doc~~ | -## EntityLinker.initialize {#initialize tag="method"} +## EntityLinker.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 65dcfc17c..5fbd0b229 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | @@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. 
~~Doc~~ | -## EntityRecognizer.initialize {#initialize tag="method"} +## EntityRecognizer.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called -by [`Language.initialize`](/api/language#initialize). +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. @@ -152,12 +155,22 @@ This method was previously called `begin_training`. > ner = nlp.add_pipe("ner") > ner.initialize(lambda: [], nlp=nlp) > ``` +> +> ```ini +> ### config.cfg +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json +> ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## EntityRecognizer.predict {#predict tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d8d3b3edc..9f0612b2b 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -202,7 +202,7 @@ more efficient than processing texts one-by-one. | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | -## Language.initialize {#initialize tag="method"} +## Language.initialize {#initialize tag="method" new="3"} Initialize the pipeline for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 68e096ab7..50e2bb33a 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -126,7 +126,10 @@ training data or a representative sample. 
Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called -by [`Language.initialize`](/api/language#initialize). +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. > #### Example > @@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize). > morphologizer = nlp.add_pipe("morphologizer") > morphologizer.initialize(lambda: [], nlp=nlp) > ``` +> +> ```ini +> ### config.cfg +> [initialize.components.morphologizer] +> +> [initialize.components.morphologizer.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/morphologizer.json +> ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. 
To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## Morphologizer.predict {#predict tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 385ad7ec9..4f5ac6f61 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Pipe.initialize {#initialize tag="method"} +## Pipe.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index ff9763e61..d7c56be67 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tagger.initialize {#initialize tag="method"} +## Tagger.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called -by [`Language.initialize`](/api/language#initialize). 
+by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. @@ -135,12 +138,22 @@ This method was previously called `begin_training`. > tagger = nlp.add_pipe("tagger") > tagger.initialize(lambda: [], nlp=nlp) > ``` +> +> ```ini +> ### config.cfg +> [initialize.components.tagger] +> +> [initialize.components.tagger.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/tagger.json +> ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. 
~~Optional[list]~~ | ## Tagger.predict {#predict tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index f279189f6..dd8c81040 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## TextCategorizer.initialize {#initialize tag="method"} +## TextCategorizer.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called -by [`Language.initialize`](/api/language#initialize). +by [`Language.initialize`](/api/language#initialize) and lets you customize arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the config. @@ -148,12 +151,22 @@ This method was previously called `begin_training`. > textcat = nlp.add_pipe("textcat") > textcat.initialize(lambda: [], nlp=nlp) > ``` +> +> ```ini +> ### config.cfg +> [initialize.components.textcat] +> +> [initialize.components.textcat.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/textcat.json" +> ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## TextCategorizer.predict {#predict tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index c16983c78..68d7a3039 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | **CREATES** | The corpus reader. ~~JsonlTexts~~ | +### spacy.read_labels.v1 {#read_labels tag="registered function"} + +Read a JSON-formatted labels file generated with +[`init labels`](/api/cli#init-labels). Typically used in the +[`[initialize]`](/api/data-formats#config-initialize) block of the training +config to speed up the model initialization process and provide pre-generated +label sets. 
+ +> #### Example config +> +> ```ini +> [initialize.components] +> +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ | +| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ | +| **CREATES** | The labels. ~~Any~~ | + ## Batchers {#batchers source="spacy/training/batchers.py" new="3"} A data batcher implements a batching strategy that essentially turns a stream of diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 51aa82618..c6c05ac5b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -204,7 +204,19 @@ initialize it. ![Illustration of pipeline lifecycle](../images/lifecycle.svg) - +At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the config and load all data, including tokenization rules, model weights and other resources from the pipeline directory. The `[training]` block contains the settings for training the model and is only used during training. Similarly, the `[initialize]` block defines how the initial `nlp` object should be set up before training and whether it should be initialized with vectors or pretrained tok2vec weights, or any other data needed by the components. + +The initialization settings are only loaded and used when +[`nlp.initialize`](/api/language#initialize) is called (typically right before training). 
This allows you to set up your pipeline using local data resources +and custom functions, and preserve the information in your config – but without +requiring it to be available at runtime. ### Overwriting config settings on the command line {#config-overrides} @@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: return create_model(output_width) ``` + + ## Data utilities {#data} spaCy includes various features and utilities to make it easy to train models @@ -853,7 +869,7 @@ nlp = spacy.blank("en") docbin = DocBin(nlp.vocab) words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."] spaces = [True, True, True, True, True, True, True, False] -ents = [("ORG", 0, 1), ("GPE", 5, 6)] +ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"] doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents) docbin.add(doc) docbin.to_disk("./train.spacy")