Update docs [ci skip]

commit f2627157c8 (parent 7f68f4bd92)
````diff
@@ -95,7 +95,7 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    """Generate a JSON file for labels in the data. This helps speed up the
+    """Generate JSON files for the labels in the data. This helps speed up the
     training process, since spaCy won't have to preprocess the data to
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
````
````diff
@@ -268,6 +268,9 @@ class Tagger(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
````
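For context, a minimal sketch of what the new `labels` argument to `initialize` enables: passing a pre-computed label set so the component doesn't have to scan the corpus for it. The sentence, tags and label list below are invented for illustration.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# A toy example so the model has a sample doc to infer shapes from
doc = nlp.make_doc("I saw cats")
example = Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})

# With labels passed up front, initialize doesn't need to extract
# them from the training data
tagger.initialize(lambda: [example], nlp=nlp, labels=["PRON", "VERB", "NOUN"])
print(tagger.labels)  # ('PRON', 'VERB', 'NOUN')
```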
````diff
@@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
````
````diff
@@ -46,7 +46,7 @@ def create_jsonl_reader(


 @util.registry.readers("spacy.read_labels.v1")
-def read_labels(path: Path, *, require: bool=False):
+def read_labels(path: Path, *, require: bool = False):
     # I decided not to give this a generic name, because I don't want people to
     # use it for arbitrary stuff, as I want this require arg with default False.
     if not require and not path.exists():
````
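The full body of `read_labels` isn't part of this hunk. A plausible reconstruction of the behavior described — return `None` for an optional, missing file, otherwise load the JSON — might look like this; the use of `srsly.read_json` beyond the visible lines is an assumption:

```python
from pathlib import Path

import srsly

def read_labels(path: Path, *, require: bool = False):
    # Optional file that doesn't exist: return None so the caller
    # falls back to extracting labels from the training data
    if not require and not path.exists():
        return None
    # Assumption: the labels file is plain JSON, as written by `init labels`
    return srsly.read_json(path)

labels = read_labels(Path("corpus/labels/ner.json"))
```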
````diff
@@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.

 </Infobox>

 ```cli
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
 ```

 | Name | Description |
-| ----------------------- | ---------------------------------------- |
+| ------------------ | ---------------------------------------- |
 | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
 | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
-| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
 | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
 | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
````
````diff
@@ -202,6 +201,39 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |

+### init labels {#init-labels new="3" tag="command"}
+
+Generate JSON files for the labels in the data. This helps speed up the training
+process, since spaCy won't have to preprocess the data to extract the labels.
+After generating the labels, you can provide them to components that accept a
+`labels` argument on initialization via the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name | Description |
+| ----------------- | ---------------------------------------- |
+| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES** | The label files, one JSON file per component. |

 ## convert {#convert tag="command"}

 Convert files into spaCy's
````
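Since the generated label files are plain JSON, a quick way to sanity-check the output of `init labels` is to load one directly — a sketch, assuming the one-file-per-component layout described above:

```python
import srsly

# One JSON file per component, e.g. corpus/labels/ner.json
ner_labels = srsly.read_json("corpus/labels/ner.json")
print(ner_labels)  # e.g. ["GPE", "ORG", "PERSON"]
```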
````diff
@@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
 > data_path = "/path/to/component_data"
 > ```

-<!-- TODO: -->
-
 | Name | Description |
 | -------------- | ---------------------------------------- |
 | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
````
````diff
@@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)

 ## Lexical data for vocabulary {#vocab-jsonl new="2"}

-To populate a pipeline's vocabulary, you can use the
-[`spacy init vectors`](/api/cli#init-vectors) command and load in a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
-lexical entry per line via the `--jsonl-loc` option. The first line defines the
-language and vocabulary settings. All other lines are expected to be JSON
-objects describing an individual lexeme. The lexical attributes will be then set
-as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
-command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
-lexical data.
+This data file can be provided via the `vocab_data` setting in the
+`[initialize]` block of the training config to pre-define the lexical data to
+initialize the `nlp` object's vocabulary with. The file should contain one
+lexical entry per line. The first line defines the language and vocabulary
+settings. All other lines are expected to be JSON objects describing an
+individual lexeme. The lexical attributes will then be set as attributes on
+spaCy's [`Lexeme`](/api/lexeme#attributes) object.
+
+> #### Example config
+>
+> ```ini
+> [initialize]
+> vocab_data = "/path/to/vocab-data.jsonl"
+> ```

 ```python
 ### First line
````
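For context, a minimal sketch of producing such a JSONL file from Python. The attribute values are invented for illustration, and `srsly.write_jsonl` is just one convenient way to write the file:

```python
import srsly

lines = [
    # First line: language and vocabulary settings
    {"lang": "en", "settings": {"oov_prob": -20.5}},
    # Every other line: a JSON object describing one lexeme
    {"orth": "the", "prob": -3.5, "is_stop": True},
    {"orth": "spaCy", "prob": -12.0, "is_stop": False},
]
srsly.write_jsonl("vocab-data.jsonl", lines)
```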
````diff
@@ -21,8 +21,9 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
-that more than one action may be optimal for a given state.
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.

 ## Config and implementation {#config}

````
````diff
@@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## DependencyParser.initialize {#initialize tag="method"}
+## DependencyParser.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

````
````diff
@@ -162,12 +166,22 @@ This method was previously called `begin_training`.
 > parser = nlp.add_pipe("parser")
 > parser.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.parser]
+>
+> [initialize.components.parser.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/parser.json"
+> ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## DependencyParser.predict {#predict tag="method"}

````
````diff
@@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```

 | Name | Description |
-| ---------------------------------------- | ------------------------------------------------------------ |
+| ---------------------------------------- | -------------------------------------------------- |
 | `vocab` | A storage container for lexical types. ~~Vocab~~ |
 | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
 | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
````
````diff
@@ -45,7 +45,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
-| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
+| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}

````
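The `ents` change above means entity annotations are now passed as one IOB tag per token instead of `(label, start, end)` tuples. A small sketch of the new form, with an invented sentence:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["Apple", "opened", "an", "office", "in", "London"]
spaces = [True, True, True, True, True, False]
# One tag per token: B- begins an entity, I- continues it, O is outside
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
print([(ent.text, ent.label_) for ent in doc.ents])
# [('Apple', 'ORG'), ('London', 'GPE')]
```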
````diff
@@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
+using the `"_"` key and specifying a dictionary that maps attribute names to
+values.

 > #### Example
 >
````
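For the custom extension attributes mentioned in the rewrapped paragraph, a minimal sketch — the extension name `merged` is invented for illustration:

```python
import spacy
from spacy.tokens import Token

Token.set_extension("merged", default=False)

nlp = spacy.blank("en")
doc = nlp("New York is a city")
with doc.retokenize() as retokenizer:
    # Built-in attributes and custom extensions (under "_") for the new token
    retokenizer.merge(doc[0:2], attrs={"LEMMA": "New York", "_": {"merged": True}})
print(doc[0].text, doc[0]._.merged)  # New York True
```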
````diff
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityLinker.initialize {#initialize tag="method"}
+## EntityLinker.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityRecognizer.initialize {#initialize tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

````
````diff
@@ -152,12 +155,22 @@ This method was previously called `begin_training`.
 > ner = nlp.add_pipe("ner")
 > ner.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## EntityRecognizer.predict {#predict tag="method"}

````
````diff
@@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
 | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |

-## Language.initialize {#initialize tag="method"}
+## Language.initialize {#initialize tag="method" new="3"}

 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
````
````diff
@@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 > #### Example
 >
````
````diff
@@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
 > morphologizer = nlp.add_pipe("morphologizer")
 > morphologizer.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.morphologizer]
+>
+> [initialize.components.morphologizer.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/morphologizer.json"
+> ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## Morphologizer.predict {#predict tag="method"}

````
````diff
@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Pipe.initialize {#initialize tag="method"}
+## Pipe.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Tagger.initialize {#initialize tag="method"}
+## Tagger.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

````
````diff
@@ -135,12 +138,22 @@ This method was previously called `begin_training`.
 > tagger = nlp.add_pipe("tagger")
 > tagger.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.tagger]
+>
+> [initialize.components.tagger.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/tagger.json"
+> ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |

 ## Tagger.predict {#predict tag="method"}

````
````diff
@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS** | The processed documents in order. ~~Doc~~ |

-## TextCategorizer.initialize {#initialize tag="method"}
+## TextCategorizer.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
````
````diff
@@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

````
````diff
@@ -148,12 +151,22 @@ This method was previously called `begin_training`.
 > textcat = nlp.add_pipe("textcat")
 > textcat.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.textcat]
+>
+> [initialize.components.textcat.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/textcat.json"
+> ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## TextCategorizer.predict {#predict tag="method"}

````
````diff
@@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
 | **CREATES** | The corpus reader. ~~JsonlTexts~~ |

+### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------------------------------ |
+| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
+| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels read from the file. |

 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}

 A data batcher implements a batching strategy that essentially turns a stream of
````
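Because it's a registered function, the reader can also be resolved and called directly from Python — a sketch, assuming the labels file already exists on disk:

```python
from pathlib import Path

from spacy.util import registry

# Look up the registered reader by name and call it like a plain function
read_labels = registry.readers.get("spacy.read_labels.v1")
labels = read_labels(Path("corpus/labels/ner.json"), require=True)
print(labels)
```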
````diff
@@ -204,7 +204,19 @@ initialize it.

 

-<!-- TODO: explain lifecycle and initialization -->
+At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
+config and load all data, including tokenization rules, model weights and other
+resources from the pipeline directory. The `[training]` block contains the
+settings for training the model and is only used during training. Similarly, the
+`[initialize]` block defines how the initial `nlp` object should be set up
+before training and whether it should be initialized with vectors or pretrained
+tok2vec weights, or any other data needed by the components.
+
+The initialization settings are only loaded and used when
+[`nlp.initialize`](/api/language#initialize) is called (typically right before
+training). This allows you to set up your pipeline using local data resources
+and custom functions, and preserve the information in your config – but without
+requiring it to be available at runtime.

 ### Overwriting config settings on the command line {#config-overrides}

````
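The lifecycle described above, in its simplest programmatic form: `nlp.initialize` runs once before the training loop and is not needed at runtime. A toy sketch with invented example data:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat")

train_examples = [
    Example.from_dict(nlp.make_doc("so good"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
    Example.from_dict(nlp.make_doc("so bad"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
]

# Runs the [initialize] logic: label schemes, shape inference, optimizer
optimizer = nlp.initialize(lambda: train_examples)

for example in train_examples:
    nlp.update([example], sgd=optimizer)
```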
|
````diff
@@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```

+<!-- TODO:
+### Customizing the initialization {#initialization}
+-->
+
 ## Data utilities {#data}

 spaCy includes various features and utilities to make it easy to train models
````
````diff
@@ -853,7 +869,7 @@ nlp = spacy.blank("en")
 docbin = DocBin(nlp.vocab)
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
-ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
 doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
 docbin.add(doc)
 docbin.to_disk("./train.spacy")
````
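To verify the annotations round-trip, the `DocBin` can be loaded back and the entities inspected — a short sketch:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docbin = DocBin().from_disk("./train.spacy")
for doc in docbin.get_docs(nlp.vocab):
    print([(ent.text, ent.label_) for ent in doc.ents])
    # [('Apple', 'ORG'), ('U.K.', 'GPE')]
```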