mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Update docs [ci skip]
This commit is contained in:
parent
7f68f4bd92
commit
f2627157c8
|
@ -95,7 +95,7 @@ def init_labels_cli(
|
|||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
"""Generate a JSON file for labels in the data. This helps speed up the
|
||||
"""Generate JSON files for the labels in the data. This helps speed up the
|
||||
training process, since spaCy won't have to preprocess the data to
|
||||
extract the labels."""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
|
|
|
@ -268,6 +268,9 @@ class Tagger(Pipe):
|
|||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects..
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
labels: The labels to add to the component, typically generated by the
|
||||
`init labels` command. If no labels are provided, the get_examples
|
||||
callback is used to extract the labels from the data.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
||||
"""
|
||||
|
|
|
@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
|
|||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
labels: The labels to add to the component, typically generated by the
|
||||
`init labels` command. If no labels are provided, the get_examples
|
||||
callback is used to extract the labels from the data.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
||||
"""
|
||||
|
|
|
@ -46,7 +46,7 @@ def create_jsonl_reader(
|
|||
|
||||
|
||||
@util.registry.readers("spacy.read_labels.v1")
|
||||
def read_labels(path: Path, *, require: bool=False):
|
||||
def read_labels(path: Path, *, require: bool = False):
|
||||
# I decided not to give this a generic name, because I don't want people to
|
||||
# use it for arbitrary stuff, as I want this require arg with default False.
|
||||
if not require and not path.exists():
|
||||
|
|
|
@ -186,21 +186,53 @@ This functionality was previously available as part of the command `init-model`.
|
|||
</Infobox>
|
||||
|
||||
```cli
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||
| Name | Description |
|
||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||
|
||||
### init labels {#init-labels new="3" tag="command"}
|
||||
|
||||
Generate JSON files for the labels in the data. This helps speed up the training
|
||||
process, since spaCy won't have to preprocess the data to extract the labels.
|
||||
After generating the labels, you can provide them to components that accept a
|
||||
`labels` argument on initialization via the
|
||||
[`[initialize]`](/api/data-formats#config-initialize) block of your config.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json
|
||||
> ```
|
||||
|
||||
```cli
|
||||
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The final trained pipeline and the best trained pipeline. |
|
||||
|
||||
## convert {#convert tag="command"}
|
||||
|
||||
|
|
|
@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
|
|||
> data_path = "/path/to/component_data"
|
||||
> ```
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
|
||||
|
@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)
|
|||
|
||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||
|
||||
To populate a pipeline's vocabulary, you can use the
|
||||
[`spacy init vectors`](/api/cli#init-vectors) command and load in a
|
||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
||||
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
||||
language and vocabulary settings. All other lines are expected to be JSON
|
||||
objects describing an individual lexeme. The lexical attributes will be then set
|
||||
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
||||
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
|
||||
lexical data.
|
||||
This data file can be provided via the `vocab_data` setting in the
|
||||
`[initialize]` block of the training config to pre-define the lexical data to
|
||||
initialize the `nlp` object's vocabulary with. The file should contain one
|
||||
lexical entry per line. The first line defines the language and vocabulary
|
||||
settings. All other lines are expected to be JSON objects describing an
|
||||
individual lexeme. The lexical attributes will be then set as attributes on
|
||||
spaCy's [`Lexeme`](/api/lexeme#attributes) object.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
> vocab_data = "/path/to/vocab-data.jsonl"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### First line
|
||||
|
|
|
@ -21,8 +21,9 @@ non-projective parses.
|
|||
The parser is trained using an **imitation learning objective**. It follows the
|
||||
actions predicted by the current weights, and at each state, determines which
|
||||
actions are compatible with the optimal parse that could be reached from the
|
||||
current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
|
||||
that more than one action may be optimal for a given state.
|
||||
current state. The weights are updated such that the scores assigned to the set
|
||||
of optimal actions is increased, while scores assigned to other actions are
|
||||
decreased. Note that more than one action may be optimal for a given state.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## DependencyParser.initialize {#initialize tag="method"}
|
||||
## DependencyParser.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -162,12 +166,22 @@ This method was previously called `begin_training`.
|
|||
> parser = nlp.add_pipe("parser")
|
||||
> parser.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.parser]
|
||||
>
|
||||
> [initialize.components.parser.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/parser.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## DependencyParser.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
|
|||
Mark a span for merging. The `attrs` will be applied to the resulting token (if
|
||||
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
|
||||
underlying lexeme (if they're context-independent lexical attributes like
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
|
||||
using the `"_"` key and specifying a dictionary that maps attribute names to
|
||||
values.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityLinker.initialize {#initialize tag="method"}
|
||||
## EntityLinker.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
|
|
@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
|
|||
|
||||
| Setting | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
|
@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityRecognizer.initialize {#initialize tag="method"}
|
||||
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -152,12 +155,22 @@ This method was previously called `begin_training`.
|
|||
> ner = nlp.add_pipe("ner")
|
||||
> ner.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## EntityRecognizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
|
|||
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
||||
|
||||
## Language.initialize {#initialize tag="method"}
|
||||
## Language.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the pipeline for training and return an
|
||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
|
||||
|
|
|
@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
|
|||
> morphologizer = nlp.add_pipe("morphologizer")
|
||||
> morphologizer.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.morphologizer]
|
||||
>
|
||||
> [initialize.components.morphologizer.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/morphologizer.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## Morphologizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Pipe.initialize {#initialize tag="method"}
|
||||
## Pipe.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
|
|
@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Tagger.initialize {#initialize tag="method"}
|
||||
## Tagger.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -135,12 +138,22 @@ This method was previously called `begin_training`.
|
|||
> tagger = nlp.add_pipe("tagger")
|
||||
> tagger.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.tagger]
|
||||
>
|
||||
> [initialize.components.tagger.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/tagger.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
|
||||
|
||||
## Tagger.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## TextCategorizer.initialize {#initialize tag="method"}
|
||||
## TextCategorizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -148,12 +151,22 @@ This method was previously called `begin_training`.
|
|||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.textcat]
|
||||
>
|
||||
> [initialize.components.textcat.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/textcat.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## TextCategorizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
|||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||
| **CREATES** | The corpus reader. ~~JsonlTexts~~ |
|
||||
|
||||
### spacy.read_labels.v1 {#read_labels tag="registered function"}
|
||||
|
||||
Read a JSON-formatted labels file generated with
|
||||
[`init labels`](/api/cli#init-labels). Typically used in the
|
||||
[`[initialize]`](/api/data-formats#config-initialize) block of the training
|
||||
config to speed up the model initialization process and provide pre-generated
|
||||
label sets.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.components]
|
||||
>
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
|
||||
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
|
||||
| **CREATES** | The |
|
||||
|
||||
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
|
||||
|
||||
A data batcher implements a batching strategy that essentially turns a stream of
|
||||
|
|
|
@ -204,7 +204,19 @@ initialize it.
|
|||
|
||||
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
|
||||
|
||||
<!-- TODO: explain lifecycle and initialization -->
|
||||
At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
|
||||
config and load all data, including tokenization rules, model weights and other
|
||||
resources from the pipeline directory. The `[training]` block contains the
|
||||
settings for training the model and is only used during training. Similarly, the
|
||||
`[initialize]` block defines how the initial `nlp` object should be set up
|
||||
before training and whether it should be initialized with vectors or pretrained
|
||||
tok2vec weights, or any other data needed by the components.
|
||||
|
||||
The initialization settings are only loaded and used when
|
||||
[`nlp.initialize`](/api/language#initialize) is called (typically right before
|
||||
training). This allows you to set up your pipeline using local data resources
|
||||
and custom functions, and preserve the information in your config – but without
|
||||
requiring it to be available at runtime
|
||||
|
||||
### Overwriting config settings on the command line {#config-overrides}
|
||||
|
||||
|
@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|||
return create_model(output_width)
|
||||
```
|
||||
|
||||
<!-- TODO:
|
||||
### Customizing the initialization {#initialization}
|
||||
-->
|
||||
|
||||
## Data utilities {#data}
|
||||
|
||||
spaCy includes various features and utilities to make it easy to train models
|
||||
|
@ -853,7 +869,7 @@ nlp = spacy.blank("en")
|
|||
docbin = DocBin(nlp.vocab)
|
||||
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
|
||||
spaces = [True, True, True, True, True, True, True, False]
|
||||
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
|
||||
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
|
||||
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk("./train.spacy")
|
||||
|
|
Loading…
Reference in New Issue
Block a user