Merge branch 'nightly.spacy.io' into develop
|
@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library
|
||||||
source: spacy/ml/models
|
source: spacy/ml/models
|
||||||
---
|
---
|
||||||
|
|
||||||
TODO: write
|
TODO: intro and how architectures work, link to
|
||||||
|
[`registry`](/api/top-level#registry),
|
||||||
|
[custom models](/usage/training#custom-models) usage etc.
|
||||||
|
|
||||||
|
## Parser architectures {source="spacy/ml/models/parser.py"}
|
||||||
|
|
||||||
|
### spacy.TransitionBasedParser.v1
|
||||||
|
|
||||||
|
<!-- TODO: intro -->
|
||||||
|
|
||||||
|
> #### Example Config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model]
|
||||||
|
> @architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
> nr_feature_tokens = 6
|
||||||
|
> hidden_width = 64
|
||||||
|
> maxout_pieces = 2
|
||||||
|
>
|
||||||
|
> [model.tok2vec]
|
||||||
|
> # ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------------------- | ------------------------------------------ | ----------- |
|
||||||
|
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||||
|
| `nr_feature_tokens` | int | |
|
||||||
|
| `hidden_width` | int | |
|
||||||
|
| `maxout_pieces` | int | |
|
||||||
|
| `use_upper` | bool | |
|
||||||
|
| `nO` | int | |
|
||||||
|
|
|
@ -297,60 +297,41 @@ will not be available.
|
||||||
|
|
||||||
## Train {#train}
|
## Train {#train}
|
||||||
|
|
||||||
<!-- TODO: document new training -->
|
|
||||||
|
|
||||||
Train a model. Expects data in spaCy's
|
Train a model. Expects data in spaCy's
|
||||||
[JSON format](/api/data-formats#json-input). On each epoch, a model will be
|
[binary format](/api/data-formats#training) and a
|
||||||
saved out to the directory. Accuracy scores and model details will be added to a
|
[config file](/api/data-formats#config) with all settings and hyperparameters.
|
||||||
[`meta.json`](/usage/training#models-generating) to allow packaging the model
|
Will save out the best model from all epochs, as well as the final model. The
|
||||||
using the [`package`](/api/cli#package) command.
|
`--code` argument can be used to provide a Python file that's imported before
|
||||||
|
the training process starts. This lets you register
|
||||||
|
[custom functions](/usage/training#custom-models) and architectures and refer to
|
||||||
|
them in your config, all while still using spaCy's built-in `train` workflow. If
|
||||||
|
you need to manage complex multi-step training workflows, check out the new
|
||||||
|
[spaCy projects](/usage/projects).
|
||||||
|
|
||||||
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
|
As of spaCy v3.0, the `train` command doesn't take a long list of command-line
|
||||||
|
arguments anymore and instead expects a single
|
||||||
|
[`config.cfg` file](/usage/training#config) containing all settings for the
|
||||||
|
pipeline, training process and hyperparameters.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy train [lang] [output_path] [train_path] [dev_path]
|
$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
|
||||||
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
|
[--code] [--verbose]
|
||||||
[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
|
|
||||||
[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
|
|
||||||
[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
|
|
||||||
[--textcat-positive-label] [--verbose]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | positional | Model language. |
|
| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
|
||||||
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
|
| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
|
||||||
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
|
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||||
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
|
| `--output`, `-o` | option | Directory to store model in. Will be created if it doesn't exist. |
|
||||||
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
|
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||||
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
|
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
||||||
| `--replace-components`, `-R` | flag | Replace components from the base model. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| `--vectors`, `-v` | option | Model to load vectors from. |
|
| **CREATES** | model | The final model and the best model. |
|
||||||
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
|
|
||||||
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
|
|
||||||
| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
|
|
||||||
| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). |
|
|
||||||
| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
|
|
||||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
|
|
||||||
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
|
|
||||||
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
|
|
||||||
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
|
|
||||||
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
|
|
||||||
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
|
|
||||||
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
|
|
||||||
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
|
|
||||||
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
|
|
||||||
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
|
|
||||||
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
|
|
||||||
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
|
|
||||||
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
|
|
||||||
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
|
|
||||||
| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
|
|
||||||
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
|
|
||||||
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
|
|
||||||
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
|
|
||||||
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
|
|
||||||
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
|
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
|
||||||
| **CREATES** | model, pickle | A spaCy model on each epoch. |
|
|
||||||
|
|
||||||
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
||||||
|
|
||||||
|
@ -507,12 +488,13 @@ so you don't have to run `python setup.py sdist` separately anymore.
|
||||||
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
|
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
> #### Example
|
||||||
### Example
|
>
|
||||||
python -m spacy package /input /output
|
> ```bash
|
||||||
cd /output/en_model-0.0.0
|
> python -m spacy package /input /output
|
||||||
pip install dist/en_model-0.0.0.tar.gz
|
> cd /output/en_model-0.0.0
|
||||||
```
|
> pip install dist/en_model-0.0.0.tar.gz
|
||||||
|
> ```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
@ -525,18 +507,143 @@ pip install dist/en_model-0.0.0.tar.gz
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| **CREATES** | directory | A Python package containing the spaCy model. |
|
| **CREATES** | directory | A Python package containing the spaCy model. |
|
||||||
|
|
||||||
## Project {#project}
|
## Project {#project new="3"}
|
||||||
|
|
||||||
<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design -->
|
<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design -->
|
||||||
|
|
||||||
|
The `spacy project` CLI includes subcommands for working with
|
||||||
|
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
||||||
|
deploying custom spaCy models.
|
||||||
|
|
||||||
### project clone {#project-clone}
|
### project clone {#project-clone}
|
||||||
|
|
||||||
|
Clone a project template from a Git repository. Calls into `git` under the hood
|
||||||
|
and uses the sparse checkout feature, so you're only downloading what you need.
|
||||||
|
By default, spaCy's
|
||||||
|
[project templates repo](https://github.com/explosion/projects) is used, but you
|
||||||
|
can provide any other repo (public or private) that you have access to using the
|
||||||
|
`--repo` option.
|
||||||
|
|
||||||
|
<!-- TODO: update example once we've decided on repo structure -->
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project clone [name] [dest] [--repo]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project clone some_example
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> Clone from custom repo:
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
|
||||||
|
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
|
||||||
|
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
|
||||||
|
|
||||||
### project assets {#project-assets}
|
### project assets {#project-assets}
|
||||||
|
|
||||||
### project run-all {#project-run-all}
|
Fetch project assets like datasets and pretrained weights. Assets are defined in
|
||||||
|
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
|
||||||
|
`checksum` is provided, the file is only downloaded if no local file with the
|
||||||
|
same checksum exists and spaCy will show an error if the checksum of the
|
||||||
|
downloaded file doesn't match. If assets don't specify a `url`, they're
|
||||||
|
considered "private" and you have to take care of putting them into the
|
||||||
|
destination directory yourself. If a local path is provided, the asset is copied
|
||||||
|
into the current project.
|
||||||
|
|
||||||
|
<!-- TODO: update example once we've decided on repo structure -->
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project assets [project_dir]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project assets
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------------- | ---------- | ----------------------------------------------------------------- |
|
||||||
|
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
|
||||||
|
|
||||||
### project run {#project-run}
|
### project run {#project-run}
|
||||||
|
|
||||||
### project init {#project-init}
|
Run a named command or workflow defined in the
|
||||||
|
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
|
||||||
|
all commands in the workflow are run, in order. If commands define
|
||||||
|
[dependencies or outputs](/usage/projects#deps-outputs), they will only be
|
||||||
|
re-run if state has changed. For example, if the input dataset changes, a
|
||||||
|
preprocessing command that depends on those files will be re-run.
|
||||||
|
|
||||||
### project update-dvc {#project-update-dvc}
|
<!-- TODO: update example once we've decided on repo structure -->
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project run train
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| --------------- | ---------- | ----------------------------------------------------------------- |
|
||||||
|
| `subcommand` | positional | Name of the command or workflow to run. |
|
||||||
|
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||||
|
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
|
||||||
|
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
|
||||||
|
### project dvc {#project-dvc}
|
||||||
|
|
||||||
|
Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
|
||||||
|
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
|
||||||
|
the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
|
||||||
|
so you need to specify one workflow defined in the
|
||||||
|
[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
|
||||||
|
first defined workflow is used. The DVC config will only be updated if the
|
||||||
|
`project.yml` changed. For details, see the
|
||||||
|
[DVC integration](/usage/projects#dvc) docs.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
This command requires DVC to be installed and initialized in the project
|
||||||
|
directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
|
||||||
|
You'll also need to add the assets you want to track with
|
||||||
|
[`dvc add`](https://dvc.org/doc/command-reference/add).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> git init
|
||||||
|
> dvc init
|
||||||
|
> python -m spacy project dvc all
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ----------------- | ---------- | --------------------------------------------------------------------------------- |
|
||||||
|
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||||
|
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
|
||||||
|
| `--force`, `-F` | flag | Force-updating config file. |
|
||||||
|
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
|
|
@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
|
||||||
follow — only to succumb themselves. In short, just say no to optimizing your
|
follow — only to succumb themselves. In short, just say no to optimizing your
|
||||||
Python. If it's not fast enough the first time, just switch to Cython.
|
Python. If it's not fast enough the first time, just switch to Cython.
|
||||||
|
|
||||||
<Infobox title="📖 Resources">
|
<Infobox title="Resources" emoji="📖">
|
||||||
|
|
||||||
- [Official Cython documentation](http://docs.cython.org/en/latest/)
|
- [Official Cython documentation](http://docs.cython.org/en/latest/)
|
||||||
(cython.org)
|
(cython.org)
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
title: Data formats
|
title: Data formats
|
||||||
teaser: Details on spaCy's input and output data formats
|
teaser: Details on spaCy's input and output data formats
|
||||||
menu:
|
menu:
|
||||||
- ['Training data', 'training']
|
- ['Training Data', 'training']
|
||||||
|
- ['Training Config', 'config']
|
||||||
- ['Vocabulary', 'vocab']
|
- ['Vocabulary', 'vocab']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
|
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Training config {#config new="3"}
|
||||||
|
|
||||||
|
Config files define the training process and model pipeline and can be passed to
|
||||||
|
[`spacy train`](/api/cli#train). They use
|
||||||
|
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||||
|
hood. For details on how to use training configs, see the
|
||||||
|
[usage documentation](/usage/training#config).
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
The `@` syntax lets you refer to function names registered in the
|
||||||
|
[function registry](/api/top-level#registry). For example,
|
||||||
|
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
|
||||||
|
the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
|
||||||
|
will be passed into that function as arguments. Those arguments depend on the
|
||||||
|
registered function. See the [model architectures](/api/architectures) docs for
|
||||||
|
API details.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
|
||||||
|
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
|
||||||
|
|
||||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||||
|
|
||||||
To populate a model's vocabulary, you can use the
|
To populate a model's vocabulary, you can use the
|
||||||
|
|
|
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"parser"`.
|
via the ID `"parser"`.
|
||||||
|
|
||||||
## DependencyParser.Model {#model tag="classmethod"}
|
## Default config {#config}
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
This is the default configuration used to initialize the model powering the
|
||||||
`thinc.neural.Model` API. Wrappers are under development for most major machine
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
learning libraries.
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
| Name | Type | Description |
|
```python
|
||||||
| ----------- | ------ | ------------------------------------- |
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
```
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
|
||||||
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> # Construction via create_pipe with default model
|
||||||
|
> parser = nlp.create_pipe("parser")
|
||||||
|
>
|
||||||
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_parser"}}
|
||||||
|
> parser = nlp.create_pipe("parser", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
|
> from spacy.pipeline import DependencyParser
|
||||||
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
|
> parser = DependencyParser(nlp.vocab, model)
|
||||||
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
|
|
||||||
> #### Example
|
| Name | Type | Description |
|
||||||
>
|
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
|
||||||
> ```python
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
> # Construction via create_pipe
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
> parser = nlp.create_pipe("parser")
|
| `**cfg` | - | Configuration parameters. |
|
||||||
>
|
| **RETURNS** | `DependencyParser` | The newly constructed object. |
|
||||||
> # Construction from class
|
|
||||||
> from spacy.pipeline import DependencyParser
|
|
||||||
> parser = DependencyParser(nlp.vocab)
|
|
||||||
> parser.from_disk("/path/to/model")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `**cfg` | - | Configuration parameters. |
|
|
||||||
| **RETURNS** | `DependencyParser` | The newly constructed object. |
|
|
||||||
|
|
||||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## DependencyParser.predict {#predict tag="method"}
|
## DependencyParser.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | ---------------------------------------------- |
|
| ----------- | ------------------- | ---------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to predict. |
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
|
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
|
||||||
|
|
||||||
## DependencyParser.set_annotations {#set_annotations tag="method"}
|
## DependencyParser.set_annotations {#set_annotations tag="method"}
|
||||||
|
@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> parser.set_annotations([doc1, doc2], scores)
|
> parser.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | ---------------------------------------------------------- |
|
| -------- | ------------------- | ---------------------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `scores` | - | The scores to set, produced by `DependencyParser.predict`. |
|
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. |
|
||||||
|
|
||||||
## DependencyParser.update {#update tag="method"}
|
## DependencyParser.update {#update tag="method"}
|
||||||
|
|
||||||
Learn from a batch of documents and gold-standard information, updating the
|
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
|
||||||
pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and
|
model. Delegates to [`predict`](/api/dependencyparser#predict) and
|
||||||
[`get_loss`](/api/dependencyparser#get_loss).
|
[`get_loss`](/api/dependencyparser#get_loss).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = DependencyParser(nlp.vocab, parser_model)
|
||||||
> losses = {}
|
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
> losses = parser.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | -------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | A batch of documents to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
|
||||||
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## DependencyParser.get_loss {#get_loss tag="method"}
|
## DependencyParser.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -156,21 +162,20 @@ predicted scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = DependencyParser(nlp.vocab)
|
||||||
> scores = parser.predict([doc1, doc2])
|
> scores = parser.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores)
|
> loss, d_loss = parser.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------ |
|
| ----------- | ------------------- | --------------------------------------------------- |
|
||||||
| `docs` | iterable | The batch of documents. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
|
||||||
|
|
||||||
## DependencyParser.begin_training {#begin_training tag="method"}
|
## DependencyParser.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. If no model
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
has been initialized yet, the model is added.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -180,16 +185,17 @@ has been initialized yet, the model is added.
|
||||||
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
|
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
|
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
||||||
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
|
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
Create an optimizer for the pipeline component.
|
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
|
||||||
|
component.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -198,9 +204,9 @@ Create an optimizer for the pipeline component.
|
||||||
> optimizer = parser.create_optimizer()
|
> optimizer = parser.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------- |
|
| ----------- | ----------- | --------------------------------------------------------------- |
|
||||||
| **RETURNS** | callable | The optimizer. |
|
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
|
||||||
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
|
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
|
|
@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"entity_linker"`.
|
via the ID `"entity_linker"`.
|
||||||
|
|
||||||
## EntityLinker.Model {#model tag="classmethod"}
|
## Default config {#config}
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
This is the default configuration used to initialize the model powering the
|
||||||
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
context encoder. Wrappers are under development for most major machine learning
|
documentation for details on the architectures and their arguments and
|
||||||
libraries.
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
| Name | Type | Description |
|
```python
|
||||||
| ----------- | ------ | ------------------------------------- |
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
```
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
|
||||||
## EntityLinker.\_\_init\_\_ {#init tag="method"}
|
## EntityLinker.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> # Construction via create_pipe with default model
|
||||||
|
> entity_linker = nlp.create_pipe("entity_linker")
|
||||||
|
>
|
||||||
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_el"}}
|
||||||
|
> entity_linker = nlp.create_pipe("entity_linker", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
|
> from spacy.pipeline import EntityLinker
|
||||||
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
|
> entity_linker = EntityLinker(nlp.vocab, model)
|
||||||
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
|
|
||||||
> #### Example
|
| Name | Type | Description |
|
||||||
>
|
| ------- | ------- | ------------------------------------------------------------------------------- |
|
||||||
> ```python
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
> # Construction via create_pipe
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
> entity_linker = nlp.create_pipe("entity_linker")
|
| `**cfg` | - | Configuration parameters. |
|
||||||
>
|
|
||||||
> # Construction from class
|
|
||||||
> from spacy.pipeline import EntityLinker
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> entity_linker.from_disk("/path/to/model")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
| **RETURNS** | `EntityLinker` | The newly constructed object. |
|
||||||
| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to `128`. |
|
|
||||||
| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to `True`. |
|
|
||||||
| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`. |
|
|
||||||
| **RETURNS** | `EntityLinker` | The newly constructed object. |
|
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## EntityLinker.predict {#predict tag="method"}
|
## EntityLinker.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = EntityLinker(nlp.vocab)
|
||||||
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
|
> kb_ids = entity_linker.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------ |
|
||||||
| `docs` | iterable | The documents to predict. |
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
|
| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. |
|
||||||
|
|
||||||
## EntityLinker.set_annotations {#set_annotations tag="method"}
|
## EntityLinker.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
@ -122,19 +125,18 @@ entities.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = EntityLinker(nlp.vocab)
|
||||||
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
|
> kb_ids = entity_linker.predict([doc1, doc2])
|
||||||
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
|
> entity_linker.set_annotations([doc1, doc2], kb_ids)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | -------- | ------------------------------------------------------------------------------------------------- |
|
| -------- | --------------- | ------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
|
| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
|
||||||
| `tensors` | iterable | The token representations used to predict the identifiers. |
|
|
||||||
|
|
||||||
## EntityLinker.update {#update tag="method"}
|
## EntityLinker.update {#update tag="method"}
|
||||||
|
|
||||||
Learn from a batch of documents and gold-standard information, updating both the
|
Learn from a batch of [`Example`](/api/example) objects, updating both the
|
||||||
pipe's entity linking model and context encoder. Delegates to
|
pipe's entity linking model and context encoder. Delegates to
|
||||||
[`predict`](/api/entitylinker#predict) and
|
[`predict`](/api/entitylinker#predict) and
|
||||||
[`get_loss`](/api/entitylinker#get_loss).
|
[`get_loss`](/api/entitylinker#get_loss).
|
||||||
|
@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = EntityLinker(nlp.vocab, nel_model)
|
||||||
> losses = {}
|
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
> losses = entity_linker.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | ------------------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `docs` | iterable | A batch of documents to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
|
||||||
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
## EntityLinker.get_loss {#get_loss tag="method"}
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
Find the loss and gradient of loss for the entities in a batch of documents and
|
|
||||||
their predicted scores.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> kb_ids, tensors = entity_linker.predict(docs)
|
|
||||||
> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------- | -------- | ------------------------------------------------------------ |
|
|
||||||
| `docs` | iterable | The batch of documents. |
|
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
|
||||||
| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
|
|
||||||
| `tensors` | iterable | The token representations used to predict the identifiers |
|
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
|
||||||
|
|
||||||
## EntityLinker.set_kb {#set_kb tag="method"}
|
## EntityLinker.set_kb {#set_kb tag="method"}
|
||||||
|
|
||||||
|
@ -195,9 +177,9 @@ identifiers.
|
||||||
|
|
||||||
## EntityLinker.begin_training {#begin_training tag="method"}
|
## EntityLinker.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. If no model
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
has been initialized yet, the model is added. Before calling this method, a
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
|
||||||
knowledge base should have been defined with
|
method, a knowledge base should have been defined with
|
||||||
[`set_kb`](/api/entitylinker#set_kb).
|
[`set_kb`](/api/entitylinker#set_kb).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -209,12 +191,12 @@ knowledge base should have been defined with
|
||||||
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
|
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
|
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
||||||
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -227,9 +209,9 @@ Create an optimizer for the pipeline component.
|
||||||
> optimizer = entity_linker.create_optimizer()
|
> optimizer = entity_linker.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------- |
|
| ----------- | ----------- | --------------------------------------------------------------- |
|
||||||
| **RETURNS** | callable | The optimizer. |
|
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
|
||||||
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
|
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
|
|
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"ner"`.
|
via the ID `"ner"`.
|
||||||
|
|
||||||
## EntityRecognizer.Model {#model tag="classmethod"}
|
## Default config {#config}
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
This is the default configuration used to initialize the model powering the
|
||||||
`thinc.neural.Model` API. Wrappers are under development for most major machine
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
learning libraries.
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
| Name | Type | Description |
|
```python
|
||||||
| ----------- | ------ | ------------------------------------- |
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
```
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
|
||||||
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
|
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
|
||||||
shortcut for this and instantiate the component using its string name and
|
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via create_pipe
|
||||||
> ner = nlp.create_pipe("ner")
|
> ner = nlp.create_pipe("ner")
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_ner"}}
|
||||||
|
> ner = nlp.create_pipe("ner", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
> from spacy.pipeline import EntityRecognizer
|
> from spacy.pipeline import EntityRecognizer
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
> ner.from_disk("/path/to/model")
|
> ner = EntityRecognizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
shortcut for this and instantiate the component using its string name and
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| Name | Type | Description |
|
||||||
| **RETURNS** | `EntityRecognizer` | The newly constructed object. |
|
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
|
| `**cfg` | - | Configuration parameters. |
|
||||||
|
| **RETURNS** | `EntityRecognizer` | The newly constructed object. |
|
||||||
|
|
||||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## EntityRecognizer.predict {#predict tag="method"}
|
## EntityRecognizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = EntityRecognizer(nlp.vocab)
|
||||||
> scores, tensors = ner.predict([doc1, doc2])
|
> scores = ner.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to predict. |
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
| **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). |
|
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). |
|
||||||
|
|
||||||
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
|
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
@ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = EntityRecognizer(nlp.vocab)
|
||||||
> scores, tensors = ner.predict([doc1, doc2])
|
> scores = ner.predict([doc1, doc2])
|
||||||
> ner.set_annotations([doc1, doc2], scores, tensors)
|
> ner.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | -------- | ---------------------------------------------------------- |
|
| -------- | ------------------ | ---------------------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
|
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. |
|
||||||
| `tensors` | iterable | The token representations used to predict the scores. |
|
|
||||||
|
|
||||||
## EntityRecognizer.update {#update tag="method"}
|
## EntityRecognizer.update {#update tag="method"}
|
||||||
|
|
||||||
Learn from a batch of documents and gold-standard information, updating the
|
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
|
||||||
pipe's model. Delegates to [`predict`](/api/entityrecognizer#predict) and
|
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
|
||||||
[`get_loss`](/api/entityrecognizer#get_loss).
|
[`get_loss`](/api/entityrecognizer#get_loss).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = EntityRecognizer(nlp.vocab, ner_model)
|
||||||
> losses = {}
|
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> ner.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
> losses = ner.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | -------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | A batch of documents to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
|
||||||
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## EntityRecognizer.get_loss {#get_loss tag="method"}
|
## EntityRecognizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -157,21 +162,20 @@ predicted scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = EntityRecognizer(nlp.vocab)
|
||||||
> scores = ner.predict([doc1, doc2])
|
> scores = ner.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = ner.get_loss([doc1, doc2], [gold1, gold2], scores)
|
> loss, d_loss = ner.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------ |
|
| ----------- | ------------------- | --------------------------------------------------- |
|
||||||
| `docs` | iterable | The batch of documents. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
|
||||||
|
|
||||||
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. If no model
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
has been initialized yet, the model is added.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -181,12 +185,12 @@ has been initialized yet, the model is added.
|
||||||
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
|
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
|
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
||||||
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityRecognizer`](/api/entityrecognizer#create_optimizer) if not set. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
|
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -199,9 +203,9 @@ Create an optimizer for the pipeline component.
|
||||||
> optimizer = ner.create_optimizer()
|
> optimizer = ner.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------- |
|
| ----------- | ----------- | --------------------------------------------------------------- |
|
||||||
| **RETURNS** | callable | The optimizer. |
|
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
|
||||||
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
|
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,280 @@
|
||||||
---
|
---
|
||||||
title: Example
|
title: Example
|
||||||
teaser: A training example
|
teaser: A training instance
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/gold/example.pyx
|
source: spacy/gold/example.pyx
|
||||||
|
new: 3.0
|
||||||
---
|
---
|
||||||
|
|
||||||
<!-- TODO: -->
|
An `Example` holds the information for one training instance. It stores two
|
||||||
|
`Doc` objects: one for holding the gold-standard reference data, and one for
|
||||||
|
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? -->
|
||||||
|
object stores the alignment between these two documents, as they can differ in
|
||||||
|
tokenization.
|
||||||
|
|
||||||
## Example.\_\_init\_\_ {#init tag="method"}
|
## Example.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Construct an `Example` object from the `predicted` document and the `reference`
|
||||||
|
document. If `alignment` is `None`, it will be initialized from the words in
|
||||||
|
both documents.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.tokens import Doc
|
||||||
|
> from spacy.gold import Example
|
||||||
|
>
|
||||||
|
> words = ["hello", "world", "!"]
|
||||||
|
> spaces = [True, False, False]
|
||||||
|
> predicted = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||||
|
> reference = parse_gold_doc(my_data)
|
||||||
|
> example = Example(predicted, reference)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | ----------- | ------------------------------------------------------------------------------------------------ |
|
||||||
|
| `predicted` | `Doc` | The document containing (partial) predictions. Cannot be `None`. |
|
||||||
|
| `reference` | `Doc` | The document containing gold-standard annotations. Cannot be `None`. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. |
|
||||||
|
| **RETURNS** | `Example` | The newly constructed object. |
|
||||||
|
|
||||||
|
## Example.from_dict {#from_dict tag="classmethod"}
|
||||||
|
|
||||||
|
Construct an `Example` object from the `predicted` document and the reference
|
||||||
|
annotations provided as a dictionary.
|
||||||
|
|
||||||
|
<!-- TODO: document formats? legacy & token_annotation stuff -->
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.tokens import Doc
|
||||||
|
> from spacy.gold import Example
|
||||||
|
>
|
||||||
|
> predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
|
||||||
|
> token_ref = ["Apply", "some", "sun", "screen"]
|
||||||
|
> tags_ref = ["VERB", "DET", "NOUN", "NOUN"]
|
||||||
|
> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | ---------------- | ----------------------------------------------------------------- |
|
||||||
|
| `predicted` | `Doc` | The document containing (partial) predictions. Cannot be `None`. |
|
||||||
|
| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. |
|
||||||
|
| **RETURNS** | `Example` | The newly constructed object. |
|
||||||
|
|
||||||
|
## Example.text {#text tag="property"}
|
||||||
|
|
||||||
|
The text of the `predicted` document in this `Example`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> raw_text = example.text
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---- | ------------------------------------- |
|
||||||
|
| **RETURNS** | str | The text of the `predicted` document. |
|
||||||
|
|
||||||
|
## Example.predicted {#predicted tag="property"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> docs = [eg.predicted for eg in examples]
|
||||||
|
> predictions, _ = model.begin_update(docs)
|
||||||
|
> set_annotations(docs, predictions)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
The `Doc` holding the predictions. Occasionally also referred to as `example.x`.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | ---------------------------------------------- |
|
||||||
|
| **RETURNS** | `Doc` | The document containing (partial) predictions. |
|
||||||
|
|
||||||
|
## Example.reference {#reference tag="property"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> for i, eg in enumerate(examples):
|
||||||
|
> for j, label in enumerate(all_labels):
|
||||||
|
> gold_labels[i][j] = eg.reference.cats.get(label, 0.0)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
The `Doc` holding the gold-standard annotations. Occasionally also referred to
|
||||||
|
as `example.y`.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | -------------------------------------------------- |
|
||||||
|
| **RETURNS** | `Doc` | The document containing gold-standard annotations. |
|
||||||
|
|
||||||
|
## Example.alignment {#alignment tag="property"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> tokens_x = ["Apply", "some", "sunscreen"]
|
||||||
|
> x = Doc(vocab, words=tokens_x)
|
||||||
|
> tokens_y = ["Apply", "some", "sun", "screen"]
|
||||||
|
> example = Example.from_dict(x, {"words": tokens_y})
|
||||||
|
> alignment = example.alignment
|
||||||
|
> assert list(alignment.y2x.data) == [[0], [1], [2], [2]]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
The `Alignment` object mapping the tokens of the `predicted` document to those
|
||||||
|
of the `reference` document.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----------- | -------------------------------------------------- |
|
||||||
|
| **RETURNS** | `Alignment` | The object holding the alignment between the tokens of the `predicted` and `reference` documents. |
|
||||||
|
|
||||||
|
## Example.get_aligned {#get_aligned tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
|
||||||
|
> token_ref = ["Apply", "some", "sun", "screen"]
|
||||||
|
> tags_ref = ["VERB", "DET", "NOUN", "NOUN"]
|
||||||
|
> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
|
||||||
|
> assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Get the aligned view of a certain token attribute, denoted by its int ID or
|
||||||
|
string name.
|
||||||
|
|
||||||
|
| Name | Type | Description | Default |
|
||||||
|
| ----------- | -------------------------- | ------------------------------------------------------------------ | ------- |
|
||||||
|
| `field` | int or str | Attribute ID or string name. | |
|
||||||
|
| `as_string` | bool | Whether or not to return the list of values as strings. | `False` |
|
||||||
|
| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | |
|
||||||
|
|
||||||
|
## Example.get_aligned_parse {#get_aligned_parse tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("He pretty quickly walks away")
|
||||||
|
> example = Example.from_dict(doc, {"heads": [3, 2, 3, 0, 2]})
|
||||||
|
> proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||||
|
> assert proj_heads == [3, 2, 3, 0, 3]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Get the aligned view of the dependency parse. If `projectivize` is set to
|
||||||
|
`True`, non-projective dependency trees are made projective through the
|
||||||
|
Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
|
||||||
|
|
||||||
|
| Name | Type | Description | Default |
|
||||||
|
| -------------- | -------------------------- | ------------------------------------------------------------------ | ------- |
|
||||||
|
| `projectivize` | bool | Whether or not to projectivize the dependency trees. | `True` |
|
||||||
|
| **RETURNS** | `Tuple[List[int], List[str]]` | The aligned heads and dependency labels, i.e. `(heads, labels)`. | |
|
||||||
|
|
||||||
|
## Example.get_aligned_ner {#get_aligned_ner tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> words = ["Mrs", "Smith", "flew", "to", "New York"]
|
||||||
|
> doc = Doc(en_vocab, words=words)
|
||||||
|
> entities = [(0, 9, "PERSON"), (18, 26, "LOC")]
|
||||||
|
> gold_words = ["Mrs Smith", "flew", "to", "New", "York"]
|
||||||
|
> example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
> ner_tags = example.get_aligned_ner()
|
||||||
|
> assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Get the aligned view of the NER
|
||||||
|
[BILUO](/usage/linguistic-features#accessing-ner) tags.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----------- | ----------------------------------------------------------------------------------- |
|
||||||
|
| **RETURNS** | `List[str]` | List of BILUO values, denoting whether tokens are part of an NER annotation or not. |
|
||||||
|
|
||||||
|
## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> words = ["Mr and Mrs Smith", "flew", "to", "New York"]
|
||||||
|
> doc = Doc(en_vocab, words=words)
|
||||||
|
> entities = [(0, 16, "PERSON")]
|
||||||
|
> tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "New", "York"]
|
||||||
|
> example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
|
||||||
|
> ents_ref = example.reference.ents
|
||||||
|
> assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4)]
|
||||||
|
> ents_y2x = example.get_aligned_spans_y2x(ents_ref)
|
||||||
|
> assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||||
|
`example.reference`. The resulting span indices will align to the tokenization
|
||||||
|
in `example.predicted`.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---------------- | --------------------------------------------------------------- |
|
||||||
|
| `y_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
|
||||||
|
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
|
||||||
|
|
||||||
|
## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> nlp.add_pipe(my_ner)
|
||||||
|
> doc = nlp("Mr and Mrs Smith flew to New York")
|
||||||
|
> tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "New York"]
|
||||||
|
> example = Example.from_dict(doc, {"words": tokens_ref})
|
||||||
|
> ents_pred = example.predicted.ents
|
||||||
|
> # Assume the NER model has found "Mr and Mrs Smith" as a named entity
|
||||||
|
> assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4)]
|
||||||
|
> ents_x2y = example.get_aligned_spans_x2y(ents_pred)
|
||||||
|
> assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||||
|
`example.predicted`. The resulting span indices will align to the tokenization
|
||||||
|
in `example.reference`. This method is particularly useful to assess the
|
||||||
|
accuracy of predicted entities against the original gold-standard annotation.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---------------- | --------------------------------------------------------------- |
|
||||||
|
| `x_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
|
||||||
|
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
|
||||||
|
|
||||||
|
## Example.to_dict {#to_dict tag="method"}
|
||||||
|
|
||||||
|
Return a dictionary representation of the reference annotation contained in this
|
||||||
|
`Example`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> eg_dict = example.to_dict()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---------------- | ------------------------------------------------------ |
|
||||||
|
| **RETURNS** | `Dict[str, obj]` | Dictionary representation of the reference annotation. |
|
||||||
|
|
||||||
|
## Example.split_sents {#split_sents tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("I went yesterday had lots of fun")
|
||||||
|
> tokens_ref = ["I", "went", "yesterday", "had", "lots", "of", "fun"]
|
||||||
|
> sents_ref = [True, False, False, True, False, False, False]
|
||||||
|
> example = Example.from_dict(doc, {"words": tokens_ref, "sent_starts": sents_ref})
|
||||||
|
> split_examples = example.split_sents()
|
||||||
|
> assert split_examples[0].text == "I went yesterday "
|
||||||
|
> assert split_examples[1].text == "had lots of fun"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Split one `Example` into multiple `Example` objects, one for each sentence.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------- | ---------------------------------------------------------- |
|
||||||
|
| **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. |
|
||||||
|
|
|
@ -52,7 +52,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | --------------------------------------------------------------------------------- |
|
| ----------- | ----- | --------------------------------------------------------------------------------- |
|
||||||
| `text` | str | The text to be processed. |
|
| `text` | str | The text to be processed. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||||
| **RETURNS** | `Doc` | A container for accessing the annotations. |
|
| **RETURNS** | `Doc` | A container for accessing the annotations. |
|
||||||
|
|
||||||
## Language.pipe {#pipe tag="method"}
|
## Language.pipe {#pipe tag="method"}
|
||||||
|
@ -68,15 +68,15 @@ more efficient than processing texts one-by-one.
|
||||||
> assert doc.is_parsed
|
> assert doc.is_parsed
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `texts` | iterable | A sequence of strings. |
|
| `texts` | `Iterable[str]` | A sequence of strings. |
|
||||||
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
|
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
|
||||||
| `batch_size` | int | The number of texts to buffer. |
|
| `batch_size` | int | The number of texts to buffer. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||||
| `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. |
|
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
|
||||||
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
|
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
|
||||||
| **YIELDS** | `Doc` | Documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Documents in the order of the original text. |
|
||||||
|
|
||||||
## Language.update {#update tag="method"}
|
## Language.update {#update tag="method"}
|
||||||
|
|
||||||
|
@ -87,18 +87,19 @@ Update the models in the pipeline.
|
||||||
> ```python
|
> ```python
|
||||||
> for raw_text, entity_offsets in train_data:
|
> for raw_text, entity_offsets in train_data:
|
||||||
> doc = nlp.make_doc(raw_text)
|
> doc = nlp.make_doc(raw_text)
|
||||||
> gold = GoldParse(doc, entities=entity_offsets)
|
> example = Example.from_dict(doc, {"entities": entity_offsets})
|
||||||
> nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
|
> nlp.update([example], sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | A batch of `Doc` objects or strings. If strings, a `Doc` object will be created from the text. |
|
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
|
||||||
| `golds` | iterable | A batch of `GoldParse` objects or dictionaries. Dictionaries will be used to create [`GoldParse`](/api/goldparse) objects. For the available keys and their usage, see [`GoldParse.__init__`](/api/goldparse#init). |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | An optimizer. |
|
| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
| `losses` | dict | Dictionary to update with the loss, keyed by pipeline component. |
|
| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. |
|
||||||
| `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. |
|
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Language.evaluate {#evaluate tag="method"}
|
## Language.evaluate {#evaluate tag="method"}
|
||||||
|
|
||||||
|
@ -107,35 +108,37 @@ Evaluate a model's pipeline components.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> scorer = nlp.evaluate(docs_golds, verbose=True)
|
> scorer = nlp.evaluate(examples, verbose=True)
|
||||||
> print(scorer.scores)
|
> print(scorer.scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------- |
|
||||||
| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects, such that the `Doc` objects contain the predictions and the `GoldParse` objects the correct annotations. Alternatively, `(text, annotations)` tuples of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to evaluate. |
|
||||||
| `verbose` | bool | Print debugging information. |
|
| `verbose` | bool | Print debugging information. |
|
||||||
| `batch_size` | int | The batch size to use. |
|
| `batch_size` | int | The batch size to use. |
|
||||||
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
|
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
|
||||||
| `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. |
|
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
|
||||||
| **RETURNS** | Scorer | The scorer containing the evaluation scores. |
|
| **RETURNS** | `Scorer` | The scorer containing the evaluation scores. |
|
||||||
|
|
||||||
## Language.begin_training {#begin_training tag="method"}
|
## Language.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Allocate models, pre-process training data and acquire an optimizer.
|
Allocate models, pre-process training data and acquire an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> optimizer = nlp.begin_training(gold_tuples)
|
> optimizer = nlp.begin_training(get_examples)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | -------- | ---------------------------------------------------------------------------- |
|
| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `gold_tuples` | iterable | Gold-standard training data. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. |
|
||||||
| `**cfg` | - | Config parameters (sent to all components). |
|
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| `**cfg` | - | Config parameters (sent to all components). |
|
||||||
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## Language.use_params {#use_params tag="contextmanager, method"}
|
## Language.use_params {#use_params tag="contextmanager, method"}
|
||||||
|
|
||||||
|
@ -155,16 +158,6 @@ their original weights after the block.
|
||||||
| `params` | dict | A dictionary of parameters keyed by model ID. |
|
| `params` | dict | A dictionary of parameters keyed by model ID. |
|
||||||
| `**cfg` | - | Config parameters. |
|
| `**cfg` | - | Config parameters. |
|
||||||
|
|
||||||
## Language.preprocess_gold {#preprocess_gold tag="method"}
|
|
||||||
|
|
||||||
Can be called before training to pre-process gold data. By default, it handles
|
|
||||||
nonprojectivity and adds missing tags to the tag map.
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ------------ | -------- | ---------------------------------------- |
|
|
||||||
| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects. |
|
|
||||||
| **YIELDS** | tuple | Tuples of `Doc` and `GoldParse` objects. |
|
|
||||||
|
|
||||||
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
||||||
|
|
||||||
Create a pipeline component from a factory.
|
Create a pipeline component from a factory.
|
||||||
|
|
23
website/docs/api/morphologizer.md
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
---
|
||||||
|
title: Morphologizer
|
||||||
|
tag: class
|
||||||
|
source: spacy/pipeline/morphologizer.pyx
|
||||||
|
new: 3
|
||||||
|
---
|
||||||
|
|
||||||
|
A trainable pipeline component to predict morphological features. This class is
|
||||||
|
a subclass of `Pipe` and follows the same API. The component is also available
|
||||||
|
via the string name `"morphologizer"`. After initialization, it is typically
|
||||||
|
added to the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
## Default config {#config}
|
||||||
|
|
||||||
|
This is the default configuration used to initialize the model powering the
|
||||||
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/morphologizer_defaults.cfg
|
||||||
|
```
|
|
@ -27,22 +27,20 @@ Create a new `Scorer`.
|
||||||
|
|
||||||
## Scorer.score {#score tag="method"}
|
## Scorer.score {#score tag="method"}
|
||||||
|
|
||||||
Update the evaluation scores from a single [`Doc`](/api/doc) /
|
Update the evaluation scores from a single [`Example`](/api/example) object.
|
||||||
[`GoldParse`](/api/goldparse) pair.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> scorer = Scorer()
|
> scorer = Scorer()
|
||||||
> scorer.score(doc, gold)
|
> scorer.score(example)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `doc` | `Doc` | The predicted annotations. |
|
| `example` | `Example` | The `Example` object holding both the predictions and the correct gold-standard annotations. |
|
||||||
| `gold` | `GoldParse` | The correct annotations. |
|
| `verbose` | bool | Print debugging information. |
|
||||||
| `verbose` | bool | Print debugging information. |
|
| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. |
|
||||||
| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. |
|
|
||||||
|
|
||||||
## Properties
|
## Properties
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,18 @@ subclass of `Pipe` and follows the same API. The component is also available via
|
||||||
the string name `"senter"`. After initialization, it is typically added to the
|
the string name `"senter"`. After initialization, it is typically added to the
|
||||||
processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
## Default config {#config}
|
||||||
|
|
||||||
|
This is the default configuration used to initialize the model powering the
|
||||||
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/senter_defaults.cfg
|
||||||
|
```
|
||||||
|
|
||||||
## SentenceRecognizer.\_\_init\_\_ {#init tag="method"}
|
## SentenceRecognizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Initialize the sentence recognizer.
|
Initialize the sentence recognizer.
|
||||||
|
|
|
@ -8,41 +8,34 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"tagger"`.
|
via the ID `"tagger"`.
|
||||||
|
|
||||||
## Tagger.Model {#model tag="classmethod"}
|
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
|
||||||
`thinc.neural.Model` API. Wrappers are under development for most major machine
|
|
||||||
learning libraries.
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------- | ------ | ------------------------------------- |
|
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
|
||||||
## Tagger.\_\_init\_\_ {#init tag="method"}
|
## Tagger.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
|
||||||
shortcut for this and instantiate the component using its string name and
|
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via create_pipe
|
||||||
> tagger = nlp.create_pipe("tagger")
|
> tagger = nlp.create_pipe("tagger")
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_tagger"}}
|
||||||
|
> tagger = nlp.create_pipe("tagger", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
> from spacy.pipeline import Tagger
|
> from spacy.pipeline import Tagger
|
||||||
> tagger = Tagger(nlp.vocab)
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
> tagger.from_disk("/path/to/model")
|
> tagger = Tagger(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
shortcut for this and instantiate the component using its string name and
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| Name | Type | Description |
|
||||||
| **RETURNS** | `Tagger` | The newly constructed object. |
|
| ----------- | -------- | ------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
|
| `**cfg` | - | Configuration parameters. |
|
||||||
|
| **RETURNS** | `Tagger` | The newly constructed object. |
|
||||||
|
|
||||||
## Tagger.\_\_call\_\_ {#call tag="method"}
|
## Tagger.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -83,11 +76,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## Tagger.predict {#predict tag="method"}
|
## Tagger.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -97,13 +90,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = Tagger(nlp.vocab)
|
||||||
> scores, tensors = tagger.predict([doc1, doc2])
|
> scores = tagger.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | --------------- | ----------------------------------------- |
|
||||||
| `docs` | iterable | The documents to predict. |
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. |
|
| **RETURNS** | - | The model's prediction for each document. |
|
||||||
|
|
||||||
## Tagger.set_annotations {#set_annotations tag="method"}
|
## Tagger.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
@ -113,15 +106,14 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = Tagger(nlp.vocab)
|
||||||
> scores, tensors = tagger.predict([doc1, doc2])
|
> scores = tagger.predict([doc1, doc2])
|
||||||
> tagger.set_annotations([doc1, doc2], scores, tensors)
|
> tagger.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | -------- | ----------------------------------------------------- |
|
| -------- | --------------- | ------------------------------------------------ |
|
||||||
| `docs` | iterable | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `scores` | - | The scores to set, produced by `Tagger.predict`. |
|
| `scores` | - | The scores to set, produced by `Tagger.predict`. |
|
||||||
| `tensors` | iterable | The token representations used to predict the scores. |
|
|
||||||
|
|
||||||
## Tagger.update {#update tag="method"}
|
## Tagger.update {#update tag="method"}
|
||||||
|
|
||||||
|
@ -132,19 +124,20 @@ pipe's model. Delegates to [`predict`](/api/tagger#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = Tagger(nlp.vocab, tagger_model)
|
||||||
> losses = {}
|
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> tagger.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
> losses = tagger.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | -------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `docs` | iterable | A batch of documents to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). |
|
||||||
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Tagger.get_loss {#get_loss tag="method"}
|
## Tagger.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -155,21 +148,20 @@ predicted scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = Tagger(nlp.vocab)
|
||||||
> scores = tagger.predict([doc1, doc2])
|
> scores = tagger.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = tagger.get_loss([doc1, doc2], [gold1, gold2], scores)
|
> loss, d_loss = tagger.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------ |
|
| ----------- | ------------------- | --------------------------------------------------- |
|
||||||
| `docs` | iterable | The batch of documents. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
|
||||||
|
|
||||||
## Tagger.begin_training {#begin_training tag="method"}
|
## Tagger.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. If no model
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
has been initialized yet, the model is added.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -179,12 +171,12 @@ has been initialized yet, the model is added.
|
||||||
> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
|
> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
|
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
||||||
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`Tagger`](/api/tagger#create_optimizer) if not set. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## Tagger.create_optimizer {#create_optimizer tag="method"}
|
## Tagger.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -197,9 +189,9 @@ Create an optimizer for the pipeline component.
|
||||||
> optimizer = tagger.create_optimizer()
|
> optimizer = tagger.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------- |
|
| ----------- | ----------- | --------------------------------------------------------------- |
|
||||||
| **RETURNS** | callable | The optimizer. |
|
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
|
||||||
## Tagger.use_params {#use_params tag="method, contextmanager"}
|
## Tagger.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
|
|
@ -9,44 +9,50 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"textcat"`.
|
via the ID `"textcat"`.
|
||||||
|
|
||||||
## TextCategorizer.Model {#model tag="classmethod"}
|
## Default config {#config}
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
This is the default configuration used to initialize the model powering the
|
||||||
`thinc.neural.Model` API. Wrappers are under development for most major machine
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
learning libraries.
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
| Name | Type | Description |
|
```python
|
||||||
| ----------- | ------ | ------------------------------------- |
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/textcat_defaults.cfg
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
```
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
<!-- TODO: do we also need to document the other defaults here? -->
|
||||||
|
|
||||||
## TextCategorizer.\_\_init\_\_ {#init tag="method"}
|
## TextCategorizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
|
||||||
shortcut for this and instantiate the component using its string name and
|
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via create_pipe
|
||||||
> textcat = nlp.create_pipe("textcat")
|
> textcat = nlp.create_pipe("textcat")
|
||||||
> textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
|
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_textcat"}}
|
||||||
|
> textcat = nlp.create_pipe("textcat", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
> from spacy.pipeline import TextCategorizer
|
> from spacy.pipeline import TextCategorizer
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
> textcat.from_disk("/path/to/model")
|
> textcat = TextCategorizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
| ------------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
shortcut for this and instantiate the component using its string name and
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. |
|
|
||||||
| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
|
|
||||||
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
|
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----------------- | ------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
|
| `**cfg` | - | Configuration parameters. |
|
||||||
|
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
|
||||||
|
|
||||||
|
<!-- TODO move to config page
|
||||||
### Architectures {#architectures new="2.1"}
|
### Architectures {#architectures new="2.1"}
|
||||||
|
|
||||||
Text classification models can be used to solve a wide variety of problems.
|
Text classification models can be used to solve a wide variety of problems.
|
||||||
|
@ -61,6 +67,7 @@ argument.
|
||||||
| `"ensemble"` | **Default:** Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The "ngram_size" and "attr" arguments can be used to configure the feature extraction for the bag-of-words model. |
|
| `"ensemble"` | **Default:** Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The "ngram_size" and "attr" arguments can be used to configure the feature extraction for the bag-of-words model. |
|
||||||
| `"simple_cnn"` | A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. |
|
| `"simple_cnn"` | A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. |
|
||||||
| `"bow"` | An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments `ngram_size` and `attr`. For instance, `ngram_size=3` and `attr="lower"` would give lower-cased unigram, bigram and trigram features. 2, 3 or 4 are usually good choices of ngram size. |
|
| `"bow"` | An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments `ngram_size` and `attr`. For instance, `ngram_size=3` and `attr="lower"` would give lower-cased unigram, bigram and trigram features. 2, 3 or 4 are usually good choices of ngram size. |
|
||||||
|
-->
|
||||||
|
|
||||||
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -102,11 +109,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## TextCategorizer.predict {#predict tag="method"}
|
## TextCategorizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -116,13 +123,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = TextCategorizer(nlp.vocab)
|
||||||
> scores, tensors = textcat.predict([doc1, doc2])
|
> scores = textcat.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | --------------- | ----------------------------------------- |
|
||||||
| `docs` | iterable | The documents to predict. |
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. |
|
| **RETURNS** | - | The model's prediction for each document. |
|
||||||
|
|
||||||
## TextCategorizer.set_annotations {#set_annotations tag="method"}
|
## TextCategorizer.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
@ -132,15 +139,14 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = TextCategorizer(nlp.vocab)
|
||||||
> scores, tensors = textcat.predict([doc1, doc2])
|
> scores = textcat.predict(docs)
|
||||||
> textcat.set_annotations([doc1, doc2], scores, tensors)
|
> textcat.set_annotations(docs, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | -------- | --------------------------------------------------------- |
|
| -------- | --------------- | --------------------------------------------------------- |
|
||||||
| `docs` | iterable | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
|
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
|
||||||
| `tensors` | iterable | The token representations used to predict the scores. |
|
|
||||||
|
|
||||||
## TextCategorizer.update {#update tag="method"}
|
## TextCategorizer.update {#update tag="method"}
|
||||||
|
|
||||||
|
@ -151,19 +157,20 @@ pipe's model. Delegates to [`predict`](/api/textcategorizer#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = TextCategorizer(nlp.vocab, textcat_model)
|
||||||
> losses = {}
|
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> textcat.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
> losses = textcat.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | -------- | -------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | iterable | A batch of documents to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). |
|
||||||
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## TextCategorizer.get_loss {#get_loss tag="method"}
|
## TextCategorizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -174,21 +181,20 @@ predicted scores.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = TextCategorizer(nlp.vocab)
|
||||||
> scores = textcat.predict([doc1, doc2])
|
> scores = textcat.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = textcat.get_loss([doc1, doc2], [gold1, gold2], scores)
|
> loss, d_loss = textcat.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------ |
|
| ----------- | ------------------- | --------------------------------------------------- |
|
||||||
| `docs` | iterable | The batch of documents. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
|
||||||
|
|
||||||
## TextCategorizer.begin_training {#begin_training tag="method"}
|
## TextCategorizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. If no model
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
has been initialized yet, the model is added.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -198,12 +204,12 @@ has been initialized yet, the model is added.
|
||||||
> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
|
> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
|
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
|
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
||||||
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`TextCategorizer`](/api/textcategorizer#create_optimizer) if not set. |
|
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. |
|
||||||
| **RETURNS** | callable | An optimizer. |
|
| **RETURNS** | `Optimizer` | An optimizer. |
|
||||||
|
|
||||||
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -216,9 +222,9 @@ Create an optimizer for the pipeline component.
|
||||||
> optimizer = textcat.create_optimizer()
|
> optimizer = textcat.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | -------------- |
|
| ----------- | ----------- | --------------------------------------------------------------- |
|
||||||
| **RETURNS** | callable | The optimizer. |
|
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
|
|
||||||
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
|
19
website/docs/api/tok2vec.md
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
title: Tok2Vec
|
||||||
|
source: spacy/pipeline/tok2vec.py
|
||||||
|
new: 3
|
||||||
|
---
|
||||||
|
|
||||||
|
TODO: document
|
||||||
|
|
||||||
|
## Default config {#config}
|
||||||
|
|
||||||
|
This is the default configuration used to initialize the model powering the
|
||||||
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/tok2vec_defaults.cfg
|
||||||
|
```
|
|
@ -3,6 +3,7 @@ title: Top-level Functions
|
||||||
menu:
|
menu:
|
||||||
- ['spacy', 'spacy']
|
- ['spacy', 'spacy']
|
||||||
- ['displacy', 'displacy']
|
- ['displacy', 'displacy']
|
||||||
|
- ['registry', 'registry']
|
||||||
- ['Data & Alignment', 'gold']
|
- ['Data & Alignment', 'gold']
|
||||||
- ['Utility Functions', 'util']
|
- ['Utility Functions', 'util']
|
||||||
---
|
---
|
||||||
|
@ -33,7 +34,7 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------ | --------------------------------------------------------------------------------- |
|
| ----------- | ------------ | --------------------------------------------------------------------------------- |
|
||||||
| `name` | str / `Path` | Model to load, i.e. package name or path. |
|
| `name` | str / `Path` | Model to load, i.e. package name or path. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||||
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
||||||
|
|
||||||
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
||||||
|
@ -60,11 +61,11 @@ Create a blank model of a given language class. This function is the twin of
|
||||||
> nlp_de = spacy.blank("de")
|
> nlp_de = spacy.blank("de")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
|
| ----------- | ----------- | ------------------------------------------------------------------------------------------------ |
|
||||||
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
||||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||||
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
||||||
|
|
||||||
#### spacy.info {#spacy.info tag="function"}
|
#### spacy.info {#spacy.info tag="function"}
|
||||||
|
|
||||||
|
@ -259,6 +260,48 @@ package can also expose a
|
||||||
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
|
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
|
||||||
to add custom labels and their colors automatically.
|
to add custom labels and their colors automatically.
|
||||||
|
|
||||||
|
## registry {#registry source="spacy/util.py" new="3"}
|
||||||
|
|
||||||
|
spaCy's function registry extends
|
||||||
|
[Thinc's `registry`](https://thinc.ai/docs/api-config#registry) and allows you
|
||||||
|
to map strings to functions. You can register functions to create architectures,
|
||||||
|
optimizers, schedules and more, and then refer to them and set their arguments
|
||||||
|
in your [config file](/usage/training#config). Python type hints are used to
|
||||||
|
validate the inputs. See the
|
||||||
|
[Thinc docs](https://thinc.ai/docs/api-config#registry) for details on the
|
||||||
|
`registry` methods and our helper library
|
||||||
|
[`catalogue`](https://github.com/explosion/catalogue) for some background on the
|
||||||
|
concept of function registries. spaCy also uses the function registry for
|
||||||
|
language subclasses, model architectures, lookups and pipeline component
|
||||||
|
factories.
|
||||||
|
|
||||||
|
<!-- TODO: improve example? -->
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from thinc.api import Model
|
||||||
|
>
|
||||||
|
> @spacy.registry.architectures("CustomNER.v1")
|
||||||
|
> def custom_ner(nO: int) -> Model:
|
||||||
|
> return Model("custom", forward, dims={"nO": nO})
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Registry name | Description |
|
||||||
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||||
|
| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
|
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
|
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||||
|
| `displacy_colors` | Registry for custom color schemes for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
|
| `assets` | <!-- TODO: what is this used for again?--> |
|
||||||
|
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||||
|
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||||
|
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
||||||
|
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
||||||
|
| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
|
||||||
|
|
||||||
## Training data and alignment {#gold source="spacy/gold"}
|
## Training data and alignment {#gold source="spacy/gold"}
|
||||||
|
|
||||||
### gold.docs_to_json {#docs_to_json tag="function"}
|
### gold.docs_to_json {#docs_to_json tag="function"}
|
||||||
|
@ -421,6 +464,8 @@ page should be safe to use and we'll try to ensure backwards compatibility.
|
||||||
However, we recommend having additional tests in place if your application
|
However, we recommend having additional tests in place if your application
|
||||||
depends on any of spaCy's utilities.
|
depends on any of spaCy's utilities.
|
||||||
|
|
||||||
|
<!-- TODO: document new config-related util functions? -->
|
||||||
|
|
||||||
### util.get_lang_class {#util.get_lang_class tag="function"}
|
### util.get_lang_class {#util.get_lang_class tag="function"}
|
||||||
|
|
||||||
Import and load a `Language` class. Allows lazy-loading
|
Import and load a `Language` class. Allows lazy-loading
|
||||||
|
@ -674,8 +719,7 @@ vary on each step.
|
||||||
> ```python
|
> ```python
|
||||||
> batches = minibatch(train_data)
|
> batches = minibatch(train_data)
|
||||||
> for batch in batches:
|
> for batch in batches:
|
||||||
> texts, annotations = zip(*batch)
|
> nlp.update(batch)
|
||||||
> nlp.update(texts, annotations)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -705,7 +749,7 @@ of one entity) or when merging spans with
|
||||||
| `spans` | iterable | The spans to filter. |
|
| `spans` | iterable | The spans to filter. |
|
||||||
| **RETURNS** | list | The filtered spans. |
|
| **RETURNS** | list | The filtered spans. |
|
||||||
|
|
||||||
## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"}
|
### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"}
|
||||||
|
|
||||||
<!-- TODO: document -->
|
<!-- TODO: document -->
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@
|
||||||
<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M521 195l-35.2-49.3"/>
|
<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M521 195l-35.2-49.3"/>
|
||||||
<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M482.3 140.8l8 4.2-4.5.7-2 4z"/>
|
<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M482.3 140.8l8 4.2-4.5.7-2 4z"/>
|
||||||
<path fill="#fff2cc" stroke="#d6b656" stroke-width="2" d="M491 195h120v67H491z"/>
|
<path fill="#fff2cc" stroke="#d6b656" stroke-width="2" d="M491 195h120v67H491z"/>
|
||||||
<text class="svg__trainloop__text" dy="1em" transform="translate(513.5 218.5)" width="73" height="18">GoldParse</text>
|
<text class="svg__trainloop__text" dy="1em" transform="translate(513.5 218.5)" width="73" height="18">Example</text>
|
||||||
<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M466 59V21h-40.8"/>
|
<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M466 59V21h-40.8"/>
|
||||||
<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M419.2 21l8-4-2 4 2 4z"/>
|
<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M419.2 21l8-4-2 4 2 4z"/>
|
||||||
<path fill="#f99" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M436 59h60l30 40-30 40h-60l-30-40z"/>
|
<path fill="#f99" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M436 59h60l30 40-30 40h-60l-30-40z"/>
|
||||||
|
|
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 3.9 KiB |
|
@ -3,7 +3,6 @@ title: Models
|
||||||
teaser: Downloadable pretrained models for spaCy
|
teaser: Downloadable pretrained models for spaCy
|
||||||
menu:
|
menu:
|
||||||
- ['Quickstart', 'quickstart']
|
- ['Quickstart', 'quickstart']
|
||||||
- ['Model Architecture', 'architecture']
|
|
||||||
- ['Conventions', 'conventions']
|
- ['Conventions', 'conventions']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -27,7 +26,7 @@ import QuickstartModels from 'widgets/quickstart-models.js'
|
||||||
|
|
||||||
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and test it." />
|
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and test it." />
|
||||||
|
|
||||||
<Infobox title="📖 Installation and usage">
|
<Infobox title="Installation and usage" emoji="📖">
|
||||||
|
|
||||||
For more details on how to use models with spaCy, see the
|
For more details on how to use models with spaCy, see the
|
||||||
[usage guide on models](/usage/models).
|
[usage guide on models](/usage/models).
|
||||||
|
|
|
@ -45,10 +45,11 @@ an **annotated document**. It also orchestrates training and serialization.
|
||||||
|
|
||||||
### Other classes {#architecture-other}
|
### Other classes {#architecture-other}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------------------- | ------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------- | ----------------------------------------------------------------------------- |
|
||||||
| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. |
|
| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. |
|
||||||
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
||||||
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
||||||
| [`GoldParse`](/api/goldparse) | Collection for training annotations. |
|
| [`Example`](/api/example) | Collection for training annotations. |
|
||||||
| [`GoldCorpus`](/api/goldcorpus) | An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER. |
|
|
||||||
|
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
|
||||||
|
|
||||||
<PosDeps101 />
|
<PosDeps101 />
|
||||||
|
|
||||||
<Infobox title="📖 Part-of-speech tag scheme">
|
<Infobox title="Part-of-speech tag scheme" emoji="📖">
|
||||||
|
|
||||||
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
|
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
|
||||||
by spaCy's models across different languages, see the label schemes documented
|
by spaCy's models across different languages, see the label schemes documented
|
||||||
|
@ -287,7 +287,7 @@ for token in doc:
|
||||||
| their | `ADJ` | `poss` | requests |
|
| their | `ADJ` | `poss` | requests |
|
||||||
| requests | `NOUN` | `dobj` | submit |
|
| requests | `NOUN` | `dobj` | submit |
|
||||||
|
|
||||||
<Infobox title="📖 Dependency label scheme">
|
<Infobox title="Dependency label scheme" emoji="📖">
|
||||||
|
|
||||||
For a list of the syntactic dependency labels assigned by spaCy's models across
|
For a list of the syntactic dependency labels assigned by spaCy's models across
|
||||||
different languages, see the label schemes documented in the
|
different languages, see the label schemes documented in the
|
||||||
|
@ -615,7 +615,7 @@ tokens containing periods intact (abbreviations like "U.S.").
|
||||||
|
|
||||||
![Language data architecture](../images/language_data.svg)
|
![Language data architecture](../images/language_data.svg)
|
||||||
|
|
||||||
<Infobox title="📖 Language data">
|
<Infobox title="Language data" emoji="📖">
|
||||||
|
|
||||||
For more details on the language-specific data, see the usage guide on
|
For more details on the language-specific data, see the usage guide on
|
||||||
[adding languages](/usage/adding-languages).
|
[adding languages](/usage/adding-languages).
|
||||||
|
|
|
@ -338,7 +338,7 @@ nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
|
||||||
doc = nlp("This is a sentence.")
|
doc = nlp("This is a sentence.")
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="Tip: Preview model info">
|
<Infobox title="Tip: Preview model info" emoji="💡">
|
||||||
|
|
||||||
You can use the [`info`](/api/cli#info) command or
|
You can use the [`info`](/api/cli#info) command or
|
||||||
[`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data
|
[`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data
|
||||||
|
|
|
@ -34,7 +34,7 @@ texts = ["This is a text", "These are lots of texts", "..."]
|
||||||
+ docs = list(nlp.pipe(texts))
|
+ docs = list(nlp.pipe(texts))
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="Tips for efficient processing">
|
<Infobox title="Tips for efficient processing" emoji="💡">
|
||||||
|
|
||||||
- Process the texts **as a stream** using [`nlp.pipe`](/api/language#pipe) and
|
- Process the texts **as a stream** using [`nlp.pipe`](/api/language#pipe) and
|
||||||
buffer them in batches, instead of one-by-one. This is usually much more
|
buffer them in batches, instead of one-by-one. This is usually much more
|
||||||
|
@ -912,7 +912,7 @@ new_heads = [head - i - 1 if head != 0 else 0 for i, head in enumerate(heads)]
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
<Infobox title="📖 Advanced usage, serialization and entry points">
|
<Infobox title="Advanced usage, serialization and entry points" emoji="📖">
|
||||||
|
|
||||||
For more details on how to write and package custom components, make them
|
For more details on how to write and package custom components, make them
|
||||||
available to spaCy via entry points and implement your own serialization
|
available to spaCy via entry points and implement your own serialization
|
||||||
|
|
|
@ -1,5 +1,614 @@
|
||||||
---
|
---
|
||||||
title: Projects
|
title: Projects
|
||||||
|
new: 3
|
||||||
|
menu:
|
||||||
|
- ['Intro & Workflow', 'intro']
|
||||||
|
- ['Directory & Assets', 'directory']
|
||||||
|
- ['Custom Projects', 'custom']
|
||||||
|
- ['Integrations', 'integrations']
|
||||||
---
|
---
|
||||||
|
|
||||||
TODO: write
|
> #### 🪐 Project templates
|
||||||
|
>
|
||||||
|
> Our [`projects`](https://github.com/explosion/projects) repo includes various
|
||||||
|
> project templates for different NLP tasks, models, workflows and integrations
|
||||||
|
> that you can clone and run. The easiest way to get started is to pick a
|
||||||
|
> template, clone it and start modifying it!
|
||||||
|
|
||||||
|
spaCy projects let you manage and share **end-to-end spaCy workflows** for
|
||||||
|
different **use cases and domains**, and orchestrate training, packaging and
|
||||||
|
serving your custom models. You can start off by cloning a pre-defined project
|
||||||
|
template, adjust it to fit your needs, load in your data, train a model, export
|
||||||
|
it as a Python package and share the project templates with your team. spaCy
|
||||||
|
projects can be used via the new [`spacy project`](/api/cli#project) command.
|
||||||
|
For an overview of the available project templates, check out the
|
||||||
|
[`projects`](https://github.com/explosion/projects) repo. spaCy projects also
|
||||||
|
[integrate](#integrations) with many other cool machine learning and data
|
||||||
|
science tools to track and manage your data and experiments, iterate on demos
|
||||||
|
and prototypes and ship your models into production.
|
||||||
|
|
||||||
|
<!-- TODO: mention integrations -->
|
||||||
|
|
||||||
|
## Introduction and workflow {#intro}
|
||||||
|
|
||||||
|
<!-- TODO: decide how to introduce concept -->
|
||||||
|
|
||||||
|
<Project id="some_example_project">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
spaCy projects make it easy to integrate with many other **awesome tools** in
|
||||||
|
the data science and machine learning ecosystem to track and manage your data
|
||||||
|
and experiments, iterate on demos and prototypes and ship your models into
|
||||||
|
production.
|
||||||
|
|
||||||
|
<Grid narrow cols={3}>
|
||||||
|
<Integration title="DVC" logo="dvc" url="#dvc">Manage and version your data</Integration>
|
||||||
|
<Integration title="Prodigy" logo="prodigy" url="#prodigy">Create labelled training data</Integration>
|
||||||
|
<Integration title="Streamlit" logo="streamlit" url="#streamlit">Visualize and demo your models</Integration>
|
||||||
|
<Integration title="FastAPI" logo="fastapi" url="#fastapi">Serve your models and host APIs</Integration>
|
||||||
|
<Integration title="Ray" logo="ray" url="#ray">Distributed and parallel training</Integration>
|
||||||
|
<Integration title="Weights & Biases" logo="wandb" url="#wandb">Track your experiments and results</Integration>
|
||||||
|
</Grid>
|
||||||
|
|
||||||
|
### 1. Clone a project template {#clone}
|
||||||
|
|
||||||
|
> #### Cloning under the hood
|
||||||
|
>
|
||||||
|
> To clone a project, spaCy calls into `git` and uses the "sparse checkout"
|
||||||
|
> feature to only clone the relevant directory or directories.
|
||||||
|
|
||||||
|
The [`spacy project clone`](/api/cli#project-clone) command clones an existing
|
||||||
|
project template and copies the files to a local directory. You can then run the
|
||||||
|
project, e.g. to train a model and edit the commands and scripts to build fully
|
||||||
|
custom workflows.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project clone some_example_project
|
||||||
|
```
|
||||||
|
|
||||||
|
By default, the project will be cloned into the current working directory. You
|
||||||
|
can specify an optional second argument to define the output directory. The
|
||||||
|
`--repo` option lets you define a custom repo to clone from, if you don't want
|
||||||
|
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
|
||||||
|
can also use any private repo you have access to with Git.
|
||||||
|
|
||||||
|
### 2. Fetch the project assets {#assets}
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> assets:
|
||||||
|
> - dest: 'assets/training.spacy'
|
||||||
|
> url: 'https://example.com/data.spacy'
|
||||||
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Assets are data files your project needs – for example, the training and
|
||||||
|
evaluation data or pretrained vectors and embeddings to initialize your model
|
||||||
|
with. Each project template comes with a `project.yml` that defines the assets
|
||||||
|
to download and where to put them. The
|
||||||
|
[`spacy project assets`](/api/cli#project-assets) command will fetch the project assets
|
||||||
|
for you:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd some_example_project
|
||||||
|
python -m spacy project assets
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Run a command {#run}
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> commands:
|
||||||
|
> - name: preprocess
|
||||||
|
> help: "Convert the input data to spaCy's format"
|
||||||
|
> script:
|
||||||
|
> - 'python -m spacy convert assets/train.conllu corpus/'
|
||||||
|
> - 'python -m spacy convert assets/eval.conllu corpus/'
|
||||||
|
> deps:
|
||||||
|
> - 'assets/train.conllu'
|
||||||
|
> - 'assets/eval.conllu'
|
||||||
|
> outputs:
|
||||||
|
> - 'corpus/train.spacy'
|
||||||
|
> - 'corpus/eval.spacy'
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Commands consist of one or more steps and can be run with
|
||||||
|
[`spacy project run`](/api/cli#project-run). The following will run the command
|
||||||
|
`preprocess` defined in the `project.yml`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project run preprocess
|
||||||
|
```
|
||||||
|
|
||||||
|
Commands can define their expected [dependencies and outputs](#deps-outputs)
|
||||||
|
using the `deps` (files the commands require) and `outputs` (files the commands
|
||||||
|
create) keys. This allows your project to track changes and determine whether a
|
||||||
|
command needs to be re-run. For instance, if your input data changes, you want
|
||||||
|
to re-run the `preprocess` command. But if nothing changed, this step can be
|
||||||
|
skipped. You can also set `--force` to force re-running a command, or `--dry` to
|
||||||
|
perform a "dry run" and see what would happen (without actually running the
|
||||||
|
script).
|
||||||
|
|
||||||
|
### 4. Run a workflow {#run-workfow}
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> workflows:
|
||||||
|
> all:
|
||||||
|
> - preprocess
|
||||||
|
> - train
|
||||||
|
> - package
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Workflows are series of commands that are run in order and often depend on each
|
||||||
|
other. For instance, to generate a packaged model, you might start by converting
|
||||||
|
your data, then run [`spacy train`](/api/cli#train) to train your model on the
|
||||||
|
converted data and if that's successful, run [`spacy package`](/api/cli#package)
|
||||||
|
to turn the best model artifact into an installable Python package. The
|
||||||
|
following command will run the workflow named `all` defined in the `project.yml`, and
|
||||||
|
execute the commands it specifies, in order:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project run all
|
||||||
|
```
|
||||||
|
|
||||||
|
Using the expected [dependencies and outputs](#deps-outputs) defined in the
|
||||||
|
commands, spaCy can determine whether to re-run a command (if its inputs or
|
||||||
|
outputs have changed) or whether to skip it. If you're looking to implement more
|
||||||
|
advanced data pipelines and track your changes in Git, check out the
|
||||||
|
[Data Version Control (DVC) integration](#dvc). The
|
||||||
|
[`spacy project dvc`](/api/cli#project-dvc) command generates a DVC config file
|
||||||
|
from a workflow defined in your `project.yml` so you can manage your spaCy
|
||||||
|
project as a DVC repo.
|
||||||
|
|
||||||
|
## Project directory and assets {#directory}
|
||||||
|
|
||||||
|
### project.yml {#project-yml}
|
||||||
|
|
||||||
|
The `project.yml` defines the assets a project depends on, like datasets and
|
||||||
|
pretrained weights, as well as a series of commands that can be run separately
|
||||||
|
or as a workflow – for instance, to preprocess the data, convert it to spaCy's
|
||||||
|
format, train a model, evaluate it and export metrics, package it and spin up a
|
||||||
|
quick web demo. It looks pretty similar to a config file used to define CI
|
||||||
|
pipelines.
|
||||||
|
|
||||||
|
<!-- TODO: update with better (final) example -->
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
| Section | Description |
|
||||||
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `variables` | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
|
||||||
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
|
||||||
|
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||||
|
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the files the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
||||||
|
|
||||||
|
### Dependencies and outputs {#deps-outputs}
|
||||||
|
|
||||||
|
Each command defined in the `project.yml` can optionally define a list of
|
||||||
|
dependencies and outputs. These are the files the command requires and creates.
|
||||||
|
For example, a command for training a model may depend on a
|
||||||
|
[`config.cfg`](/usage/training#config) and the training and evaluation data, and
|
||||||
|
it will export a directory `model-best`, containing the best model, which you
|
||||||
|
can then re-use in other commands.
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
commands:
|
||||||
|
- name: train
|
||||||
|
help: 'Train a spaCy model using the specified corpus and config'
|
||||||
|
script:
|
||||||
|
- 'python -m spacy train ./corpus/training.spacy ./corpus/evaluation.spacy ./configs/config.cfg -o training/'
|
||||||
|
deps:
|
||||||
|
- 'configs/config.cfg'
|
||||||
|
- 'corpus/training.spacy'
|
||||||
|
- 'corpus/evaluation.spacy'
|
||||||
|
outputs:
|
||||||
|
- 'training/model-best'
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Re-running vs. skipping
|
||||||
|
>
|
||||||
|
> Under the hood, spaCy uses a `project.lock` lockfile that stores the details
|
||||||
|
> for each command, as well as its dependencies and outputs and their checksums.
|
||||||
|
> It's updated on each run. If any of this information changes, the command will
|
||||||
|
> be re-run. Otherwise, it will be skipped.
|
||||||
|
|
||||||
|
If you're running a command and it depends on files that are missing, spaCy will
|
||||||
|
show you an error. If a command defines dependencies and outputs that haven't
|
||||||
|
changed since the last run, the command will be skipped. This means that you're
|
||||||
|
only re-running commands if they need to be re-run. Commands can also set
|
||||||
|
`no_skip: true` if they should never be skipped – for example commands that run
|
||||||
|
tests. Commands without outputs are also never skipped. To force re-running a
|
||||||
|
command or workflow, even if nothing changed, you can set the `--force` flag.
|
||||||
|
|
||||||
|
Note that [`spacy project`](/api/cli#project) doesn't compile any dependency
|
||||||
|
graphs based on the dependencies and outputs, and won't re-run previous steps
|
||||||
|
automatically. For instance, if you only run the command `train` that depends on
|
||||||
|
data created by `preprocess` and those files are missing, spaCy will show an
|
||||||
|
error – it won't just re-run `preprocess`. If you're looking for more advanced
|
||||||
|
data management, check out the [Data Version Control (DVC)](#dvc)
|
||||||
|
integration. If you're planning on integrating your spaCy project with DVC, you
|
||||||
|
can also use `outputs_no_cache` instead of `outputs` to define outputs that
|
||||||
|
won't be cached or tracked.
|
||||||
|
|
||||||
|
### Files and directory structure {#project-files}
|
||||||
|
|
||||||
|
The `project.yml` can define a list of `directories` that should be created
|
||||||
|
within a project – for instance, `assets`, `training`, `corpus` and so on. spaCy
|
||||||
|
will make sure that these directories are always available, so your commands can
|
||||||
|
write to and read from them. Project directories will also include all files and
|
||||||
|
directories copied from the project template with
|
||||||
|
[`spacy project clone`](/api/cli#project-clone). Here's an example of a project
|
||||||
|
directory:
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> <!-- prettier-ignore -->
|
||||||
|
> ```yaml
|
||||||
|
> directories: ['assets', 'configs', 'corpus', 'metas', 'metrics', 'notebooks', 'packages', 'scripts', 'training']
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
### Example project directory
|
||||||
|
├── project.yml # the project settings
|
||||||
|
├── project.lock # lockfile that tracks inputs/outputs
|
||||||
|
├── assets/ # downloaded data assets
|
||||||
|
├── configs/ # model config.cfg files used for training
|
||||||
|
├── corpus/ # output directory for training corpus
|
||||||
|
├── metas/ # model meta.json templates used for packaging
|
||||||
|
├── metrics/ # output directory for evaluation metrics
|
||||||
|
├── notebooks/ # directory for Jupyter notebooks
|
||||||
|
├── packages/ # output directory for model Python packages
|
||||||
|
├── scripts/ # directory for scripts, e.g. referenced in commands
|
||||||
|
├── training/ # output directory for trained models
|
||||||
|
└── ... # any other files, like a requirements.txt etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
If you don't want a project to create a directory, you can delete it and remove
|
||||||
|
its entry from the `project.yml` – just make sure it's not required by any of
|
||||||
|
the commands. [Custom templates](#custom) can use any directories they need –
|
||||||
|
the only file that's required for a project is the `project.yml`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Custom scripts and projects {#custom}
|
||||||
|
|
||||||
|
The `project.yml` lets you define any custom commands and run them as part of
|
||||||
|
your training, evaluation or deployment workflows. The `script` section defines
|
||||||
|
a list of commands that are called in a subprocess, in order. This lets you
|
||||||
|
execute other Python scripts or command-line tools. Let's say you've written a
|
||||||
|
few integration tests that load the best model produced by the training command
|
||||||
|
and check that it works correctly. You can now define a `test` command that
|
||||||
|
calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and
|
||||||
|
uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test
|
||||||
|
report:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
commands:
|
||||||
|
- name: test
|
||||||
|
help: 'Test the trained model'
|
||||||
|
script:
|
||||||
|
- 'pip install pytest pytest-html'
|
||||||
|
- 'python -m pytest ./scripts/tests --html=metrics/test-report.html'
|
||||||
|
deps:
|
||||||
|
- 'training/model-best'
|
||||||
|
outputs:
|
||||||
|
- 'metrics/test-report.html'
|
||||||
|
no_skip: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Adding `training/model-best` to the command's `deps` lets you ensure that the
|
||||||
|
file is available. If not, spaCy will show an error and the command won't run.
|
||||||
|
Setting `no_skip: true` means that the command will always run, even if the
|
||||||
|
dependency (the trained model) hasn't changed. This makes sense here, because
|
||||||
|
you typically don't want to skip your tests.
|
||||||
|
|
||||||
|
### Writing custom scripts {#custom-scripts}
|
||||||
|
|
||||||
|
Your project commands can include any custom scripts – essentially, anything you
|
||||||
|
can run from the command line. Here's an example of a custom script that uses
|
||||||
|
[`typer`](https://typer.tiangolo.com/) for quick and easy command-line arguments
|
||||||
|
that you can define via your `project.yml`:
|
||||||
|
|
||||||
|
> #### About Typer
|
||||||
|
>
|
||||||
|
> [`typer`](https://typer.tiangolo.com/) is a modern library for building Python
|
||||||
|
> CLIs using type hints. It's a dependency of spaCy, so it will already be
|
||||||
|
> pre-installed in your environment. Function arguments automatically become
|
||||||
|
> positional CLI arguments and using Python type hints, you can define the value
|
||||||
|
> types. For instance, `batch_size: int` means that the value provided via the
|
||||||
|
> command line is converted to an integer.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### scripts/custom_evaluation.py
|
||||||
|
import typer
|
||||||
|
|
||||||
|
def custom_evaluation(batch_size: int, model_path: str, data_path: str):
|
||||||
|
# The arguments are now available as positional CLI arguments
|
||||||
|
print(batch_size, model_path, data_path)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
typer.run(custom_evaluation)
|
||||||
|
```
|
||||||
|
|
||||||
|
In your `project.yml`, you can then run the script by calling
|
||||||
|
`python scripts/custom_evaluation.py` with the function arguments. You can also
|
||||||
|
use the `variables` section to define reusable variables that will be
|
||||||
|
substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is
|
||||||
|
defined as a variable and will be added in place of `{BATCH_SIZE}` in the script.
|
||||||
|
|
||||||
|
> #### Calling into Python
|
||||||
|
>
|
||||||
|
> If any of your command scripts call into `python`, spaCy will take care of
|
||||||
|
> replacing that with your `sys.executable`, to make sure you're executing
|
||||||
|
> everything with the same Python (not some other Python installed on your
|
||||||
|
> system). It also normalizes references to `python3`, `pip3` and `pip`.
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
variables:
|
||||||
|
BATCH_SIZE: 128
|
||||||
|
|
||||||
|
commands:
|
||||||
|
- name: evaluate
|
||||||
|
script:
|
||||||
|
- 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
|
||||||
|
deps:
|
||||||
|
- 'training/model-best'
|
||||||
|
- 'corpus/eval.json'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cloning from your own repo {#custom-repo}
|
||||||
|
|
||||||
|
The [`spacy project clone`](/api/cli#project-clone) command lets you customize
|
||||||
|
the repo to clone from using the `--repo` option. It calls into `git`, so you'll
|
||||||
|
be able to clone from any repo that you have access to, including private repos.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project clone your_project --repo https://github.com/you/repo
|
||||||
|
```
|
||||||
|
|
||||||
|
At a minimum, a valid project template needs to contain a
|
||||||
|
[`project.yml`](#project-yml). It can also include
|
||||||
|
[other files](/usage/projects#project-files), like custom scripts, a
|
||||||
|
`requirements.txt` listing additional dependencies,
|
||||||
|
[training configs](/usage/training#config) and model meta templates, or Jupyter
|
||||||
|
notebooks with usage examples.
|
||||||
|
|
||||||
|
<Infobox title="Important note about assets" variant="warning">
|
||||||
|
|
||||||
|
It's typically not a good idea to check large data assets, trained models or
|
||||||
|
other artifacts into a Git repo and you should exclude them from your project
|
||||||
|
template by adding a `.gitignore`. If you want to version your data and models,
|
||||||
|
check out [Data Version Control](#dvc) (DVC), which integrates with spaCy
|
||||||
|
projects.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Working with private assets {#private-assets}
|
||||||
|
|
||||||
|
For many projects, the datasets and weights you're working with might be
|
||||||
|
company-internal and not available via a public URL. In that case, you can
|
||||||
|
specify the destination paths and a checksum, and leave out the URL. When your
|
||||||
|
teammates clone and run your project, they can place the files in the respective
|
||||||
|
directory themselves. The [`spacy project assets`](/api/cli#project-assets)
|
||||||
|
command will alert about missing files and mismatched checksums, so you can
|
||||||
|
ensure that others are running your project with the same data.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
assets:
|
||||||
|
- dest: 'assets/private_training_data.json'
|
||||||
|
checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
- dest: 'assets/private_vectors.bin'
|
||||||
|
checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integrations {#integrations}
|
||||||
|
|
||||||
|
### Data Version Control (DVC) {#dvc} <IntegrationLogo name="dvc" title="DVC" width={70} height="auto" align="right" />
|
||||||
|
|
||||||
|
Data assets like training corpora or pretrained weights are at the core of any
|
||||||
|
NLP project, but they're often difficult to manage: you can't just check them
|
||||||
|
into your Git repo to version and keep track of them. And if you have multiple
|
||||||
|
steps that depend on each other, like a preprocessing step that generates your
|
||||||
|
training data, you need to make sure the data is always up-to-date, and re-run
|
||||||
|
all steps of your process every time, just to be safe.
|
||||||
|
|
||||||
|
[Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool
|
||||||
|
that integrates into your workflow like Git, builds a dependency graph for your
|
||||||
|
data pipelines and tracks and caches your data files. If you're downloading data
|
||||||
|
from an external source, like a storage bucket, DVC can tell whether the
|
||||||
|
resource has changed. It can also determine whether to re-run a step, depending
|
||||||
|
on whether its inputs have changed or not. All metadata can be checked into a Git
|
||||||
|
repo, so you'll always be able to reproduce your experiments.
|
||||||
|
|
||||||
|
To set up DVC, install the package and initialize your spaCy project as a Git
|
||||||
|
and DVC repo. You can also
|
||||||
|
[customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip)
|
||||||
|
to include support for remote storage like Google Cloud Storage, S3, Azure, SSH
|
||||||
|
and more.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install dvc # Install DVC
|
||||||
|
git init # Initialize a Git repo
|
||||||
|
dvc init # Initialize a DVC project
|
||||||
|
```
|
||||||
|
|
||||||
|
The [`spacy project dvc`](/api/cli#project-dvc) command creates a `dvc.yaml`
|
||||||
|
config file based on a workflow defined in your `project.yml`. Whenever you
|
||||||
|
update your project, you can re-run the command to update your DVC config. You
|
||||||
|
can then manage your spaCy project like any other DVC project, run
|
||||||
|
[`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets
|
||||||
|
and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the
|
||||||
|
workflow or individual commands.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy project dvc [workflow name]
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Important note for multiple workflows" variant="warning">
|
||||||
|
|
||||||
|
DVC currently expects a single workflow per project, so when creating the config
|
||||||
|
with [`spacy project dvc`](/api/cli#project-dvc), you need to specify the name
|
||||||
|
of a workflow defined in your `project.yml`. You can still use multiple
|
||||||
|
workflows, but only one can be tracked by DVC.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<Project id="integrations/dvc">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Prodigy {#prodigy} <IntegrationLogo name="prodigy" width={100} height="auto" align="right" />
|
||||||
|
|
||||||
|
[Prodigy](https://prodi.gy) is a modern annotation tool for creating training
|
||||||
|
data for machine learning models, developed by us. It integrates with spaCy
|
||||||
|
out-of-the-box and provides many different
|
||||||
|
[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
|
||||||
|
with and without a model in the loop. If Prodigy is installed in your project,
|
||||||
|
you can call its recipes from the commands defined in your `project.yml`.
|
||||||
|
|
||||||
|
The following example command starts the Prodigy app using the
|
||||||
|
[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
|
||||||
|
suggestions for the given entity labels produced by a pretrained model. You can
|
||||||
|
then correct the suggestions manually in the UI. After you save and exit the
|
||||||
|
server, the full dataset is exported in spaCy's format and split into a training
|
||||||
|
and evaluation set.
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
variables:
|
||||||
|
PRODIGY_DATASET: 'ner_articles'
|
||||||
|
PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
|
||||||
|
PRODIGY_MODEL: 'en_core_web_md'
|
||||||
|
|
||||||
|
commands:
|
||||||
|
- name: annotate
|
||||||
|
script:
|
||||||
|
- 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
|
||||||
|
- 'python -m prodigy data-to-spacy ./corpus/train.spacy ./corpus/eval.spacy --ner {PRODIGY_DATASET}'
|
||||||
|
deps:
|
||||||
|
- 'assets/raw_data.jsonl'
|
||||||
|
outputs:
|
||||||
|
- 'corpus/train.spacy'
|
||||||
|
- 'corpus/eval.spacy'
|
||||||
|
```
|
||||||
|
|
||||||
|
<Project id="integrations/prodigy">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Streamlit {#streamlit} <IntegrationLogo name="streamlit" width={150} height="auto" align="right" />
|
||||||
|
|
||||||
|
<Grid cols={2} gutterBottom={false}>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
|
||||||
|
[Streamlit](https://streamlit.io) is a Python framework for building interactive
|
||||||
|
data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)
|
||||||
|
package helps you integrate spaCy visualizations into your Streamlit apps and
|
||||||
|
quickly spin up demos to explore your models interactively. It includes a full
|
||||||
|
embedded visualizer, as well as individual components.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ pip install spacy_streamlit
|
||||||
|
```
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
![](../images/spacy-streamlit.png)
|
||||||
|
|
||||||
|
</Grid>
|
||||||
|
|
||||||
|
Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your
|
||||||
|
projects can easily define their own scripts that spin up an interactive
|
||||||
|
visualizer, using the latest model you trained, or a selection of models so you
|
||||||
|
can compare their results. The following script starts an
|
||||||
|
[NER visualizer](/usage/visualizers#ent) and takes two positional command-line
|
||||||
|
arguments you can pass in from your `project.yml`: a comma-separated list of model
|
||||||
|
paths and an example text to use as the default text.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### scripts/visualize.py
|
||||||
|
import spacy_streamlit
|
||||||
|
import sys
|
||||||
|
|
||||||
|
DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else ""
|
||||||
|
MODELS = [name.strip() for name in sys.argv[1].split(",")]
|
||||||
|
spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
|
||||||
|
```
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
commands:
|
||||||
|
- name: visualize
|
||||||
|
help: "Visualize the model's output interactively using Streamlit"
|
||||||
|
script:
|
||||||
|
- 'streamlit run ./scripts/visualize.py ./training/model-best "I like Adidas shoes."'
|
||||||
|
deps:
|
||||||
|
- 'training/model-best'
|
||||||
|
```
|
||||||
|
|
||||||
|
<Project id="integrations/streamlit">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" />
|
||||||
|
|
||||||
|
<!-- TODO: come up with example – there's not much integration needed, but it'd be nice to show an example that addresses some of the main concerns for serving ML (workers etc.) -->
|
||||||
|
|
||||||
|
<Project id="integrations/fastapi">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Ray {#ray} <IntegrationLogo name="ray" width={100} height="auto" align="right" />
|
||||||
|
|
||||||
|
<!-- TODO: document -->
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
|
||||||
|
|
||||||
|
<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? -->
|
||||||
|
|
|
@ -552,7 +552,7 @@ component with different patterns, depending on your application:
|
||||||
html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json")
|
html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json")
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="📖 Processing pipelines">
|
<Infobox title="Processing pipelines" emoji="📖">
|
||||||
|
|
||||||
For more details and examples of how to **create custom pipeline components**
|
For more details and examples of how to **create custom pipeline components**
|
||||||
and **extension attributes**, see the
|
and **extension attributes**, see the
|
||||||
|
|
|
@ -198,7 +198,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
|
||||||
|
|
||||||
<Tokenization101 />
|
<Tokenization101 />
|
||||||
|
|
||||||
<Infobox title="📖 Tokenization rules">
|
<Infobox title="Tokenization rules" emoji="📖">
|
||||||
|
|
||||||
To learn more about how spaCy's tokenization rules work in detail, how to
|
To learn more about how spaCy's tokenization rules work in detail, how to
|
||||||
**customize and replace** the default tokenizer and how to **add
|
**customize and replace** the default tokenizer and how to **add
|
||||||
|
@ -214,7 +214,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
|
||||||
|
|
||||||
<PosDeps101 />
|
<PosDeps101 />
|
||||||
|
|
||||||
<Infobox title="📖 Part-of-speech tagging and morphology">
|
<Infobox title="Part-of-speech tagging and morphology" emoji="📖">
|
||||||
|
|
||||||
To learn more about **part-of-speech tagging** and rule-based morphology, and
|
To learn more about **part-of-speech tagging** and rule-based morphology, and
|
||||||
how to **navigate and use the parse tree** effectively, see the usage guides on
|
how to **navigate and use the parse tree** effectively, see the usage guides on
|
||||||
|
@ -229,7 +229,7 @@ import NER101 from 'usage/101/\_named-entities.md'
|
||||||
|
|
||||||
<NER101 />
|
<NER101 />
|
||||||
|
|
||||||
<Infobox title="📖 Named Entity Recognition">
|
<Infobox title="Named Entity Recognition" emoji="📖">
|
||||||
|
|
||||||
To learn more about entity recognition in spaCy, how to **add your own
|
To learn more about entity recognition in spaCy, how to **add your own
|
||||||
entities** to a document and how to **train and update** the entity predictions
|
entities** to a document and how to **train and update** the entity predictions
|
||||||
|
@ -245,7 +245,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md'
|
||||||
|
|
||||||
<Vectors101 />
|
<Vectors101 />
|
||||||
|
|
||||||
<Infobox title="📖 Word vectors">
|
<Infobox title="Word vectors" emoji="📖">
|
||||||
|
|
||||||
To learn more about word vectors, how to **customize them** and how to load
|
To learn more about word vectors, how to **customize them** and how to load
|
||||||
**your own vectors** into spaCy, see the usage guide on
|
**your own vectors** into spaCy, see the usage guide on
|
||||||
|
@ -259,7 +259,7 @@ import Pipelines101 from 'usage/101/\_pipelines.md'
|
||||||
|
|
||||||
<Pipelines101 />
|
<Pipelines101 />
|
||||||
|
|
||||||
<Infobox title="📖 Processing pipelines">
|
<Infobox title="Processing pipelines" emoji="📖">
|
||||||
|
|
||||||
To learn more about **how processing pipelines work** in detail, how to enable
|
To learn more about **how processing pipelines work** in detail, how to enable
|
||||||
and disable their components, and how to **create your own**, see the usage
|
and disable their components, and how to **create your own**, see the usage
|
||||||
|
@ -458,7 +458,7 @@ import Serialization101 from 'usage/101/\_serialization.md'
|
||||||
|
|
||||||
<Serialization101 />
|
<Serialization101 />
|
||||||
|
|
||||||
<Infobox title="📖 Saving and loading">
|
<Infobox title="Saving and loading" emoji="📖">
|
||||||
|
|
||||||
To learn more about how to **save and load your own models**, see the usage
|
To learn more about how to **save and load your own models**, see the usage
|
||||||
guide on [saving and loading](/usage/saving-loading#models).
|
guide on [saving and loading](/usage/saving-loading#models).
|
||||||
|
@ -471,7 +471,7 @@ import Training101 from 'usage/101/\_training.md'
|
||||||
|
|
||||||
<Training101 />
|
<Training101 />
|
||||||
|
|
||||||
<Infobox title="📖 Training statistical models">
|
<Infobox title="Training statistical models" emoji="📖">
|
||||||
|
|
||||||
To learn more about **training and updating** models, how to create training
|
To learn more about **training and updating** models, how to create training
|
||||||
data and how to improve spaCy's named entity recognition models, see the usage
|
data and how to improve spaCy's named entity recognition models, see the usage
|
||||||
|
@ -485,14 +485,6 @@ import LanguageData101 from 'usage/101/\_language-data.md'
|
||||||
|
|
||||||
<LanguageData101 />
|
<LanguageData101 />
|
||||||
|
|
||||||
<Infobox title="📖 Language data">
|
|
||||||
|
|
||||||
To learn more about the individual components of the language data and how to
|
|
||||||
**add a new language** to spaCy in preparation for training a language model,
|
|
||||||
see the usage guide on [adding languages](/usage/adding-languages).
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
## Lightning tour {#lightning-tour}
|
## Lightning tour {#lightning-tour}
|
||||||
|
|
||||||
The following examples and code snippets give you an overview of spaCy's
|
The following examples and code snippets give you an overview of spaCy's
|
||||||
|
@ -641,8 +633,9 @@ for ent in doc.ents:
|
||||||
### Train and update neural network models {#lightning-tour-training"}
|
### Train and update neural network models {#lightning-tour-training"}
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import spacy
|
|
||||||
import random
|
import random
|
||||||
|
import spacy
|
||||||
|
from spacy.gold import Example
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
|
train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
|
||||||
|
@ -652,7 +645,9 @@ with nlp.select_pipes(enable="ner"):
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
for text, annotations in train_data:
|
for text, annotations in train_data:
|
||||||
nlp.update([text], [annotations], sgd=optimizer)
|
doc = nlp.make_doc(text)
|
||||||
|
example = Example.from_dict(doc, annotations)
|
||||||
|
nlp.update([example], sgd=optimizer)
|
||||||
nlp.to_disk("/model")
|
nlp.to_disk("/model")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@ next: /usage/projects
|
||||||
menu:
|
menu:
|
||||||
- ['Introduction', 'basics']
|
- ['Introduction', 'basics']
|
||||||
- ['CLI & Config', 'cli-config']
|
- ['CLI & Config', 'cli-config']
|
||||||
- ['Custom Models', 'custom-models']
|
|
||||||
- ['Transfer Learning', 'transfer-learning']
|
- ['Transfer Learning', 'transfer-learning']
|
||||||
|
- ['Custom Models', 'custom-models']
|
||||||
- ['Parallel Training', 'parallel-training']
|
- ['Parallel Training', 'parallel-training']
|
||||||
- ['Internal API', 'api']
|
- ['Internal API', 'api']
|
||||||
---
|
---
|
||||||
|
@ -103,26 +103,38 @@ still look good.
|
||||||
|
|
||||||
> #### Migration from spaCy v2.x
|
> #### Migration from spaCy v2.x
|
||||||
>
|
>
|
||||||
> TODO: ...
|
> TODO: once we have an answer for how to update the training command
|
||||||
|
> (`spacy migrate`?), add details here
|
||||||
|
|
||||||
Training config files include all **settings and hyperparameters** for training
|
Training config files include all **settings and hyperparameters** for training
|
||||||
your model. Instead of providing lots of arguments on the command line, you only
|
your model. Instead of providing lots of arguments on the command line, you only
|
||||||
need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
|
need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under
|
||||||
|
the hood, the training config uses the
|
||||||
|
[configuration system](https://thinc.ai/docs/usage-config) provided by our
|
||||||
|
machine learning library [Thinc](https://thinc.ai). This also makes it easy to
|
||||||
|
integrate custom models and architectures, written in your framework of choice.
|
||||||
|
Some of the main advantages and features of spaCy's training config are:
|
||||||
|
|
||||||
To read more about how the config system works under the hood, check out the
|
- **Structured sections.** The config is grouped into sections, and nested
|
||||||
[Thinc documentation](https://thinc.ai/docs/usage-config).
|
sections are defined using the `.` notation. For example, `[nlp.pipeline.ner]`
|
||||||
|
defines the settings for the pipeline's named entity recognizer. The config
|
||||||
- **Structured sections.**
|
can be loaded as a Python dict.
|
||||||
- **References to registered functions.** Sections can refer to registered
|
- **References to registered functions.** Sections can refer to registered
|
||||||
functions like [model architectures](/api/architectures),
|
functions like [model architectures](/api/architectures),
|
||||||
[optimizers](https://thinc.ai/docs/api-optimizers) or
|
[optimizers](https://thinc.ai/docs/api-optimizers) or
|
||||||
[schedules](https://thinc.ai/docs/api-schedules) and define arguments that are
|
[schedules](https://thinc.ai/docs/api-schedules) and define arguments that are
|
||||||
passed into them. You can also register your own functions to define
|
passed into them. You can also register your own functions to define
|
||||||
[custom architectures](#custom-models), reference them in your config,
|
[custom architectures](#custom-models), reference them in your config and
|
||||||
|
tweak their parameters.
|
||||||
- **Interpolation.** If you have hyperparameters used by multiple components,
|
- **Interpolation.** If you have hyperparameters used by multiple components,
|
||||||
define them once and reference them as variables.
|
define them once and reference them as variables.
|
||||||
|
- **Reproducibility with no hidden defaults.** The config file is the "single
|
||||||
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
|
source of truth" and includes all settings. <!-- TODO: explain this better -->
|
||||||
|
- **Automated checks and validation.** When you load a config, spaCy checks if
|
||||||
|
the settings are complete and if all values have the correct types. This lets
|
||||||
|
you catch potential mistakes early. In your custom architectures, you can use
|
||||||
|
Python [type hints](https://docs.python.org/3/library/typing.html) to tell the
|
||||||
|
config which types of data to expect.
|
||||||
|
|
||||||
<!-- TODO: instead of hard-coding a full config here, we probably want to embed it from GitHub, e.g. from one of the project templates. This also makes it easier to keep it up to date, and the embed widgets take up less space-->
|
<!-- TODO: instead of hard-coding a full config here, we probably want to embed it from GitHub, e.g. from one of the project templates. This also makes it easier to keep it up to date, and the embed widgets take up less space-->
|
||||||
|
|
||||||
|
@ -181,26 +193,60 @@ pretrained_vectors = null
|
||||||
dropout = null
|
dropout = null
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<!-- TODO: explain settings and @ notation, refer to function registry docs -->
|
||||||
|
|
||||||
|
<Infobox title="Config format and settings" emoji="📖">
|
||||||
|
|
||||||
|
For a full overview of spaCy's config format and settings, see the
|
||||||
|
[training format documentation](/api/data-formats#config). The settings
|
||||||
|
available for the different architectures are documented with the
|
||||||
|
[model architectures API](/api/architectures). See the Thinc documentation for
|
||||||
|
[optimizers](https://thinc.ai/docs/api-optimizers) and
|
||||||
|
[schedules](https://thinc.ai/docs/api-schedules).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
#### Using registered functions {#config-functions}
|
||||||
|
|
||||||
|
The training configuration defined in the config file doesn't have to only
|
||||||
|
consist of static values. Some settings can also be **functions**. For instance,
|
||||||
|
the `batch_size` can be a number that doesn't change, or a schedule, like a
|
||||||
|
sequence of compounding values, which has been shown to be an effective trick (see
|
||||||
|
[Smith et al., 2017](https://arxiv.org/abs/1711.00489)).
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### With static value
|
||||||
|
[training]
|
||||||
|
batch_size = 128
|
||||||
|
```
|
||||||
|
|
||||||
|
To refer to a function instead, you can make `[training.batch_size]` its own
|
||||||
|
section and use the `@` syntax to specify the function and its arguments – in this
|
||||||
|
case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding) defined
|
||||||
|
in the [function registry](/api/top-level#registry). All other values defined in
|
||||||
|
the block are passed to the function as keyword arguments when it's initialized.
|
||||||
|
You can also use this mechanism to register
|
||||||
|
[custom implementations and architectures](#custom-models) and reference them
|
||||||
|
from your configs.
|
||||||
|
|
||||||
|
> #### TODO
|
||||||
|
>
|
||||||
|
> TODO: something about how the tree is built bottom-up?
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### With registered function
|
||||||
|
[training.batch_size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
```
|
||||||
|
|
||||||
### Model architectures {#model-architectures}
|
### Model architectures {#model-architectures}
|
||||||
|
|
||||||
<!-- TODO: refer to architectures API: /api/architectures. This should document the architectures in spacy/ml/models -->
|
<!-- TODO: refer to architectures API: /api/architectures. This should document the architectures in spacy/ml/models -->
|
||||||
|
|
||||||
## Custom model implementations and architectures {#custom-models}
|
<!-- TODO: how do we document the default configs? -->
|
||||||
|
|
||||||
<!-- TODO: document some basic examples for custom models, refer to Thinc, refer to example config/project -->
|
|
||||||
|
|
||||||
<Project id="some_example_project">
|
|
||||||
|
|
||||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
|
||||||
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
|
||||||
mattis pretium.
|
|
||||||
|
|
||||||
</Project>
|
|
||||||
|
|
||||||
### Training with custom code
|
|
||||||
|
|
||||||
<!-- TODO: document usage of spacy train with --code -->
|
|
||||||
<!-- TODO: link to type annotations and maybe show example: https://thinc.ai/docs/usage-config#advanced-types -->
|
|
||||||
|
|
||||||
## Transfer learning {#transfer-learning}
|
## Transfer learning {#transfer-learning}
|
||||||
|
|
||||||
|
@ -220,6 +266,101 @@ visualize your model.
|
||||||
|
|
||||||
<!-- TODO: document spacy pretrain -->
|
<!-- TODO: document spacy pretrain -->
|
||||||
|
|
||||||
|
## Custom model implementations and architectures {#custom-models}
|
||||||
|
|
||||||
|
<!-- TODO: intro, should summarise what spaCy v3 can do and that you can now use fully custom implementations, models defined in PyTorch and TF, etc. etc. -->
|
||||||
|
|
||||||
|
### Training with custom code {#custom-code}
|
||||||
|
|
||||||
|
The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
||||||
|
`--code` that points to a Python file. The file is imported before training and
|
||||||
|
allows you to add custom functions and architectures to the function registry
|
||||||
|
that can then be referenced from your `config.cfg`. This lets you train spaCy
|
||||||
|
models with custom components, without having to re-implement the whole training
|
||||||
|
workflow.
|
||||||
|
|
||||||
|
For example, let's say you've implemented your own batch size schedule to use
|
||||||
|
during training. The `@spacy.registry.schedules` decorator lets you register
|
||||||
|
that function in the `schedules` [registry](/api/top-level#registry) and assign
|
||||||
|
it a string name:
|
||||||
|
|
||||||
|
> #### Why the version in the name?
|
||||||
|
>
|
||||||
|
> A big benefit of the config system is that it makes your experiments
|
||||||
|
> reproducible. We recommend versioning the functions you register, especially
|
||||||
|
> if you expect them to change (like a new model architecture). This way, you
|
||||||
|
> know that a config referencing `v1` means a different function than a config
|
||||||
|
> referencing `v2`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### functions.py
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
@spacy.registry.schedules("my_custom_schedule.v1")
|
||||||
|
def my_custom_schedule(start: int = 1, factor: float = 1.001):
|
||||||
|
while True:
|
||||||
|
yield start
|
||||||
|
start = start * factor
|
||||||
|
```
|
||||||
|
|
||||||
|
In your config, you can now reference the schedule in the
|
||||||
|
`[training.batch_size]` block via `@schedules`. If a block contains a key
|
||||||
|
starting with an `@`, it's interpreted as a reference to a function. All other
|
||||||
|
settings in the block will be passed to the function as keyword arguments. Keep
|
||||||
|
in mind that the config shouldn't have any hidden defaults and all arguments on
|
||||||
|
the functions need to be represented in the config.
|
||||||
|
|
||||||
|
<!-- TODO: this needs to be updated once we've decided on a workflow for "fill config" -->
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[training.batch_size]
|
||||||
|
@schedules = "my_custom_schedule.v1"
|
||||||
|
start = 2
|
||||||
|
factor = 1.005
|
||||||
|
```
|
||||||
|
|
||||||
|
You can now run [`spacy train`](/api/cli#train) with the `config.cfg` and your
|
||||||
|
custom `functions.py` as the argument `--code`. Before loading the config, spaCy
|
||||||
|
will import the `functions.py` module and your custom functions will be
|
||||||
|
registered.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
### Training with custom code {wrap="true"}
|
||||||
|
python -m spacy train train.spacy dev.spacy config.cfg --output ./output --code ./functions.py
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Tip: Use Python type hints" emoji="💡">
|
||||||
|
|
||||||
|
spaCy's configs are powered by our machine learning library Thinc's
|
||||||
|
[configuration system](https://thinc.ai/docs/usage-config), which supports
|
||||||
|
[type hints](https://docs.python.org/3/library/typing.html) and even
|
||||||
|
[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
|
||||||
|
using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
|
||||||
|
function provides type hints, the values that are passed in will be checked against the expected types. For example, `start: int` in the example above will ensure
|
||||||
|
that the value received as the argument `start` is an integer. If the value
|
||||||
|
can't be cast to an integer, spaCy will raise an error.
|
||||||
|
`start: pydantic.StrictInt` will force the value to be an integer and raise an
|
||||||
|
error if it's not – for instance, if your config defines a float.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Defining custom architectures {#custom-architectures}
|
||||||
|
|
||||||
|
<!-- TODO: this could maybe be a more general example of using Thinc to compose some layers? We don't want to go too deep here and probably want to focus on a simple architecture example to show how it works -->
|
||||||
|
|
||||||
|
### Wrapping PyTorch and TensorFlow {#custom-frameworks}
|
||||||
|
|
||||||
|
<!-- TODO: -->
|
||||||
|
|
||||||
|
<Project id="example_pytorch_model">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
|
||||||
|
mattis pretium.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
## Parallel Training with Ray {#parallel-training}
|
## Parallel Training with Ray {#parallel-training}
|
||||||
|
|
||||||
<!-- TODO: document Ray integration -->
|
<!-- TODO: document Ray integration -->
|
||||||
|
@ -234,45 +375,93 @@ mattis pretium.
|
||||||
|
|
||||||
## Internal training API {#api}
|
## Internal training API {#api}
|
||||||
|
|
||||||
<!-- TODO: rewrite for new nlp.update / example logic -->
|
<Infobox variant="warning">
|
||||||
|
|
||||||
The [`GoldParse`](/api/goldparse) object collects the annotated training
|
spaCy gives you full control over the training loop. However, for most use
|
||||||
examples, also called the **gold standard**. It's initialized with the
|
cases, it's recommended to train your models via the
|
||||||
[`Doc`](/api/doc) object it refers to, and keyword arguments specifying the
|
[`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep
|
||||||
annotations, like `tags` or `entities`. Its job is to encode the annotations,
|
track of your settings and hyperparameters, instead of writing your own training
|
||||||
keep them aligned and create the C-level data structures required for efficient
|
scripts from scratch.
|
||||||
access. Here's an example of a simple `GoldParse` for part-of-speech tags:
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<!-- TODO: maybe add something about why the Example class is great and its benefits, and how it's passed around, holds the alignment etc -->
|
||||||
|
|
||||||
|
The [`Example`](/api/example) object contains annotated training data, also
|
||||||
|
called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
|
||||||
|
that will hold the predictions, and another `Doc` object that holds the
|
||||||
|
gold-standard annotations. Here's an example of a simple `Example` for
|
||||||
|
part-of-speech tags:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
vocab = Vocab(tag_map={"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}})
|
words = ["I", "like", "stuff"]
|
||||||
doc = Doc(vocab, words=["I", "like", "stuff"])
|
predicted = Doc(vocab, words=words)
|
||||||
gold = GoldParse(doc, tags=["N", "V", "N"])
|
# create the reference Doc with gold-standard TAG annotations
|
||||||
|
tags = ["NOUN", "VERB", "NOUN"]
|
||||||
|
tag_ids = [vocab.strings.add(tag) for tag in tags]
|
||||||
|
reference = Doc(vocab, words=words).from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
|
||||||
|
example = Example(predicted, reference)
|
||||||
```
|
```
|
||||||
|
|
||||||
Using the `Doc` and its gold-standard annotations, the model can be updated to
|
Alternatively, the `reference` `Doc` with the gold-standard annotations can be
|
||||||
learn a sentence of three words with their assigned part-of-speech tags. The
|
created from a dictionary with keyword arguments specifying the annotations,
|
||||||
[tag map](/usage/adding-languages#tag-map) is part of the vocabulary and defines
|
like `tags` or `entities`. Using the `Example` object and its gold-standard
|
||||||
the annotation scheme. If you're training a new language model, this will let
|
annotations, the model can be updated to learn a sentence of three words with
|
||||||
you map the tags present in the treebank you train on to spaCy's tag scheme.
|
their assigned part-of-speech tags.
|
||||||
|
|
||||||
|
> #### About the tag map
|
||||||
|
>
|
||||||
|
> The tag map is part of the vocabulary and defines the annotation scheme. If
|
||||||
|
> you're training a new language model, this will let you map the tags present
|
||||||
|
> in the treebank you train on to spaCy's tag scheme:
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
|
||||||
|
> vocab = Vocab(tag_map=tag_map)
|
||||||
|
> ```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
doc = Doc(Vocab(), words=["Facebook", "released", "React", "in", "2014"])
|
words = ["I", "like", "stuff"]
|
||||||
gold = GoldParse(doc, entities=["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"])
|
tags = ["NOUN", "VERB", "NOUN"]
|
||||||
|
predicted = Doc(nlp.vocab, words=words)
|
||||||
|
example = Example.from_dict(predicted, {"tags": tags})
|
||||||
```
|
```
|
||||||
|
|
||||||
The same goes for named entities. The letters added before the labels refer to
|
Here's another example that shows how to define gold-standard named entities.
|
||||||
the tags of the [BILUO scheme](/usage/linguistic-features#updating-biluo) – `O`
|
The letters added before the labels refer to the tags of the
|
||||||
is a token outside an entity, `U` an single entity unit, `B` the beginning of an
|
[BILUO scheme](/usage/linguistic-features#updating-biluo) – `O` is a token
|
||||||
entity, `I` a token inside an entity and `L` the last token of an entity.
|
outside an entity, `U` a single entity unit, `B` the beginning of an entity,
|
||||||
|
`I` a token inside an entity and `L` the last token of an entity.
|
||||||
|
|
||||||
|
```python
|
||||||
|
doc = Doc(nlp.vocab, words=["Facebook", "released", "React", "in", "2014"])
|
||||||
|
example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]})
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Migrating from v2.x" variant="warning">
|
||||||
|
|
||||||
|
As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class.
|
||||||
|
It can be constructed in a very similar way, from a `Doc` and a dictionary of
|
||||||
|
annotations:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- gold = GoldParse(doc, entities=entities)
|
||||||
|
+ example = Example.from_dict(doc, {"entities": entities})
|
||||||
|
```
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> - **Training data**: The training examples.
|
> - **Training data**: The training examples.
|
||||||
> - **Text and label**: The current example.
|
> - **Text and label**: The current example.
|
||||||
> - **Doc**: A `Doc` object created from the example text.
|
> - **Doc**: A `Doc` object created from the example text.
|
||||||
> - **GoldParse**: A `GoldParse` object of the `Doc` and label.
|
> - **Example**: An `Example` object holding both predictions and gold-standard
|
||||||
|
> annotations.
|
||||||
> - **nlp**: The `nlp` object with the model.
|
> - **nlp**: The `nlp` object with the model.
|
||||||
> - **Optimizer**: A function that holds state between updates.
|
> - **Optimizer**: A function that holds state between updates.
|
||||||
> - **Update**: Update the model's weights.
|
> - **Update**: Update the model's weights.
|
||||||
|
|
||||||
|
<!-- TODO: update graphic & related text -->
|
||||||
|
|
||||||
![The training loop](../images/training-loop.svg)
|
![The training loop](../images/training-loop.svg)
|
||||||
|
|
||||||
Of course, it's not enough to only show a model a single example once.
|
Of course, it's not enough to only show a model a single example once.
|
||||||
|
@ -286,34 +475,47 @@ dropout means that each feature or internal representation has a 1/4 likelihood
|
||||||
of being dropped.
|
of being dropped.
|
||||||
|
|
||||||
> - [`begin_training`](/api/language#begin_training): Start the training and
|
> - [`begin_training`](/api/language#begin_training): Start the training and
|
||||||
> return an optimizer function to update the model's weights. Can take an
|
> return an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object to
|
||||||
> optional function converting the training data to spaCy's training format.
|
> update the model's weights.
|
||||||
> - [`update`](/api/language#update): Update the model with the training example
|
> - [`update`](/api/language#update): Update the model with the training
|
||||||
> and gold data.
|
> examples.
|
||||||
> - [`to_disk`](/api/language#to_disk): Save the updated model to a directory.
|
> - [`to_disk`](/api/language#to_disk): Save the updated model to a directory.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Example training loop
|
### Example training loop
|
||||||
optimizer = nlp.begin_training(get_data)
|
optimizer = nlp.begin_training()
|
||||||
for itn in range(100):
|
for itn in range(100):
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
for raw_text, entity_offsets in train_data:
|
for raw_text, entity_offsets in train_data:
|
||||||
doc = nlp.make_doc(raw_text)
|
doc = nlp.make_doc(raw_text)
|
||||||
gold = GoldParse(doc, entities=entity_offsets)
|
example = Example.from_dict(doc, {"entities": entity_offsets})
|
||||||
nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
|
nlp.update([example], sgd=optimizer)
|
||||||
nlp.to_disk("/model")
|
nlp.to_disk("/model")
|
||||||
```
|
```
|
||||||
|
|
||||||
The [`nlp.update`](/api/language#update) method takes the following arguments:
|
The [`nlp.update`](/api/language#update) method takes the following arguments:
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | [`Doc`](/api/doc) objects. The `update` method takes a sequence of them, so you can batch up your training examples. Alternatively, you can also pass in a sequence of raw texts. |
|
| `examples` | [`Example`](/api/example) objects. The `update` method takes a sequence of them, so you can batch up your training examples. |
|
||||||
| `golds` | [`GoldParse`](/api/goldparse) objects. The `update` method takes a sequence of them, so you can batch up your training examples. Alternatively, you can also pass in a dictionary containing the annotations. |
|
| `drop` | Dropout rate. Makes it harder for the model to just memorize the data. |
|
||||||
| `drop` | Dropout rate. Makes it harder for the model to just memorize the data. |
|
| `sgd` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object, which updates the model's weights. If not set, spaCy will create a new one and save it for further use. |
|
||||||
| `sgd` | An optimizer, i.e. a callable to update the model's weights. If not set, spaCy will create a new one and save it for further use. |
|
|
||||||
|
|
||||||
Instead of writing your own training loop, you can also use the built-in
|
<Infobox title="Migrating from v2.x" variant="warning">
|
||||||
[`train`](/api/cli#train) command, which expects data in spaCy's
|
|
||||||
[JSON format](/api/data-formats#json-input). On each epoch, a model will be
|
As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class
|
||||||
saved out to the directory.
|
and the "simple training style" of calling `nlp.update` with a text and a
|
||||||
|
dictionary of annotations. Updating your code to use the `Example` object should
|
||||||
|
be very straightforward: you can call
|
||||||
|
[`Example.from_dict`](/api/example#from_dict) with a [`Doc`](/api/doc) and the
|
||||||
|
dictionary of annotations:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
text = "Facebook released React in 2014"
|
||||||
|
annotations = {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]}
|
||||||
|
+ example = Example.from_dict(nlp.make_doc(text), annotations)
|
||||||
|
- nlp.update([text], [annotations])
|
||||||
|
+ nlp.update([example])
|
||||||
|
```
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
|
@ -186,7 +186,7 @@ underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and
|
||||||
tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
|
tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
|
||||||
`doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
|
`doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
|
||||||
|
|
||||||
<Infobox title="📖 Custom user hooks">
|
<Infobox title="Custom user hooks" emoji="📖">
|
||||||
|
|
||||||
For more details on **adding hooks** and **overwriting** the built-in `Doc`,
|
For more details on **adding hooks** and **overwriting** the built-in `Doc`,
|
||||||
`Span` and `Token` methods, see the usage guide on
|
`Span` and `Token` methods, see the usage guide on
|
||||||
|
|
|
@ -4,7 +4,7 @@ teaser: Visualize dependencies and entities in your browser or in a notebook
|
||||||
new: 2
|
new: 2
|
||||||
menu:
|
menu:
|
||||||
- ['Dependencies', 'dep']
|
- ['Dependencies', 'dep']
|
||||||
- ['Entities', 'ent']
|
- ['Named Entities', 'ent']
|
||||||
- ['Jupyter Notebooks', 'jupyter']
|
- ['Jupyter Notebooks', 'jupyter']
|
||||||
- ['Rendering HTML', 'html']
|
- ['Rendering HTML', 'html']
|
||||||
- ['Web app usage', 'webapp']
|
- ['Web app usage', 'webapp']
|
||||||
|
@ -356,6 +356,6 @@ Alternatively, if you're using [Streamlit](https://streamlit.io), check out the
|
||||||
helps you integrate spaCy visualizations into your apps. It includes a full
|
helps you integrate spaCy visualizations into your apps. It includes a full
|
||||||
embedded visualizer, as well as individual components.
|
embedded visualizer, as well as individual components.
|
||||||
|
|
||||||
![](../images/spacy-streamlit.png)]
|
![](../images/spacy-streamlit.png)
|
||||||
|
|
||||||
</Grid>
|
</Grid>
|
||||||
|
|
|
@ -79,7 +79,9 @@
|
||||||
"items": [
|
"items": [
|
||||||
{ "text": "Language", "url": "/api/language" },
|
{ "text": "Language", "url": "/api/language" },
|
||||||
{ "text": "Tokenizer", "url": "/api/tokenizer" },
|
{ "text": "Tokenizer", "url": "/api/tokenizer" },
|
||||||
|
{ "text": "Tok2Vec", "url": "/api/tok2vec" },
|
||||||
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
||||||
|
{ "text": "Morphologizer", "url": "/api/morphologizer" },
|
||||||
{ "text": "Tagger", "url": "/api/tagger" },
|
{ "text": "Tagger", "url": "/api/tagger" },
|
||||||
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
||||||
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
||||||
|
|
|
@ -1,29 +1,32 @@
|
||||||
import React from 'react'
|
import React from 'react'
|
||||||
import PropTypes from 'prop-types'
|
import PropTypes from 'prop-types'
|
||||||
|
import classNames from 'classnames'
|
||||||
|
|
||||||
import Link from './link'
|
import Link from './link'
|
||||||
import { H5 } from './typography'
|
import { H5 } from './typography'
|
||||||
import classes from '../styles/card.module.sass'
|
import classes from '../styles/card.module.sass'
|
||||||
|
|
||||||
const Card = ({ title, to, image, header, onClick, children }) => (
|
const Card = ({ title, to, image, header, small, onClick, children }) => (
|
||||||
<div className={classes.root}>
|
<div className={classNames(classes.root, { [classes.small]: !!small })}>
|
||||||
{header && (
|
{header && (
|
||||||
<Link to={to} onClick={onClick} hidden>
|
<Link to={to} onClick={onClick} hidden>
|
||||||
{header}
|
{header}
|
||||||
</Link>
|
</Link>
|
||||||
)}
|
)}
|
||||||
<H5>
|
{(title || image) && (
|
||||||
{image && (
|
<H5 className={classes.title}>
|
||||||
<div className={classes.image}>
|
{image && (
|
||||||
<img src={image} width={35} alt="" />
|
<div className={classes.image}>
|
||||||
</div>
|
<img src={image} width={35} alt="" />
|
||||||
)}
|
</div>
|
||||||
{title && (
|
)}
|
||||||
<Link to={to} onClick={onClick} hidden>
|
{title && (
|
||||||
{title}
|
<Link to={to} onClick={onClick} hidden>
|
||||||
</Link>
|
{title}
|
||||||
)}
|
</Link>
|
||||||
</H5>
|
)}
|
||||||
|
</H5>
|
||||||
|
)}
|
||||||
<Link to={to} onClick={onClick} hidden>
|
<Link to={to} onClick={onClick} hidden>
|
||||||
{children}
|
{children}
|
||||||
</Link>
|
</Link>
|
||||||
|
@ -31,10 +34,10 @@ const Card = ({ title, to, image, header, onClick, children }) => (
|
||||||
)
|
)
|
||||||
|
|
||||||
Card.propTypes = {
|
Card.propTypes = {
|
||||||
title: PropTypes.string,
|
title: PropTypes.node,
|
||||||
|
header: PropTypes.node,
|
||||||
to: PropTypes.string,
|
to: PropTypes.string,
|
||||||
image: PropTypes.string,
|
image: PropTypes.string,
|
||||||
card: PropTypes.node,
|
|
||||||
onClick: PropTypes.func,
|
onClick: PropTypes.func,
|
||||||
children: PropTypes.node,
|
children: PropTypes.node,
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ import classNames from 'classnames'
|
||||||
import Icon from './icon'
|
import Icon from './icon'
|
||||||
import classes from '../styles/infobox.module.sass'
|
import classes from '../styles/infobox.module.sass'
|
||||||
|
|
||||||
const Infobox = ({ title, id, variant, className, children }) => {
|
const Infobox = ({ title, emoji, id, variant, className, children }) => {
|
||||||
const infoboxClassNames = classNames(classes.root, className, {
|
const infoboxClassNames = classNames(classes.root, className, {
|
||||||
[classes.warning]: variant === 'warning',
|
[classes.warning]: variant === 'warning',
|
||||||
[classes.danger]: variant === 'danger',
|
[classes.danger]: variant === 'danger',
|
||||||
|
@ -17,7 +17,14 @@ const Infobox = ({ title, id, variant, className, children }) => {
|
||||||
{variant !== 'default' && (
|
{variant !== 'default' && (
|
||||||
<Icon width={18} name={variant} inline className={classes.icon} />
|
<Icon width={18} name={variant} inline className={classes.icon} />
|
||||||
)}
|
)}
|
||||||
<span className={classes.titleText}>{title}</span>
|
<span className={classes.titleText}>
|
||||||
|
{emoji && (
|
||||||
|
<span className={classes.emoji} aria-hidden="true">
|
||||||
|
{emoji}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
{title}
|
||||||
|
</span>
|
||||||
</h4>
|
</h4>
|
||||||
)}
|
)}
|
||||||
{children}
|
{children}
|
||||||
|
@ -30,7 +37,7 @@ Infobox.defaultProps = {
|
||||||
}
|
}
|
||||||
|
|
||||||
Infobox.propTypes = {
|
Infobox.propTypes = {
|
||||||
title: PropTypes.string,
|
title: PropTypes.node,
|
||||||
id: PropTypes.string,
|
id: PropTypes.string,
|
||||||
variant: PropTypes.oneOf(['default', 'warning', 'danger']),
|
variant: PropTypes.oneOf(['default', 'warning', 'danger']),
|
||||||
className: PropTypes.string,
|
className: PropTypes.string,
|
||||||
|
|
|
@ -26,6 +26,16 @@ function getCellContent(children) {
|
||||||
return children
|
return children
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function isDividerRow(children) {
|
||||||
|
if (children.length && children[0].props && children[0].props.name == 'td') {
|
||||||
|
const tdChildren = children[0].props.children
|
||||||
|
if (tdChildren && !Array.isArray(tdChildren) && tdChildren.props) {
|
||||||
|
return tdChildren.props.name === 'em'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
function isFootRow(children) {
|
function isFootRow(children) {
|
||||||
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/
|
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/
|
||||||
if (children.length && children[0].props.name === 'td') {
|
if (children.length && children[0].props.name === 'td') {
|
||||||
|
@ -53,9 +63,11 @@ export const Th = props => <th className={classes.th} {...props} />
|
||||||
|
|
||||||
export const Tr = ({ evenodd = true, children, ...props }) => {
|
export const Tr = ({ evenodd = true, children, ...props }) => {
|
||||||
const foot = isFootRow(children)
|
const foot = isFootRow(children)
|
||||||
|
const isDivider = isDividerRow(children)
|
||||||
const trClasssNames = classNames({
|
const trClasssNames = classNames({
|
||||||
[classes.tr]: evenodd,
|
[classes.tr]: evenodd,
|
||||||
[classes.footer]: foot,
|
[classes.footer]: foot,
|
||||||
|
[classes.divider]: isDivider,
|
||||||
'table-footer': foot,
|
'table-footer': foot,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
5
website/src/images/logos/dvc.svg
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="36" height="22" viewBox="0 0 36 22">
|
||||||
|
<path d="M0 16.584V.696A.44.44 0 01.132.372a.413.413 0 01.3-.132h5.856c2.32 0 4.304.82 5.952 2.46 1.648 1.64 2.472 3.612 2.472 5.916 0 2.32-.824 4.304-2.472 5.952-1.648 1.648-3.632 2.472-5.952 2.472H.432a.413.413 0 01-.3-.132.44.44 0 01-.132-.324zm3.744-3.096h2.304c1.344 0 2.46-.468 3.348-1.404.888-.936 1.332-2.092 1.332-3.468 0-1.36-.444-2.508-1.332-3.444-.888-.936-2.004-1.404-3.348-1.404H3.744v9.72z" fill="#13ADC7"></path>
|
||||||
|
<path d="M17.121 21.016L9.633 4.864c-.08-.16-.076-.304.012-.432s.22-.192.396-.192h3.192c.208 0 .344.088.408.264l3.96 8.76h.144l3.96-8.76c.064-.176.2-.264.408-.264h3.192c.176 0 .308.064.396.192.088.128.092.272.012.432l-7.536 16.152c-.096.176-.232.264-.408.264h-.24c-.176 0-.312-.088-.408-.264z" fill="#945DD6"></path>
|
||||||
|
<path d="M23.196 14.784c-1.68-1.68-2.52-3.72-2.52-6.12s.844-4.444 2.532-6.132C24.896.844 26.94 0 29.34 0c2.24 0 4.176.744 5.808 2.232.24.224.248.448.024.672L33.3 4.848c-.208.192-.408.192-.6 0-.912-.816-1.992-1.224-3.24-1.224-1.344 0-2.464.484-3.36 1.452-.896.968-1.344 2.132-1.344 3.492 0 1.344.452 2.492 1.356 3.444.904.952 2.028 1.428 3.372 1.428 1.248 0 2.32-.384 3.216-1.152.224-.192.432-.184.624.024l1.872 1.992c.208.208.2.424-.024.648-1.6 1.552-3.544 2.328-5.832 2.328-2.4 0-4.448-.832-6.144-2.496z" fill="#F46737"></path>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 1.3 KiB |
14
website/src/images/logos/fastapi.svg
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="1309.1" height="241.8" viewBox="0 0 346.4 64">
|
||||||
|
<g fill="#009688">
|
||||||
|
<path d="M32 0a32 32 0 100 64 32 32 0 000-64zm-1.7 57.6v-20H19.2l16-31.3v20.1h10.7z"/>
|
||||||
|
<g stroke-width="2">
|
||||||
|
<path d="M89.5 59.4V4.2h33.3v6.6H97.3v16.6H120V34H97.3v25.4z" />
|
||||||
|
<path d="M141.8 54h4.7l3.4-.5V41.2q-.8-.4-2.6-.7-1.8-.3-4.3-.3l-3.6.2q-1.9.3-3.5 1-1.5.8-2.5 2.1t-1 3.5q0 4 2.5 5.6 2.6 1.5 7 1.5zm-.6-37q4.5 0 7.5 1.1 3.1 1.1 5 3.3 1.9 2 2.7 5 .7 2.9.7 6.4v25.9l-2.7.5-3.8.4q-2.1.3-4.7.4-2.5.3-5 .3-3.4 0-6.4-.7-3-.7-5-2.3-2.2-1.6-3.4-4.1-1.2-2.6-1.2-6.1 0-3.5 1.3-6 1.5-2.4 3.8-4 2.4-1.4 5.6-2.2 3.2-.7 6.7-.7l2.3.2q1.2 0 2.3.3l1.9.3 1 .3v-2.1q0-1.8-.3-3.6-.4-1.8-1.4-3.2-1-1.4-3-2.2-1.7-.9-4.5-.9-3.7 0-6.5.6-2.7.4-4 1l-1-6.1q1.5-.7 4.9-1.2 3.3-.7 7.2-.7z" />
|
||||||
|
<path d="M179 54q4.6 0 6.7-1.1 2.3-1.2 2.3-3.9t-2.2-4.3q-2.1-1.6-7-3.5l-4.7-2q-2.2-1-3.8-2.3-1.6-1.4-2.5-3.3-1-2-1-4.7 0-5.5 4.1-8.7 4-3.3 11-3.3 1.8 0 3.6.3 1.7.1 3.3.4l2.6.6 1.8.6L192 25q-1.2-.6-3.8-1.2-2.5-.8-6.1-.8-3.1 0-5.4 1.3-2.4 1.2-2.4 3.8 0 1.4.5 2.4.6 1 1.6 2L179 34l3.9 1.5q2.9 1 5.2 2.2 2.3 1 4 2.5 1.6 1.4 2.5 3.5.8 2 .8 5 0 5.7-4.3 8.6-4.2 3-12 3-5.6 0-8.7-1l-4.2-1.3 1.3-6.4 4.1 1.4q2.8 1 7.4 1z" />
|
||||||
|
<path d="M211.8 18h15.7v6.2h-15.7v19.1q0 3.1.5 5.2.5 2 1.4 3.2 1 1 2.4 1.6 1.5.5 3.4.5 3.3 0 5.3-.8t2.9-1l1.4 6q-1.1.6-3.9 1.4-2.8.9-6.4.9-4.2 0-7-1-2.7-1.2-4.4-3.3-1.6-2.2-2.4-5.3-.6-3.2-.6-7.3v-37l7.4-1.3z" />
|
||||||
|
<path d="M274.3 59.4l-2.5-7-2.5-7.1h-25l-5 14.1h-8l5.9-16.2 5.4-14.2q2.7-6.7 5.4-12.7 2.6-6.2 5.5-12.1h7q3 6 5.5 12 2.7 6.1 5.3 12.8l5.5 14.2 6 16.2zM267.1 39L262 25.5q-2.5-6.5-5.2-12.5-2.8 6-5.3 12.5l-5 13.4z" />
|
||||||
|
<path d="M304.9 3.6q11.6 0 17.8 4.5 6.3 4.4 6.3 13.1 0 4.8-1.7 8.2-1.7 3.4-5 5.5-3.2 2.1-7.8 3-4.6 1-10.4 1h-6.2v20.5h-7.7V5q3.3-.8 7.3-1 4-.4 7.4-.4zm.6 6.8q-5 0-7.6.2v21.7h5.9q4 0 7.1-.5 3.2-.5 5.4-1.7 2.2-1.3 3.4-3.5 1.2-2.1 1.2-5.5 0-3.1-1.3-5.2-1.2-2-3.3-3.3-2-1.3-4.9-1.7-2.8-.5-5.9-.5z" />
|
||||||
|
<path d="M338.6 4.2h7.8v55.2h-7.8z" />
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 2.0 KiB |
3
website/src/images/logos/prodigy.svg
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 540 158" width="540" height="158">
|
||||||
|
<path d="M70.6 48.6c7 7.3 10.5 17 10.5 29.2s-3.3 22-10.4 29.2c-7 7.3-16 11-27 11-9.4 0-16.8-2.6-21.7-8v44.7H0V39h20.7v8c4.8-6.3 12.4-9.5 23-9.5 11 0 20 3.7 27 11zM22 76v3.6c0 12 7.2 19.8 18.2 19.8 11.2 0 18.7-8 18.7-21.6S51.3 56.2 40 56.2C29.2 56.2 22 64 22 76zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21v8.8c4-6.4 11.3-9.6 21.4-9.6v21.2zM209.5 107c-7.6 7.4-17.5 11.2-29.5 11.2s-22-3.8-29.7-11c-7.6-7.6-11.5-17.3-11.5-29.3 0-12.2 4-22 11.5-29.3 7.8-7.3 17.7-11 29.7-11s22 3.7 29.5 11c7.8 7.3 11.7 17 11.7 29.2 0 11.8-4 21.6-11.7 29zM180 56.3c-5.7 0-10.3 2-13.8 5.8s-5.2 9-5.2 15.7c0 6.7 1.8 12 5.2 15.7 3.4 3.8 8 5.7 13.8 5.7s10.3-1.8 13.8-5.6 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8-5.8-13.8-5.8zM313 116.5h-20.5v-8c-4.4 5.6-12.7 9.7-23 9.7-11 0-20-3.8-27-11-7-7.5-10.5-17.2-10.5-29.4s3.5-22 10.3-29.2c7-7.3 16-11 27-11 9.7 0 17 2.6 22 8V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.8 0 18.2-7.3 18.2-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.3 0-18.8 8-18.8 21.6zM354 13.6c0 3.6-1.2 6.8-3.8 9.3-5 4.8-13.6 4.8-18.6 0C323.2 15.3 330-.3 341 .3c7.3 0 13 6 13 13.2zm-2 103h-22V39h22v77.5zM425 47v-8h20.6v80.4c0 11.2-3.6 20-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-37-11.4-40-29.8l21.8-.8c1 7.6 7.6 12 17.4 12 11.2 0 18-5.8 18-16.6v-11c-5 5.4-12.4 8-21.8 8-11 0-20-3.7-27-11s-10.4-17-10.4-29.2 3.5-22 10.3-29.2c7-7.3 16-11 27-11 10.6 0 18.3 3 23 9.5zM387 78c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45L454.5 39h24l18 46.2L514 39h24.3l-49.7 115.8z" fill="#1a1e24"/>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 1.6 KiB |
4
website/src/images/logos/ray.svg
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="945.3" height="350" viewBox="90 90 850 300">
|
||||||
|
<path fill="#231f20" d="M549.5 308.7h18.7L534.6 260c20.7-6.7 32.1-22.7 32.1-46.4 0-30.5-20.6-48-56.4-48h-54.4v143h16.3v-45.1h38c3.3 0 6.6-.2 9.7-.4zm-77.3-61v-66.4h38c26 0 40.8 11 40.8 32.7 0 22.3-14.8 33.7-40.7 33.7zm224.2 24.8l16 36.2H730l-64-143.2h-17L585 308.7h17.4l16-36.2zm-7-15.8h-64.2l32-72.6zm106.6 3l54.4-94.2h-16.8l-45.8 74.7-46.4-74.7h-17l55.4 94.7v48.5H796v-49"/>
|
||||||
|
<path fill="#00adef" d="M204.3 227.2a40.5 40.5 0 0178.7 0H313c1-3.9 2.5-7.6 4.5-11l-53-52.8a40.4 40.4 0 01-49.4-6.2A40.4 40.4 0 01243.7 88a40.5 40.5 0 0134.8 61.4l53 53a40.4 40.4 0 0149.4 6.1 40.5 40.5 0 01-28.7 69.3c-7.6 0-14.7-2.1-20.8-5.8l-53 53a40.4 40.4 0 01-6 49.5 40.5 40.5 0 01-69.3-28.7 40.4 40.4 0 0161.4-34.8l52.9-53c-2-3.3-3.5-7-4.5-10.9H283a40.5 40.5 0 01-78.7 0h-29.9a40.5 40.5 0 110-19.8zM229 360.4a20.7 20.7 0 0035.4-14.7A20.7 20.7 0 00229 331a20.7 20.7 0 000 29.4zm138-108.6a20.7 20.7 0 00-14.8-35.4 20.7 20.7 0 00-14.6 35.4 20.7 20.7 0 0029.3 0zm-108.7-138a20.7 20.7 0 00-35.4 14.7 20.7 20.7 0 0035.4 14.7 20.7 20.7 0 000-29.4zm-138 108.6a20.7 20.7 0 0014.8 35.5 20.7 20.7 0 0014.6-35.5 20.7 20.7 0 00-29.3 0zm108.7 0a20.7 20.7 0 0014.7 35.5 20.7 20.7 0 0014.6-35.5 20.7 20.7 0 00-29.3 0"/>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 1.3 KiB |
14
website/src/images/logos/streamlit.svg
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="800" height="200" viewBox="130 250 550 100">
|
||||||
|
<path fill="#262730" d="M300.4 316.8h8c1 8.4 4.3 15.4 18.2 15.4 8.8 0 15.6-5.3 15.6-13.2 0-8-3.6-11.3-16.6-13.5-14.4-2.3-23-6.9-23-19.3 0-10.8 9.2-19.1 22.1-19.1 13.8 0 22.3 6.9 23.6 19.4H341c-1.7-8.9-6.7-12.5-16.2-12.5-9.4 0-14.3 4.8-14.3 11.4 0 6.8 2.5 10.4 16.5 12.5 15.3 2.5 23.4 7.2 23.4 20.5 0 11.8-10.1 20.7-23.7 20.7-18.4 0-25-10.3-26.2-22.3z"/>
|
||||||
|
<path fill="#262730" d="M362.4 325.4v-33h-7.2v-6.8h7.2v-12.2h8v12.2H382v6.8h-11.7v32.3c0 5 2.2 7.4 6.2 7.4 2.5 0 4.4-.4 6.1-1v6.7c-1.6.6-3.5 1.1-6.8 1.1-9 0-13.5-5.5-13.5-13.4z"/>
|
||||||
|
<path fill="#262730" d="M390.3 285.6h8v9.2c3.1-5.8 7.5-9.8 16.5-10v7.5c-10 .5-16.5 4.8-16.5 17.4v28.5h-8v-52.6z"/>
|
||||||
|
<path fill="#262730" d="M454.5 307.4c-.4-11.2-6.1-16-14.4-16-8.1 0-13.7 6-15 16h29.4zm-37.9 5.5V311c0-16.8 9.7-26.4 23.5-26.4 11.6 0 22.6 7 22.6 25.8v3.4h-37.8c.3 11.6 5.8 18.5 16 18.5 7.9 0 12.4-3 13.4-8.9h8c-1.7 10.2-10 15.6-21.4 15.6-14.2 0-24.3-10.5-24.3-26.2z"/>
|
||||||
|
<path fill="#262730" d="M500.8 320.4v-6.8H494c-10.9 0-17.5 2.7-17.5 10.2 0 5.2 2.5 8.9 9.5 8.9 8.5 0 15-4.4 15-12.3zm-32.3 3.4c0-11.8 11.3-16 25.1-16h7.2V303c0-8-3-11.5-10.7-11.5-7 0-10.8 3-11.6 9.2h-8c1.1-11.6 10.3-15.9 20-15.9 9.6 0 18.3 3 18.3 17.2v36.3h-8v-6.4a18 18 0 01-15.6 7.3c-9.3 0-16.7-4.9-16.7-15.3z"/>
|
||||||
|
<path fill="#262730" d="M519.4 285.6h8v7.8c2.2-4.8 7.7-8.7 15-8.7 6.3 0 12 2.7 14.6 9.6 3.5-6.7 11-9.6 17.2-9.6 9 0 16.9 5.5 16.9 20.3v33.2h-8v-33.8c0-9-3.7-12.7-10.6-12.7a13 13 0 00-13.3 13.7v32.8h-8v-33.8c0-9-3.7-12.7-10.6-12.7a13 13 0 00-13.3 13.7v32.8h-8v-52.6z"/>
|
||||||
|
<path fill="#262730" d="M602.1 338.2h8v-72.9h-8v73z"/>
|
||||||
|
<path fill="#262730" d="M623 338.2h8v-52.6h-8v52.6zm-1.4-67.5c0-3 2.4-5.4 5.3-5.4a5.4 5.4 0 010 10.8 5.4 5.4 0 01-5.3-5.4z"/>
|
||||||
|
<path fill="#262730" d="M646.6 325.4v-33h-7.2v-6.8h7.2v-12.2h8v12.2h11.7v6.8h-11.7v32.3c0 5 2.2 7.4 6.2 7.4 2.5 0 4.5-.4 6.2-1v6.7c-1.7.6-3.6 1.1-6.8 1.1-9.2 0-13.6-5.5-13.6-13.4z"/>
|
||||||
|
<path fill="#FFC7C7" d="M276.4 276v.1l-22.2 57.7c-1.5 2.7-3.7 4.4-7 4.4h-84.7c-2 0-4.2-1-5.3-2l116.4-62.5c1.6-1 3.3.6 2.8 2.3z"/>
|
||||||
|
<path fill="#FF8C8C" d="M254.1 333.8c-1.5 2.8-3.7 4.4-7 4.4h-84.6c-3 0-5.8-1.6-7.1-4.4l47-71.3c1.1-1.6 3.5-1.6 4.7 0l47 71.3z"/>
|
||||||
|
<path fill="#FF4B4B" d="M252.4 336.2c-1 1.1-2.9 2-5.2 2h-84.7c-3 0-6-1.5-7.1-4.4L133 276c-.6-1.6 1-3.2 2.7-2.5h.1l116.5 62.6z"/>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 2.3 KiB |
28
website/src/images/logos/wandb.svg
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="700" height="109" fill="none" viewBox="0 0 700 109">
|
||||||
|
<path fill="#000" d="M181.2 83.4h3.2l12.8-47.1 6.3-1v-3h-17.1v3l6.9 1-9.6 35-11.3-35 7.6-1v-3h-20.5v3l7.5 1L155.5 71l-9.3-34.8 7-.8v-3.1h-20.6v3l6.3.8 13.6 47.3h3.3L168 44l13 39.3z"/>
|
||||||
|
<path fill="#000" d="M219.2 49.4c4.6 0 8.2 3.9 8.2 8.2 0 2.2-.8 4-4.5 4h-13.7c.8-8.8 5.7-12.2 10-12.2zm.4-3.5c-9.4 0-17.4 7.8-17.4 19.3 0 11.6 6.7 18.6 16.9 18.6 6.9 0 11.7-3.4 14.5-8.7l-1.8-1.5C229 76.8 226 79 221.3 79c-7.3 0-12.1-5-12.3-14.2h24.1c.3-1 .6-2.5.6-4.2 0-9.2-6.2-14.6-14-14.6z"/>
|
||||||
|
<path fill="#000" d="M249 37.6c2.7 0 5-2 5-4.8 0-3-2.3-4.8-5-4.8-3 0-5.1 1.8-5.1 4.8 0 2.8 2.1 4.8 5 4.8zm8.5 42.5l-5.3-1-.1-12.1V57l.2-10.3-1.1-.8-11.7 4.7v2.6l6 .7c.2 3 .3 5.4.3 9.5V67l-.1 12-5.7 1.1v2.7h17.5v-2.7z"/>
|
||||||
|
<path fill="#000" d="M278 49c4.8 0 7.5 3.8 7.5 9.1 0 5.6-2.9 9.6-7.7 9.6-4.7 0-7.3-3.8-7.3-9.3 0-5.5 2.8-9.4 7.6-9.4zm-13.7 9.3c0 5.2 2.2 8.7 5.8 10.7-3.7 3.5-5 6-5 8.4 0 3 1.6 4.8 4.9 5.7-5.6 2.8-7.4 5.4-7.4 8.7 0 5.3 4.7 9.1 15.6 9.1 12.4 0 18.7-6.5 18.7-13 0-6.2-3.8-9.8-12.2-9.8h-9.9c-3.5 0-4.3-1.3-4.3-3.4 0-1.7.6-3.1 1.8-4.8 1.7.5 3.6.8 5.6.8 7.9 0 13.7-4.4 13.7-12.4 0-2.7-.6-5-1.7-6.8h7.4v-4.8l-1.1-.8-8.6 3a15 15 0 00-9.6-3c-7.9 0-13.7 4.5-13.7 12.4zm14.2 39.1c-6 0-9.7-2-9.7-6.9 0-2.4.8-4.5 3-6.8l3 .2h8.9c5.4 0 7.5 2.6 7.5 5.9 0 4.2-4.4 7.6-12.7 7.6z"/>
|
||||||
|
<path fill="#000" d="M306.3 67l-.1 12-5.7 1.1v2.7h17.6v-2.7l-5.3-1-.1-12.1V56.3c3.8-3.5 7-5 10.7-5 4.4 0 6.1 2.3 6.1 9.2V67l-.1 12.1-5.5 1v2.7h17.5v-2.7l-5.3-1L336 67v-7c0-10-3.2-14-9.7-14a20 20 0 00-13.8 7V38.3l.3-10.7-1-.6-11.7 3.4V33l6.2.8V67z"/>
|
||||||
|
<path fill="#000" d="M356.7 74.4V50.9h10.2v-4h-10l.3-10.2h-4.5l-2 10-6.7 1v3.2h6.2V75.2c0 5.9 3.3 8.6 8.6 8.6 3.8 0 7-1.4 8.9-4l-1.8-2c-2 1.4-3.4 2-5 2-2.7 0-4.2-1.6-4.2-5.4z"/>
|
||||||
|
<path fill="#000" d="M383 83.8c9.7 0 14.7-5 14.7-10.8 0-4.8-2.9-8.3-10.4-11.1l-2.4-.8c-5-2-7-3.2-7-6.5 0-3.1 2.5-5.2 7.1-5.2 1.8 0 3.6.5 5.2 1.3l1.2 6h4l.3-7.7c-3.3-2-6.5-3-10.6-3-8.5 0-13.2 5-13.2 10.6 0 5.1 3.7 8.2 9.2 10.2l3.5 1.3c4.7 1.7 7 3.3 7 6.6 0 3.4-2.7 5.7-8.5 5.7-2.3 0-4.3-.4-6-1.2l-1.5-6.6h-4.2l.3 8.3c3.7 2 7 3 11.3 3z"/>
|
||||||
|
<path fill="#000" d="M435 49.4c-2.7-3.5-3.3-5.9-3.3-8.6 0-4 3-7 6.8-7 3.7 0 6.6 2.7 6.6 7 0 4.5-3.3 7.8-8 11-.7-.7-1.4-1.5-2-2.4zM469 83l.2-3.4-7.3-1-6.2-6.5a55.5 55.5 0 007.9-17.7l5.7-1v-3H454v3l6 1c-1.4 5.9-3.5 10.8-6.3 15.5-3-3.2-5.7-6-8.7-9.4a335 335 0 01-6-6.7c7.6-4.2 11.1-7.9 11.1-13 0-5.8-4-9.8-11.5-9.8-7 0-12.4 4.1-12.4 10.8 0 4 1.5 7.8 5.2 12.1l.3.4c-7.5 3.8-11.4 8.9-11.4 15.7 0 7.6 6.5 13.7 16 13.7 7.2 0 12-2.8 15.4-6.4l2.6 3c2.5 2.5 4.9 3.4 8.9 3.4 2 0 3.5-.2 5.7-.7zm-27.2-16.8l7.7 8.7c-2.9 2.5-7 4-11.2 4-7.3 0-11.4-5.5-11.4-11a13 13 0 016.6-11.5c2.5 2.9 4.9 5.6 8.3 9.8z"/>
|
||||||
|
<path fill="#000" d="M500.8 59.4c0 6.6 0 13.2-.2 19.5l-7 .8v3.1h20.8c14.6 0 20.2-7 20.2-13.8 0-6.1-4.2-11-14.3-12.5 8.4-1.7 11.8-6.5 11.8-12 0-7-6-12.2-16.5-12.2h-22v3l7 .9c.2 6.5.2 13 .2 19.5v3.7zm11.3-1c10.4 0 15 3.6 15 10.6 0 6.7-4.7 10.3-14.4 10.3h-4.6c-.2-6.4-.2-13-.2-20.9h4.2zm1.2-22.7c7.7 0 11.6 2.5 11.6 9.4S521 55 512 55h-4c0-6 0-12.7.2-19.2h5.2z"/>
|
||||||
|
<path fill="#000" d="M549.4 37.6c2.8 0 5-2 5-4.8 0-3-2.2-4.8-5-4.8s-5 1.8-5 4.8c0 2.8 2.2 4.8 5 4.8zm8.6 42.5l-5.2-1-.2-12.1V57l.2-10.3-1.1-.8-11.6 4.7v2.6l6 .7c.2 3 .2 5.4.2 9.5V67l-.1 12-5.7 1.1v2.7H558v-2.7z"/>
|
||||||
|
<path fill="#000" d="M591.8 83.6c2.7 0 4.8-1 6.6-4l-1.5-1.6c-.8 1-1.7 1.7-3 1.7-1.6 0-2.7-1.2-2.7-4.5V59.1c0-9.5-3.8-13.2-11.7-13.2-7.7 0-13.3 3.7-14.7 9.4.2 2 1.4 3.1 3.5 3.1 2 0 3.3-1.3 4-3.6l1.4-5c1.6-.3 3-.4 4-.4 5 0 7.2 1.8 7.2 9.2v1.9c-3 .6-6.2 1.6-8.6 2.5-10.2 3.7-12.5 7-12.5 11.4 0 6.2 4.6 9.4 10.1 9.4 4.5 0 7-2 11.1-6a6.6 6.6 0 006.8 5.8zm-21.3-10.4c0-2.2.8-5.1 7.4-7.7 1.6-.6 4.3-1.5 7-2.2v11.6c-4.3 3-5.8 4-8.3 4-3.5 0-6-1.7-6-5.7z"/>
|
||||||
|
<path fill="#000" d="M614 83.8c9.7 0 14.7-5 14.7-10.8 0-4.8-2.9-8.3-10.4-11.1l-2.4-.8c-5.1-2-7-3.2-7-6.5 0-3.1 2.5-5.2 7-5.2 2 0 3.6.5 5.3 1.3l1.2 6h4l.3-7.7c-3.3-2-6.5-3-10.6-3-8.5 0-13.2 5-13.2 10.6 0 5.1 3.6 8.2 9.2 10.2l3.4 1.3c4.8 1.7 7 3.3 7 6.6 0 3.4-2.7 5.7-8.4 5.7-2.3 0-4.3-.4-6.1-1.2l-1.5-6.6h-4.1l.3 8.3c3.7 2 7 3 11.3 3z"/>
|
||||||
|
<path fill="#000" d="M652 49.4c4.6 0 8.2 3.9 8.2 8.2 0 2.2-.8 4-4.5 4H642c.8-8.8 5.7-12.2 10-12.2zm.4-3.5c-9.5 0-17.4 7.8-17.4 19.3 0 11.6 6.7 18.6 16.9 18.6 6.8 0 11.7-3.4 14.5-8.7l-1.8-1.5c-2.8 3.2-5.7 5.3-10.5 5.3-7.3 0-12.1-5-12.3-14.2H666c.3-1 .6-2.5.6-4.2 0-9.2-6.2-14.6-14.1-14.6z"/>
|
||||||
|
<path fill="#000" d="M685.3 83.8c9.7 0 14.7-5 14.7-10.8 0-4.8-2.9-8.3-10.4-11.1l-2.4-.8c-5.1-2-7-3.2-7-6.5 0-3.1 2.5-5.2 7.1-5.2 1.8 0 3.6.5 5.2 1.3l1.2 6h4l.3-7.7c-3.3-2-6.5-3-10.6-3-8.5 0-13.2 5-13.2 10.6 0 5.1 3.7 8.2 9.2 10.2l3.4 1.3c4.8 1.7 7 3.3 7 6.6 0 3.4-2.6 5.7-8.4 5.7-2.3 0-4.3-.4-6.1-1.2l-1.4-6.6h-4.2l.3 8.3c3.7 2 7 3 11.3 3z"/>
|
||||||
|
<path fill="#FC3" d="M0 35.1a11.2 11.2 0 1022.5 0 11.2 11.2 0 00-22.5 0z"/>
|
||||||
|
<path fill="#FC3" d="M0 91.3a11.2 11.2 0 1022.5 0 11.2 11.2 0 00-22.5 0z"/>
|
||||||
|
<path fill="#FC3" d="M4.2 63.2a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M4.2 7a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M43.1 73.6a11.2 11.2 0 1022.5 0 11.2 11.2 0 00-22.5 0z"/>
|
||||||
|
<path fill="#FC3" d="M47.3 101.7a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M47.3 45.5a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M47.3 17.4a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M86.2 35.1a11.2 11.2 0 1022.5 0 11.2 11.2 0 00-22.5 0z"/>
|
||||||
|
<path fill="#FC3" d="M90.5 7a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M90.5 63.2a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
<path fill="#FC3" d="M90.5 91.3a7 7 0 1014 0 7 7 0 00-14 0z"/>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 5.4 KiB |
|
@ -5,6 +5,15 @@
|
||||||
font: var(--font-size-md)/var(--line-height-md) var(--font-primary)
|
font: var(--font-size-md)/var(--line-height-md) var(--font-primary)
|
||||||
margin-bottom: var(--spacing-sm)
|
margin-bottom: var(--spacing-sm)
|
||||||
|
|
||||||
|
.small
|
||||||
|
padding: 1.5rem
|
||||||
|
font-size: var(--font-size-sm)
|
||||||
|
line-height: var(--line-height-sm)
|
||||||
|
color: var(--color-dark)
|
||||||
|
|
||||||
|
.title
|
||||||
|
margin-bottom: var(--spacing-xs)
|
||||||
|
|
||||||
.image
|
.image
|
||||||
$image-size: 35px
|
$image-size: 35px
|
||||||
width: $image-size
|
width: $image-size
|
||||||
|
|
|
@ -31,6 +31,9 @@
|
||||||
position: relative
|
position: relative
|
||||||
bottom: -2px
|
bottom: -2px
|
||||||
|
|
||||||
|
.emoji
|
||||||
|
margin-right: 0.65em
|
||||||
|
|
||||||
.warning
|
.warning
|
||||||
--color-theme: var(--color-yellow-dark)
|
--color-theme: var(--color-yellow-dark)
|
||||||
--color-theme-dark: var(--color-yellow-dark)
|
--color-theme-dark: var(--color-yellow-dark)
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
--line-height-sm: 1.375
|
--line-height-sm: 1.375
|
||||||
--line-height-md: 1.5
|
--line-height-md: 1.5
|
||||||
--line-height-lg: 1.9
|
--line-height-lg: 1.9
|
||||||
--line-height-code: 1.8
|
--line-height-code: 1.7
|
||||||
|
|
||||||
// Spacing
|
// Spacing
|
||||||
--spacing-xs: 1rem
|
--spacing-xs: 1rem
|
||||||
|
@ -271,7 +271,7 @@ body
|
||||||
color: var(--color-front)
|
color: var(--color-front)
|
||||||
|
|
||||||
p
|
p
|
||||||
margin-bottom: var(--spacing-md)
|
margin-bottom: var(--spacing-sm)
|
||||||
font-family: var(--font-primary)
|
font-family: var(--font-primary)
|
||||||
font-size: var(--font-size-md)
|
font-size: var(--font-size-md)
|
||||||
line-height: var(--line-height-md)
|
line-height: var(--line-height-md)
|
||||||
|
|
|
@ -49,6 +49,36 @@
|
||||||
border-bottom: 2px solid var(--color-theme)
|
border-bottom: 2px solid var(--color-theme)
|
||||||
vertical-align: bottom
|
vertical-align: bottom
|
||||||
|
|
||||||
|
.divider
|
||||||
|
height: 0
|
||||||
|
border-bottom: 1px solid var(--color-subtle)
|
||||||
|
|
||||||
|
td
|
||||||
|
top: -1px
|
||||||
|
height: 0
|
||||||
|
position: relative
|
||||||
|
padding: 0 !important
|
||||||
|
|
||||||
|
& + tr td
|
||||||
|
padding-top: 12px
|
||||||
|
|
||||||
|
td em
|
||||||
|
position: absolute
|
||||||
|
top: -5px
|
||||||
|
left: 10px
|
||||||
|
display: inline-block
|
||||||
|
background: var(--color-theme)
|
||||||
|
color: var(--color-back)
|
||||||
|
padding: 0 5px 1px
|
||||||
|
font-size: 0.85rem
|
||||||
|
text-transform: uppercase
|
||||||
|
font-weight: bold
|
||||||
|
border: 0
|
||||||
|
border-radius: 1em
|
||||||
|
font-style: normal
|
||||||
|
white-space: nowrap
|
||||||
|
z-index: 5
|
||||||
|
|
||||||
// Responsive table
|
// Responsive table
|
||||||
// Shadows adapted from "CSS only Responsive Tables" by David Bushell
|
// Shadows adapted from "CSS only Responsive Tables" by David Bushell
|
||||||
// http://codepen.io/dbushell/pen/wGaamR
|
// http://codepen.io/dbushell/pen/wGaamR
|
||||||
|
|
|
@ -33,6 +33,7 @@ import { YouTube, SoundCloud, Iframe, Image } from '../components/embed'
|
||||||
import Alert from '../components/alert'
|
import Alert from '../components/alert'
|
||||||
import Search from '../components/search'
|
import Search from '../components/search'
|
||||||
import Project from '../widgets/project'
|
import Project from '../widgets/project'
|
||||||
|
import { Integration, IntegrationLogo } from '../widgets/integration'
|
||||||
|
|
||||||
const mdxComponents = {
|
const mdxComponents = {
|
||||||
a: Link,
|
a: Link,
|
||||||
|
@ -75,6 +76,8 @@ const scopeComponents = {
|
||||||
Grid,
|
Grid,
|
||||||
InlineCode,
|
InlineCode,
|
||||||
Project,
|
Project,
|
||||||
|
Integration,
|
||||||
|
IntegrationLogo,
|
||||||
}
|
}
|
||||||
|
|
||||||
const AlertSpace = ({ nightly }) => {
|
const AlertSpace = ({ nightly }) => {
|
||||||
|
|
46
website/src/widgets/integration.js
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
import React from 'react'
|
||||||
|
|
||||||
|
import Card from '../components/card'
|
||||||
|
|
||||||
|
import { ReactComponent as DVCLogo } from '../images/logos/dvc.svg'
|
||||||
|
import { ReactComponent as ProdigyLogo } from '../images/logos/prodigy.svg'
|
||||||
|
import { ReactComponent as StreamlitLogo } from '../images/logos/streamlit.svg'
|
||||||
|
import { ReactComponent as FastAPILogo } from '../images/logos/fastapi.svg'
|
||||||
|
import { ReactComponent as WandBLogo } from '../images/logos/wandb.svg'
|
||||||
|
import { ReactComponent as RayLogo } from '../images/logos/ray.svg'
|
||||||
|
|
||||||
|
// Lookup table from the short logo identifier accepted by the `name` prop of
// IntegrationLogo (and the `logo` prop of Integration) to the SVG component
// imported for it above.
const LOGOS = {
    dvc: DVCLogo,
    prodigy: ProdigyLogo,
    streamlit: StreamlitLogo,
    fastapi: FastAPILogo,
    wandb: WandBLogo,
    ray: RayLogo,
}
|
||||||
|
|
||||||
|
export const IntegrationLogo = ({ name, title, width, height, maxWidth, align, ...props }) => {
|
||||||
|
const Logo = LOGOS[name]
|
||||||
|
if (!Logo) throw new Error(`Unknown logo: ${name}`)
|
||||||
|
const style = { maxWidth, float: align || 'none' }
|
||||||
|
return (
|
||||||
|
<Logo
|
||||||
|
aria-label={title}
|
||||||
|
aria-hidden={title ? undefined : 'true'}
|
||||||
|
width={width}
|
||||||
|
height={height}
|
||||||
|
style={style}
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
export const Integration = ({ height = 30, url, logo, title, children }) => {
|
||||||
|
const header = logo && (
|
||||||
|
<IntegrationLogo name={logo} title={title} height={height} width="auto" maxWidth="80%" />
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
<Card title={header} to={url} small>
|
||||||
|
{children}
|
||||||
|
</Card>
|
||||||
|
)
|
||||||
|
}
|
|
@ -15,14 +15,14 @@ const Project = ({ id, repo, children }) => {
|
||||||
const url = `${repo || DEFAULT_REPO}/${id}`
|
const url = `${repo || DEFAULT_REPO}/${id}`
|
||||||
const title = (
|
const title = (
|
||||||
<>
|
<>
|
||||||
🪐 Get started with a project template:{' '}
|
Get started with a project template:{' '}
|
||||||
<Link to={url}>
|
<Link to={url}>
|
||||||
<InlineCode>{id}</InlineCode>
|
<InlineCode>{id}</InlineCode>
|
||||||
</Link>
|
</Link>
|
||||||
</>
|
</>
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
<Infobox title={title}>
|
<Infobox title={title} emoji="🪐">
|
||||||
{children}
|
{children}
|
||||||
<CopyInput text={text} prefix="$" />
|
<CopyInput text={text} prefix="$" />
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|