Merge remote-tracking branch 'upstream/develop' into feature/more-v3-docs

svlandeg 2020-08-18 11:57:52 +02:00
commit f7b76d2d83
22 changed files with 1005 additions and 538 deletions

View File

@ -36,11 +36,11 @@ redirects = [
{from = "/docs/api/features", to = "/models/#architecture", force = true},
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
{from = "/docs/usage/showcase", to = "/universe", force = true},
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
{from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true},
{from = "/tutorials", to = "/usage/examples", force = true},
# Old documentation pages (v2.x)
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
{from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true},
{from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true},
{from = "/api/goldparse", to = "/api/top-level", force = true},
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
{from = "/api/annotation", to = "/api/data-formats", force = true},

View File

@ -3,7 +3,7 @@ from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg, diff_strings
from wasabi import Printer, MESSAGES, msg
import typer
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
@ -32,8 +32,6 @@ def debug_config_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
@ -49,18 +47,8 @@ def debug_config_cli(
import_code(code_path)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
if auto_fill:
orig_config = config.to_str()
filled_config = nlp.config.to_str()
if orig_config == filled_config:
msg.good("Original config is valid, no values were auto-filled")
else:
msg.good("Auto-filled config is valid")
if diff:
print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
msg.good("Original config is valid")
nlp, _ = util.load_model_from_config(config)
msg.good("Original config is valid")
@debug_cli.command(

View File

@ -249,7 +249,16 @@ def load_model_from_package(
disable: Iterable[str] = tuple(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package."""
"""Load a model from an installed package.
name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, config=config)
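As a quick sketch of how this helper is typically called (the package name is only a placeholder for an installed model package):

```python
from spacy import util

# Load a pipeline from an installed package and disable one component.
# "en_core_web_sm" is a placeholder; any installed model package works.
nlp = util.load_model_from_package("en_core_web_sm", disable=["parser"])
doc = nlp("This is a sentence.")
```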
@ -263,7 +272,17 @@ def load_model_from_path(
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path."""
pipeline from config.cfg and then calls from_disk() with path.
model_path (str / Path): Path to the model data directory.
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
if not model_path.exists():
raise IOError(Errors.E052.format(path=model_path))
if not meta:
@ -284,6 +303,15 @@ def load_model_from_config(
) -> Tuple["Language", Config]:
"""Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object.
config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
RETURNS (Tuple[Language, Config]): The loaded nlp object and the resolved config.
"""
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
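A minimal calling sketch, matching the tuple return type in the signature (the config path is a placeholder):

```python
from spacy import util

# Build an nlp object from a loaded config. At this point the helper also
# returns the resolved config as the second tuple element.
config = util.load_config("./config.cfg")  # placeholder path
nlp, resolved_config = util.load_model_from_config(config, auto_fill=True)
```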
@ -308,6 +336,13 @@ def load_model_from_init_py(
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
__init__.py.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
model_path = Path(init_file).parent
meta = get_model_meta(model_path)
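For illustration, a model package's `__init__.py` typically wraps this helper roughly as follows (a sketch of the assumed package layout):

```python
# Sketch of a model package's __init__.py
from spacy.util import load_model_from_init_py

def load(**overrides):
    # __file__ points at the package's __init__.py; meta.json and the data
    # directory are resolved relative to it.
    return load_model_from_init_py(__file__, **overrides)
```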
@ -325,7 +360,14 @@ def load_config(
overrides: Dict[str, Any] = SimpleFrozenDict(),
interpolate: bool = False,
) -> Config:
"""Load a config file. Takes care of path validation and section order."""
"""Load a config file. Takes care of path validation and section order.
path (Union[str, Path]): Path to the config file.
overrides (Dict[str, Any]): Config overrides as nested dict or
dict keyed by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
config_path = ensure_path(path)
if not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
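For example (a sketch; the path is a placeholder and `training.dropout` is just one possible override key):

```python
from spacy import util

# Load a config, override a value in dot notation and resolve variables.
config = util.load_config(
    "./config.cfg",                       # placeholder path
    overrides={"training.dropout": 0.2},  # example override key
    interpolate=True,
)
```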
@ -337,7 +379,12 @@ def load_config(
def load_config_from_str(
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
):
"""Load a full config from a string."""
"""Load a full config from a string. Wrapper around Thinc's Config.from_str.
text (str): The string config to load.
overrides (Dict[str, Any]): Config overrides as nested dict or dict
keyed by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
text, overrides=overrides, interpolate=interpolate,
)
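A small sketch of parsing a config from an inline string (the snippet below is a minimal, assumed example):

```python
from spacy import util

CONFIG_STR = """
[nlp]
lang = "en"
pipeline = []
"""

# Parse the config directly from the string; no file path involved.
config = util.load_config_from_str(CONFIG_STR)
```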
@ -435,19 +482,18 @@ def get_base_version(version: str) -> str:
return Version(version).base_version
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Get model meta.json from a directory path and validate its contents.
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents.
path (str / Path): Path to model directory.
RETURNS (Dict[str, Any]): The model's meta data.
path (Union[str, Path]): Path to meta.json.
RETURNS (Dict[str, Any]): The loaded meta.
"""
model_path = ensure_path(path)
if not model_path.exists():
raise IOError(Errors.E052.format(path=model_path))
meta_path = model_path / "meta.json"
if not meta_path.is_file():
raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
meta = srsly.read_json(meta_path)
path = ensure_path(path)
if not path.parent.exists():
raise IOError(Errors.E052.format(path=path.parent))
if not path.exists() or not path.is_file():
raise IOError(Errors.E053.format(path=path, name="meta.json"))
meta = srsly.read_json(path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
@ -471,6 +517,16 @@ def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
return meta
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Get model meta.json from a directory path and validate its contents.
path (str / Path): Path to model directory.
RETURNS (Dict[str, Any]): The model's meta data.
"""
model_path = ensure_path(path)
return load_meta(model_path / "meta.json")
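Both helpers can be sketched side by side (the paths are placeholders):

```python
from spacy import util

# load_meta expects the path to meta.json itself ...
meta = util.load_meta("/path/to/model/meta.json")
# ... while get_model_meta expects the model directory and delegates to it.
meta = util.get_model_meta("/path/to/model")
```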
def is_package(name: str) -> bool:
"""Check if string maps to a package installed via pip.

View File

@ -243,11 +243,15 @@ Encode context using bidirectional LSTM layers. Requires
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
### spacy.StaticVectors.v1 {#StaticVectors}
<!-- TODO: -->
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the
[usage documentation](/usage/transformers) for how to integrate the
[usage documentation](/usage/embeddings-transformers) for how to integrate the
architectures into your training config.
### spacy-transformers.TransformerModel.v1 {#TransformerModel}

View File

@ -3,17 +3,17 @@ title: Command Line Interface
teaser: Download, train and package models, and debug spaCy
source: spacy/cli
menu:
- ['Download', 'download']
- ['Info', 'info']
- ['Validate', 'validate']
- ['Init', 'init']
- ['Convert', 'convert']
- ['Debug', 'debug']
- ['Train', 'train']
- ['Pretrain', 'pretrain']
- ['Evaluate', 'evaluate']
- ['Package', 'package']
- ['Project', 'project']
- ['download', 'download']
- ['info', 'info']
- ['validate', 'validate']
- ['init', 'init']
- ['convert', 'convert']
- ['debug', 'debug']
- ['train', 'train']
- ['pretrain', 'pretrain']
- ['evaluate', 'evaluate']
- ['package', 'package']
- ['project', 'project']
---
spaCy's CLI provides a range of helpful commands for downloading and training
@ -22,7 +22,7 @@ list of available commands, you can type `python -m spacy --help`. You can also
add the `--help` flag to any command or subcommand to see the description,
available arguments and usage.
## Download {#download}
## download {#download tag="command"}
Download [models](/usage/models) for spaCy. The downloader finds the
best-matching compatible version and uses `pip install` to download the model as
@ -43,15 +43,15 @@ the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
$ python -m spacy download [model] [--direct] [pip args]
```
| Argument | Type | Description |
| ------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). |
| `--direct`, `-d` | flag | Force direct download of exact model version. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| pip args <Tag variant="new">2.1</Tag> | option / flag | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
| **CREATES** | directory | The installed model package in your `site-packages` directory. |
| Name | Description |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ |
| **CREATES** | The installed model package in your `site-packages` directory. |
## Info {#info}
## info {#info tag="command"}
Print information about your spaCy installation, models and local setup, and
generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to
@ -65,15 +65,15 @@ $ python -m spacy info [--markdown] [--silent]
$ python -m spacy info [model] [--markdown] [--silent]
```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ---------------------------------------------- |
| `model` | positional | A model, i.e. package name or path (optional). |
| `--markdown`, `-md` | flag | Print information as Markdown. |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | flag | Don't print anything, just return the values. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **PRINTS** | `stdout` | Information about your spaCy installation. |
| Name | Description |
| ------------------------------------------------ | ------------------------------------------------------------------------------ |
| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Information about your spaCy installation. |
## Validate {#validate new="2"}
## validate {#validate new="2" tag="command"}
Find all models installed in the current environment and check whether they are
compatible with the currently installed version of spaCy. Should be run after
@ -92,16 +92,16 @@ and command for updating are shown.
$ python -m spacy validate
```
| Argument | Type | Description |
| ---------- | -------- | --------------------------------------------------------- |
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
| Name | Description |
| ---------- | --------------------------------------------------------- |
| **PRINTS** | Details about the compatibility of your installed models. |
## Init {#init new="3"}
## init {#init new="3"}
The `spacy init` CLI includes helpful commands for initializing training config
files and model directories.
### init config {#init-config new="3"}
### init config {#init-config new="3" tag="command"}
Initialize and save a [`config.cfg` file](/usage/training#config) using the
**recommended settings** for your use case. It works just like the
@ -121,15 +121,15 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline]
[--optimize] [--cpu]
```
| Argument | Type | Description |
| ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
| `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. |
| `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. |
| `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. |
| `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | file | The config file for training. |
| Name | Description |
| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The config file for training. |
### init fill-config {#init-fill-config new="3"}
@ -152,24 +152,22 @@ validation error with more details.
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
```
| Argument | Type | Description |
| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). |
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | file | Complete and auto-filled config file for training. |
| Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |
### init model {#init-model new="2"}
<!-- TODO: update for v3 -->
### init model {#init-model new="2" tag="command"}
Create a new model directory from raw data, like word frequencies, Brown
clusters and word vectors. This command is similar to the `spacy model` command
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
Just loading in vectors will not automatically populate the vocab.
clusters and word vectors. Note that in order to populate the model's vocab, you
need to pass in a JSONL-formatted
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
`id` values that correspond to the vectors table. Just loading in vectors will
not automatically populate the vocab.
<Infobox title="New in v3.0" variant="warning">
@ -182,19 +180,19 @@ $ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
[--prune-vectors]
```
| Argument | Type | Description |
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
| Name | Description |
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy model containing the vocab and vectors. |
## Convert {#convert}
## convert {#convert tag="command"}
Convert files into spaCy's
[binary training data format](/api/data-formats#binary-training), a serialized
@ -208,22 +206,22 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
[--merge-subtokens] [--ner-map] [--lang]
```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ |
| `input_file` | positional | Input file. |
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`) |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`) |
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--ner-map`, `-nm` | option | NER tag mapping (as JSON-encoded dict of entity types). |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
| Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `input_file` | Input file. ~~Path (positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ |
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str] \(option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path] \(option)~~ |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
### Converters
### Converters {#converters}
| ID | Description |
| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -233,12 +231,12 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
## Debug {#debug new="3"}
## debug {#debug new="3"}
The `spacy debug` CLI includes helpful commands for debugging and profiling your
configs, data and implementations.
### debug config {#debug-config}
### debug config {#debug-config new="3" tag="command"}
Debug a [`config.cfg` file](/usage/training#config) and show validation errors.
The command will create all objects in the tree and validate them. Note that
@ -246,10 +244,10 @@ some config validation errors are blocking and will prevent the rest of the
config from being resolved. This means that you may not see all validation
errors at once and some issues are only shown once previous errors have been
fixed. To auto-fill a partial config and save the result, you can use the
[`init config`](/api/cli#init-config) command.
[`init fill-config`](/api/cli#init-fill-config) command.
```bash
$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides]
$ python -m spacy debug config [config_path] [--code_path] [overrides]
```
> #### Example
@ -277,18 +275,15 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start
</Accordion>
| Argument | Type | Default | Description |
| --------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code_path`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--auto_fill`, `-F` | option | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
| `--output_path`, `-o` | option | Output path where the filled config can be stored. Use '-' for standard output. |
| `--diff`, `-D` | option | `Show a visual diff if config was auto-filled. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
| **PRINTS** | stdout | Config validation errors, if available. |
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Config validation errors, if available. |
### debug data {#debug-data}
### debug data {#debug-data tag="command"}
Analyze, debug, and validate your training and development data. Get useful
stats, and find problems like invalid entity annotations, cyclic dependencies,
@ -453,18 +448,18 @@ will not be available.
</Accordion>
| Argument | Type | Description |
| -------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
| **PRINTS** | stdout | Debugging information. |
| Name | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
### debug profile {#debug-profile}
### debug profile {#debug-profile tag="command"}
Profile which functions take the most time in a spaCy pipeline. Input should be
formatted as one JSON object per line with a key `"text"`. It can either be
@ -482,15 +477,15 @@ The `profile` command is now available as a subcommand of `spacy debug`.
$ python -m spacy debug profile [model] [inputs] [--n-texts]
```
| Argument | Type | Description |
| ----------------- | ---------- | ----------------------------------------------------------------- |
| `model` | positional | A loadable spaCy model. |
| `inputs` | positional | Optional path to input file, or `-` for standard input. |
| `--n-texts`, `-n` | option | Maximum number of texts to use if available. Defaults to `10000`. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **PRINTS** | stdout | Profiling information for the model. |
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------- |
| `model` | A loadable spaCy model. ~~str (positional)~~ |
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Profiling information for the model. |
### debug model {#debug-model}
### debug model {#debug-model new="3" tag="command"}
Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
sample text and checking how it updates its internal weights and parameters.
@ -596,23 +591,24 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
</Accordion>
| Argument | Type | Description |
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `component` | positional | Name of the pipeline component of which the model should be analyzed. |
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. |
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. |
| `--parameters`, `-PAR` | option | Show parameters of each layer. |
| `--gradients`, `-GRAD` | option | Show gradients of each layer. |
| `--attributes`, `-ATTR` | option | Show attributes of each layer. |
| `--print-step0`, `-P0` | option | Print model before training. |
| `--print-step1`, `-P1` | option | Print model after initialization. |
| `--print-step2`, `-P2` | option | Print model after training. |
| `--print-step3`, `-P3` | option | Print final predictions. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **PRINTS** | stdout | Debugging information. |
| Name | Description |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Debugging information. |
## Train {#train}
## train {#train tag="command"}
Train a model. Expects data in spaCy's
[binary format](/api/data-formats#training) and a
@ -640,17 +636,17 @@ in the section `[paths]`.
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
```
| Argument | Type | Description |
| ----------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
| **CREATES** | model | The final model and the best model. |
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final model and the best model. |
## Pretrain {#pretrain new="2.1" tag="experimental"}
## pretrain {#pretrain new="2.1" tag="command,experimental"}
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
@ -678,19 +674,19 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
[--code] [--resume-path] [--epoch-resume] [overrides]
```
| Argument | Type | Description |
| ----------------------- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
| Name | Description |
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
## Evaluate {#evaluate new="2"}
## evaluate {#evaluate new="2" tag="command"}
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
[binary `.spacy` format](/api/data-formats#binary-training). The
@ -707,19 +703,19 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]
[--gpu-id] [--displacy-path] [--displacy-limit]
```
| Argument | Type | Description |
| ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
| Name | Description |
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ |
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
## Package {#package}
## package {#package tag="command"}
Generate an installable
[model Python package](/usage/training#models-generating) from an existing model
@ -750,25 +746,25 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
> pip install dist/en_model-0.0.0.tar.gz
> ```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `input_dir` | positional | Path to directory containing model data. |
| `output_dir` | positional | Directory to create package folder in. |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
| `--no-sdist`, `-NS`, | flag | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. |
| `--version`, `-v` <Tag variant="new">3</Tag> | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. |
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | A Python package containing the spaCy model. |
| Name | Description |
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ |
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to `meta.json` file (optional). ~~Optional[Path] \(option)~~ |
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
| `--no-sdist`, `-NS` | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A Python package containing the spaCy model. |
## Project {#project new="3"}
## project {#project new="3"}
The `spacy project` CLI includes subcommands for working with
[spaCy projects](/usage/projects), end-to-end workflows for building and
deploying custom spaCy models.
### project clone {#project-clone}
### project clone {#project-clone tag="command"}
Clone a project template from a Git repository. Calls into `git` under the hood
and uses the sparse checkout feature, so you're only downloading what you need.
@ -795,15 +791,15 @@ $ python -m spacy project clone [name] [dest] [--repo]
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
> ```
| Argument | Type | Description |
| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The cloned [project directory](/usage/projects#project-files). |
### project assets {#project-assets}
### project assets {#project-assets tag="command"}
Fetch project assets like datasets and pretrained weights. Assets are defined in
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
@ -824,13 +820,13 @@ $ python -m spacy project assets [project_dir]
> $ python -m spacy project assets
> ```
| Argument | Type | Description |
| -------------- | ---------- | ----------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
| Name | Description |
| -------------- | --------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
### project run {#project-run}
### project run {#project-run tag="command"}
Run a named command or workflow defined in the
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
@ -849,16 +845,16 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
> $ python -m spacy project run train
> ```
| Argument | Type | Description |
| --------------- | ---------- | ----------------------------------------------------------------- |
| `subcommand` | positional | Name of the command or workflow to run. |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
| `--dry`, `-D` | flag |  Perform a dry run and don't execute scripts. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **EXECUTES** | script | The command defined in the `project.yml`. |
| Name | Description |
| --------------- | --------------------------------------------------------------------------------------- |
| `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ |
| `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **EXECUTES** | The command defined in the `project.yml`. |
### project dvc {#project-dvc}
### project dvc {#project-dvc tag="command"}
Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
@ -890,11 +886,11 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
> python -m spacy project dvc all
> ```
| Argument | Type | Description |
| ----------------- | ---------- | --------------------------------------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
| `--force`, `-F` | flag | Force-updating config file. |
| `--verbose`, `-V` | flag |  Print more output generated by DVC. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | file | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
| Name | Description |
| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@ -40,7 +40,7 @@ Initialize a `Language` object.
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
## Language.from_config {#from_config tag="classmethod"}
## Language.from_config {#from_config tag="classmethod" new="3"}
Create a `Language` object from a loaded config. Will set up the tokenizer and
language data, add pipeline components based on the pipeline and components

View File

@ -70,7 +70,7 @@ Create a blank model of a given language class. This function is the twin of
| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
#### spacy.info {#spacy.info tag="function"}
### spacy.info {#spacy.info tag="function"}
The same as the [`info` command](/api/cli#info). Pretty-print information about
your installation, models and local setup from within spaCy. To get the model
@ -316,7 +316,7 @@ factories.
The following registries are added by the
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package.
See the [`Transformer`](/api/transformer) API reference and
[usage docs](/usage/transformers) for details.
[usage docs](/usage/embeddings-transformers) for details.
> #### Example
>
@ -585,20 +585,40 @@ A helper function to use in the `load()` method of a model package's
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
### util.get_model_meta {#util.get_model_meta tag="function" new="2"}
### util.load_config {#util.load_config tag="function" new="3"}
Get a model's meta.json from a directory path and validate its contents.
Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The
config typically includes details about the model pipeline and how its
components are created, as well as all training settings and hyperparameters.
> #### Example
>
> ```python
> meta = util.get_model_meta("/path/to/model")
> config = util.load_config("/path/to/model/config.cfg")
> print(config.to_str())
> ```
| Name | Description |
| ----------- | --------------------------------------------- |
| `path` | Path to model directory. ~~Union[str, Path]~~ |
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
| Name | Description |
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
| **RETURNS** | The model's config. ~~Config~~ |
### util.load_meta {#util.load_meta tag="function" new="3"}
Get a model's `meta.json` from a file path and validate its contents.
> #### Example
>
> ```python
> meta = util.load_meta("/path/to/model/meta.json")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------- |
| `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ |
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
### util.is_package {#util.is_package tag="function"}

View File

@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values,
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
package also adds the function registries [`@span_getters`](#span_getters) and
[`@annotation_setters`](#annotation_setters) with several built-in registered
functions. For more details, see the [usage documentation](/usage/transformers).
functions. For more details, see the
[usage documentation](/usage/embeddings-transformers).
## Config and implementation {#config}

View File

@ -77,12 +77,14 @@ or flagging duplicates. For example, you can suggest a user content that's
similar to what they're currently looking at, or label a support ticket as a
duplicate if it's very similar to an already existing one.
Each `Doc`, `Span` and `Token` comes with a
[`.similarity()`](/api/token#similarity) method that lets you compare it with
another object, and determine the similarity. Of course similarity is always
subjective – whether "dog" and "cat" are similar really depends on how you're
looking at it. spaCy's similarity model usually assumes a pretty general-purpose
definition of similarity.
Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and
[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity)
method that lets you compare it with another object, and determine the
similarity. Of course similarity is always subjective – whether "dog" and "cat"
are similar really depends on how you're looking at it. spaCy's similarity model
usually assumes a pretty general-purpose definition of similarity.
<!-- TODO: use better example here -->
```python
### {executable="true"}

View File

@ -0,0 +1,459 @@
---
title: Embeddings, Transformers and Transfer Learning
teaser: Using transformer embeddings like BERT in spaCy
menu:
- ['Embedding Layers', 'embedding-layers']
- ['Transformers', 'transformers']
- ['Static Vectors', 'static-vectors']
- ['Pretraining', 'pretraining']
next: /usage/training
---
<!-- TODO: intro, short explanation of embeddings/transformers, point user to processing pipelines docs for intro -->
## Shared embedding layers {#embedding-layers}
<!-- TODO: write: `Tok2Vec` and `Transformer` components -->
<Accordion title="What's the difference between word vectors and language models?" id="vectors-vs-language-models">
The key difference between [word vectors](#word-vectors) and contextual language
models such as [transformers](#transformers) is that word vectors model
**lexical types**, rather than _tokens_. If you have a list of terms with no
context around them, a transformer model like BERT can't really help you. BERT
is designed to understand language **in context**, which isn't what you have. A
word vectors table will be a much better fit for your task. However, if you do
have words in context — whole sentences or paragraphs of running text — word
vectors will only provide a very rough approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a
vector with a single indexing operation. Word vectors are therefore useful as a
way to **improve the accuracy** of neural network models, especially models that
are small or have received little or no pretraining. In spaCy, word vector
tables are only used as **static features**. spaCy does not backpropagate
gradients to the pretrained word vectors table. The static vectors table is
usually used in combination with a smaller table of learned task-specific
embeddings.
</Accordion>
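To make the "lexical types vs. tokens" point concrete, here's a minimal sketch comparing the static vector looked up for a word type with the vector of the same word in running text. It assumes a model with a word vectors table, such as `en_core_web_md`, is installed; with purely static vectors, both lookups return the same row.

```python
import spacy

nlp = spacy.load("en_core_web_md")  # any model with a static vectors table
lexeme_vector = nlp.vocab["cheese"].vector     # vector for the word type
token_vector = nlp("I like cheese")[2].vector  # vector for the token in context
# True: static vectors ignore the surrounding context
print((lexeme_vector == token_vector).all())
```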
<Accordion title="When should I add word vectors to my model?">
Word vectors are not compatible with most [transformer models](#transformers),
but if you're training another type of NLP network, it's almost always worth
adding word vectors to your model. As well as improving your final accuracy,
word vectors often make experiments more consistent, as the accuracy you reach
will be less sensitive to how the network is randomly initialized. High variance
due to random chance can slow down your progress significantly, as you need to
run many experiments to filter the signal from the noise.
Word vector features need to be enabled prior to training, and the same word
vectors table will need to be available at runtime as well. You cannot add word
vector features once the model has already been trained, and you usually cannot
replace one word vectors table with another without causing a significant loss
of performance.
</Accordion>
## Using transformer models {#transformers}
Transformers are a family of neural network architectures that compute **dense,
context-sensitive representations** for the tokens in your documents. Downstream
models in your pipeline can then use these representations as input features to
**improve their predictions**. You can connect multiple components to a single
transformer model, with any or all of those components giving feedback to the
transformer to fine-tune it to your tasks. spaCy's transformer support
interoperates with [PyTorch](https://pytorch.org) and the
[HuggingFace `transformers`](https://huggingface.co/transformers/) library,
giving you access to thousands of pretrained models for your pipelines. There
are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
transformer models, but for practical purposes, you can simply think of them as
a drop-in replacement that lets you achieve **higher accuracy** in exchange for
**higher training and runtime costs**.
### Setup and installation {#transformers-installation}
> #### System requirements
>
> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to
> work with transformer models. Make sure your GPU drivers are up to date and
> you have **CUDA v9+** installed.
> The exact requirements will depend on the transformer model. Training a
> transformer-based model without a GPU will be too slow for most practical
> purposes.
>
> Provisioning a new machine will require about **5GB** of data to be
> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB
> spaCy and dependencies.
Once you have CUDA installed, you'll need to install two pip packages,
[`cupy`](https://docs.cupy.dev/en/stable/install.html) and
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
is just like `numpy`, but for GPU. The best way to install it is to choose a
wheel that matches the version of CUDA you're using. You may also need to set
the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
non-standard location. Putting it all together, if you had installed CUDA 10.2
in `/opt/nvidia/cuda`, you would run:
```bash
### Installation with CUDA
export CUDA_PATH="/opt/nvidia/cuda"
pip install cupy-cuda102
pip install spacy-transformers
```
### Runtime usage {#transformers-runtime}
Transformer models can be used as **drop-in replacements** for other types of
neural networks, so your spaCy pipeline can include them in a way that's
completely invisible to the user. Users will download, load and use the model in
the standard way, like any other spaCy pipeline. Instead of using the
transformers as subnetworks directly, you can also use them via the
[`Transformer`](/api/transformer) pipeline component.
![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
The `Transformer` component sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformer's outputs at runtime.
```bash
$ python -m spacy download en_core_trf_lg
```
```python
### Example
import spacy
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
# Use the GPU, with memory allocations directed via PyTorch.
# This prevents out-of-memory errors that would otherwise occur from competing
# memory pools.
use_pytorch_for_gpu_memory()
require_gpu(0)
nlp = spacy.load("en_core_trf_lg")
for doc in nlp.pipe(["some text", "some other text"]):
tokvecs = doc._.trf_data.tensors[-1]
```
You can also customize how the [`Transformer`](/api/transformer) component sets
annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`.
This callback will be called with the raw input and output data for the whole
batch, along with the batch of `Doc` objects, allowing you to implement whatever
you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
containing the transformers data for the batch.
```python
import spacy
from spacy.tokens import Doc

def custom_annotation_setter(docs, trf_data):
    # For illustration: store each doc's share of the transformer output under
    # a (hypothetical) custom extension attribute instead of Doc._.trf_data
    for doc, data in zip(docs, trf_data.doc_data):
        doc._.custom_trf_data = data

Doc.set_extension("custom_trf_data", default=None)
nlp = spacy.load("en_core_trf_lg")
nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
doc = nlp("This is a text")
print(doc._.custom_trf_data)
```
### Training usage {#transformers-training}
The recommended workflow for training is to use spaCy's
[config system](/usage/training#config), usually via the
[`spacy train`](/api/cli#train) command. The training config defines all
component settings and hyperparameters in one place and lets you describe a tree
of objects by referring to creation functions, including functions you register
yourself. For details on how to get started with training your own model, check
out the [training quickstart](/usage/training#quickstart).
<Project id="en_core_bert">
The easiest way to get started is to clone a transformers-based project
template. Swap in your data, edit the settings and hyperparameters and train,
evaluate, package and visualize your model.
</Project>
The `[components]` section in the [`config.cfg`](/api/data-formats#config)
describes the pipeline components and the settings used to construct them,
including their model implementation. Here's a config snippet for the
[`Transformer`](/api/transformer) component, along with matching Python code. In
this case, the `[components.transformer]` block describes the `transformer`
component:
> #### Python equivalent
>
> ```python
> from spacy_transformers import Transformer, TransformerModel
> from spacy_transformers.annotation_setters import null_annotation_setter
> from spacy_transformers.span_getters import get_doc_spans
>
> trf = Transformer(
> nlp.vocab,
> TransformerModel(
> "bert-base-cased",
> get_spans=get_doc_spans,
> tokenizer_config={"use_fast": True},
> ),
> annotation_setter=null_annotation_setter,
> max_batch_items=4096,
> )
> ```
```ini
### config.cfg (excerpt)
[components.transformer]
factory = "transformer"
max_batch_items = 4096
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "doc_spans.v1"
[components.transformer.annotation_setter]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
```
The `[components.transformer.model]` block describes the `model` argument passed
to the transformer component. It's a Thinc
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
component. Here, it references the function
[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
registered in the [`architectures` registry](/api/top-level#registry). If a key
in a block starts with `@`, it's **resolved to a function** and all other
settings are passed to the function as arguments. In this case, `name`,
`tokenizer_config` and `get_spans`.
`get_spans` is a function that takes a batch of `Doc` objects and returns lists
of potentially overlapping `Span` objects for the transformer to process. Several
[built-in functions](/api/transformer#span-getters) are available – for example,
to process the whole document or individual sentences. When the config is
resolved, the function is created and passed into the model as an argument.
<Infobox variant="warning">
Remember that the `config.cfg` used for training should contain **no missing
values** and requires all settings to be defined. You don't want any hidden
defaults creeping in and changing your results! spaCy will tell you if settings
are missing, and you can run
[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
all defaults.
</Infobox>
### Customizing the settings {#transformers-training-custom-settings}
To change any of the settings, you can edit the `config.cfg` and re-run the
training. To change any of the functions, like the span getter, you can replace
the name of the referenced function, e.g. `@span_getters = "sent_spans.v1"` to
process sentences. You can also register your own functions using the
`span_getters` registry:
> #### config.cfg
>
> ```ini
> [components.transformer.model.get_spans]
> @span_getters = "custom_sent_spans"
> ```
```python
### code.py
import spacy_transformers

@spacy_transformers.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    # For illustration: return one span per sentence, equivalent to the
    # built-in "sent_spans.v1" getter
    def get_sent_spans(docs):
        return [list(doc.sents) for doc in docs]

    return get_sent_spans
```
To resolve the config during training, spaCy needs to know about your custom
function. You can make it available via the `--code` argument that can point to
a Python file. For more details on training with custom code, see the
[training documentation](/usage/training#custom-code).
```bash
$ python -m spacy train ./config.cfg --code ./code.py
```
### Customizing the model implementations {#training-custom-model}
The [`Transformer`](/api/transformer) component expects a Thinc
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
argument. You're not limited to the implementation provided by
`spacy-transformers` – the only requirement is that your registered function
must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
transformer data.
> #### Model type annotations
>
> In the documentation and code base, you may come across type annotations and
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
> List[Floats2d]]~~. This so-called generic type describes the layer and its
> input and output type – in this case, it takes a list of `Doc` objects as the
> input and a list of 2-dimensional arrays of floats as the output. You can read
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
> how to enable linting in your editor to see live feedback if your inputs and
> outputs don't match.
The same idea applies to task models that power the **downstream components**.
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This
is where we'll plug in our transformer model, using the
[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily
delegates to the `Transformer` pipeline component.
```ini
### config.cfg (excerpt) {highlight="12"}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```
The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument
`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This layer
determines how the vector for each spaCy token will be computed from the zero or
more source rows the token is aligned against. Here we use the
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
averages the wordpiece rows. We could instead use
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
function you write yourself.
You can have multiple components all listening to the same transformer model,
and all passing gradients back to it. By default, all of the gradients will be
**equally weighted**. You can control this with the `grad_factor` setting, which
lets you reweight the gradients from the different listeners. For instance,
setting `grad_factor = 0` would disable gradients from one of the listeners,
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
custom learning rate for each component. Instead of a constant, you can also
provide a schedule, allowing you to freeze the shared parameters at the start of
training.
## Static vectors {#static-vectors}
<!-- TODO: write -->
### Using word vectors in your models {#word-vectors-models}
Many neural network models are able to use word vector tables as additional
features, which sometimes results in significant improvements in accuracy.
spaCy's built-in embedding layer,
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
word vector tables using the `also_use_static_vectors` flag. This setting is
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
layer, which builds the default token-to-vector encoding architecture.
```ini
[tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 128
rows = 7000
also_embed_subwords = true
also_use_static_vectors = true
```
<Infobox title="How it works" emoji="💡">
The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in
the `architectures` [registry](/api/top-level#registry), and call the returned
object with the rest of the arguments from the block. This will result in a call
to the
[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py)
function, which will return a [Thinc](https://thinc.ai) model object with the
type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer
takes a list of `Doc` objects as input, it does not need to store a copy of the
vectors table. The vectors will be retrieved from the `Doc` objects that are
passed in, via the `doc.vocab.vectors` attribute. This part of the process is
handled by the [StaticVectors](/api/architectures#StaticVectors) layer.
</Infobox>
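For illustration, here's a rough sketch of that lookup done by hand. It mirrors the registry name and argument values from the config block above and isn't something you'd normally need to write yourself; it only shows what the config system does when it resolves the block.

```python
from spacy.util import registry

# Look up the registered architecture function, as the config system would
make_embed = registry.architectures.get("spacy.MultiHashEmbed.v1")
# Call it with the arguments from the [tagger.model.tok2vec.embed] block
embed = make_embed(
    width=128, rows=7000, also_embed_subwords=True, also_use_static_vectors=True
)
print(embed.name)  # a Thinc Model[List[Doc], List[Floats2d]]
```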
#### Creating a custom embedding layer {#custom-embedding-layer}
The [MultiHashEmbed](/api/architectures#MultiHashEmbed) layer is spaCy's
recommended strategy for constructing initial word representations for your
neural network models, but you can also implement your own. You can register any
function to a string name, and then reference that function within your config
(see the [training docs](/usage/training) for more details). To try this out,
you can save the following little example to a new Python file:
```python
from typing import List
from thinc.api import Model
from thinc.types import Floats2d
from spacy.ml.staticvectors import StaticVectors
from spacy.tokens import Doc
from spacy.util import registry

print("I was imported!")

@registry.architectures("my_example.MyEmbedding.v1")
def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]:
    print("I was called!")
    return StaticVectors(nO=output_width)
```
If you pass the path to your file to the [`spacy train`](/api/cli#train) command
using the `--code` argument, your file will be imported, which means the
decorator registering the function will be run. Your function is now on equal
footing with any of spaCy's built-ins, so you can drop it in instead of any
other model with the same input and output signature. For instance, you could
use it in the tagger model as follows:
```ini
[tagger.model.tok2vec.embed]
@architectures = "my_example.MyEmbedding.v1"
output_width = 128
```
Now that you have a custom function wired into the network, you can start
implementing the logic you're interested in. For example, let's say you want to
try a relatively simple embedding strategy that makes use of static word
vectors, but combines them via summation with a smaller table of learned
embeddings.
```python
from typing import Dict, List
from thinc.api import add, chain, remap_ids, Embed, Model
from thinc.types import Floats2d
from spacy.ml.staticvectors import StaticVectors
from spacy.ml.featureextractor import FeatureExtractor
from spacy.tokens import Doc
from spacy.util import registry

@registry.architectures("my_example.MyEmbedding.v1")
def MyCustomVectors(
    output_width: int,
    vector_width: int,
    embed_rows: int,
    key2row: Dict[int, int]
) -> Model[List[Doc], List[Floats2d]]:
    # Sum the static vectors with a smaller table of learned hash embeddings
    return add(
        StaticVectors(nO=output_width),
        chain(
            FeatureExtractor(["ORTH"]),
            remap_ids(key2row),
            Embed(nO=output_width, nV=embed_rows)
        )
    )
```
## Pretraining {#pretraining}
<!-- TODO: write -->

View File

@ -9,6 +9,7 @@ menu:
- ['Tokenization', 'tokenization']
- ['Merging & Splitting', 'retokenization']
- ['Sentence Segmentation', 'sbd']
- ['Vectors & Similarity', 'vectors-similarity']
- ['Language data', 'language-data']
---
@ -1024,10 +1025,10 @@ produced by the tokenizer.
>
> If you're working with transformer models like BERT, check out the
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
> extension package and [documentation](/usage/transformers). It includes a
> pipeline component for using pretrained transformer weights and **training
> transformer models** in spaCy, as well as helpful utilities for aligning word
> pieces to linguistic tokenization.
> extension package and [documentation](/usage/embeddings-transformers). It
> includes a pipeline component for using pretrained transformer weights and
> **training transformer models** in spaCy, as well as helpful utilities for
> aligning word pieces to linguistic tokenization.
```python
### Custom BERT word piece tokenizer
@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
</Infobox>
Here's an example of a component that implements a pre-processing rule for
splitting on `'...'` tokens. The component is added before the parser, which is
splitting on `"..."` tokens. The component is added before the parser, which is
then used to further segment the text. That's possible, because `is_sent_start`
is only set to `True` for some of the tokens – all others still specify `None`
for unset sentence boundaries. This approach can be useful if you want to
@ -1540,6 +1541,152 @@ doc = nlp(text)
print("After:", [sent.text for sent in doc.sents])
```
## Word vectors and semantic similarity {#vectors-similarity}
import Vectors101 from 'usage/101/\_vectors-similarity.md'
<Vectors101 />
<Infobox title="What to expect from similarity results" variant="warning">
Computing similarity scores can be helpful in many situations, but it's also
important to maintain **realistic expectations** about what information it can
provide. Words can be related to each other in many ways, so a single
"similarity" score will always be a **mix of different signals**, and vectors
trained on different data can produce very different results that may not be
useful for your purpose.
Also note that the similarity of `Doc` or `Span` objects defaults to the
**average** of the token vectors. This means it's insensitive to the order of
the words. Two documents expressing the same meaning with dissimilar wording
will return a lower similarity score than two documents that happen to contain
the same words while expressing different meanings.
</Infobox>
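To see that order-insensitivity in practice, here's a minimal sketch, assuming a model with word vectors such as `en_core_web_md` is installed: two documents that use the same words in a different order produce the same averaged vector, so the score comes out at (virtually) 1.0.

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("dogs chase cats")
doc2 = nlp("cats chase dogs")
# Same tokens, different meaning: the averaged vectors are identical
print(doc1.similarity(doc2))  # ~1.0
```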
### Adding word vectors {#adding-vectors}
Custom word vectors can be trained using a number of open-source libraries, such
as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc),
or Tomas Mikolov's original
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
word vector libraries output an easy-to-read text-based format, where each line
consists of the word followed by its vector. For everyday use, we want to
convert the vectors model into a binary format that loads faster and takes up
less space on disk. The easiest way to do this is the
[`init model`](/api/cli#init-model) command-line utility. This will output a
spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to
some nice Latin vectors. You can then pass the directory path to
[`spacy.load`](/api/top-level#spacy.load).
> #### Usage example
>
> ```python
> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
> doc1 = nlp_latin("Caecilius est in horto")
> doc2 = nlp_latin("servus est in atrio")
> doc1.similarity(doc2)
> ```
```bash
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
```
<Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
To help you strike a good balance between coverage and memory usage, spaCy's
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
row** of the table. If you're using the
[`spacy init model`](/api/cli#init-model) command to create a vocabulary,
pruning the vectors will be taken care of automatically if you set the
`--prune-vectors` flag. You can also do it manually in the following steps:
1. Start with a **word vectors model** that covers a huge vocabulary. For
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
model provides 300-dimensional GloVe vectors for over 1 million terms of
English.
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
lexemes will be sorted by descending probability to determine which vectors
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
vectors you want to keep.
```python
nlp = spacy.load('en_vectors_web_lg')
n_vectors = 105000 # number of vectors to keep
removed_words = nlp.vocab.prune_vectors(n_vectors)
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
```
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
table to a given number of unique entries, and returns a dictionary containing
the removed words, mapped to `(string, score)` tuples, where `string` is the
entry the removed word was mapped to, and `score` the similarity score between
the two words.
```python
### Removed words
{
"Shore": ("coast", 0.732257),
"Precautionary": ("caution", 0.490973),
"hopelessness": ("sadness", 0.742366),
"Continous": ("continuous", 0.732549),
"Disemboweled": ("corpse", 0.499432),
"biostatistician": ("scientist", 0.339724),
"somewheres": ("somewheres", 0.402736),
"observing": ("observe", 0.823096),
"Leaving": ("leaving", 1.0),
}
```
In the example above, the vector for "Shore" was removed and remapped to the
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
the vector of "leaving", which is identical. If you're using the
[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors`
option to easily reduce the size of the vectors as you add them to a spaCy
model:
```bash
$ python -m spacy init model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
```
This will create a spaCy model with vectors for the first 10,000 words in the
vectors model. All other words in the vectors model are mapped to the closest
vector among those retained.
</Accordion>
### Adding vectors individually {#adding-individual-vectors}
The `vector` attribute is a **read-only** numpy or cupy array (depending on
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
array is read-only so that spaCy can avoid unnecessary copy operations where
possible. You can modify the vectors via the [`Vocab`](/api/vocab) or
[`Vectors`](/api/vectors) table. Using the
[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach
if you have vectors in an arbitrary format, as you can read in the vectors with
your own logic, and just set them with a simple loop. This method is likely to
be slower than approaches that work with the whole vectors table at once, but
it's a great approach for once-off conversions before you save out your model to
disk.
```python
### Adding vectors
import numpy
from spacy.vocab import Vocab
vector_data = {
"dog": numpy.random.uniform(-1, 1, (300,)),
"cat": numpy.random.uniform(-1, 1, (300,)),
"orange": numpy.random.uniform(-1, 1, (300,))
}
vocab = Vocab()
for word, vector in vector_data.items():
vocab.set_vector(word, vector)
```
## Language data {#language-data}
import LanguageData101 from 'usage/101/\_language-data.md'

View File

@ -1,6 +1,6 @@
---
title: Language Processing Pipelines
next: /usage/vectors-embeddings
next: /usage/embeddings-transformers
menu:
- ['Processing Text', 'processing']
- ['How Pipelines Work', 'pipelines']
@ -324,9 +324,9 @@ pretrained components and new components trained on your data.
When reusing components across models, keep in mind that the **vocabulary**,
**vectors** and model settings **must match**. If a pretrained model includes
[word vectors](/usage/vectors-embeddings) and the component uses them as
features, the model you copy it to needs to have the _same_ vectors available –
otherwise, it won't be able to make the same predictions.
[word vectors](/usage/linguistic-features#vectors-similarity) and the component
uses them as features, the model you copy it to needs to have the _same_ vectors
available – otherwise, it won't be able to make the same predictions.
</Infobox>
@ -1202,7 +1202,7 @@ document similarity method.
Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token`
objects by adding a component to the pipeline. For instance, to customize the
[`Doc.similarity`](/api/doc#similarity) method, you can add a component that
sets a custom function to `doc.user_hooks['similarity']`. The built-in
sets a custom function to `doc.user_hooks["similarity"]`. The built-in
`Doc.similarity` method will check the `user_hooks` dict, and delegate to your
function if you've set one. Similar results can be achieved by setting functions
to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
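As a quick sketch of what such a component could look like (the component name and the scoring logic below are made up for this example):

```python
import spacy
from spacy.language import Language

@Language.component("length_similarity_hook")
def length_similarity_hook(doc):
    # Overwrite Doc.similarity with a trivial length-based comparison
    doc.user_hooks["similarity"] = lambda a, b: min(len(a), len(b)) / max(len(a), len(b))
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("length_similarity_hook")
doc1 = nlp("Hello world")
doc2 = nlp("Hello there world")
print(doc1.similarity(doc2))  # 0.666..., delegated to the custom hook
```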

View File

@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md'
To learn more about word vectors, how to **customize them** and how to load
**your own vectors** into spaCy, see the usage guide on
[using word vectors and semantic similarities](/usage/vectors-embeddings).
[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity).
</Infobox>

View File

@ -30,7 +30,7 @@ ready-to-use spaCy models.
</Infobox>
## Quickstart {#quickstart}
## Quickstart {#quickstart tag="new"}
The recommended way to train your spaCy models is via the
[`spacy train`](/api/cli#train) command on the command line. It only needs a
@ -131,7 +131,7 @@ Some of the main advantages and features of spaCy's training config are:
multiple components, define them once and reference them as
[variables](#config-interpolation).
- **Reproducibility with no hidden defaults.** The config file is the "single
source of truth" and includes all settings. <!-- TODO: explain this better -->
source of truth" and includes all settings.
- **Automated checks and validation.** When you load a config, spaCy checks if
the settings are complete and if all values have the correct types. This lets
you catch potential mistakes early. In your custom architectures, you can use
@ -667,7 +667,7 @@ visualize your model.
For more details on how to integrate transformer models into your training
config and customize the implementations, see the usage guide on
[training transformers](/usage/transformers#training).
[training transformers](/usage/embeddings-transformers#transformers-training).
### Pretraining with spaCy {#pretraining}

View File

@ -218,7 +218,7 @@ available via `token.orth`.
The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors
assigned to strings, and lets you assign vectors individually, or
[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a
[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a
directory. To help you strike a good balance between coverage and memory usage,
the `Vectors` class lets you map **multiple keys** to the **same row** of the
table. If you're using the [`spacy init-model`](/api/cli#init-model) command to

View File

@ -30,7 +30,7 @@ menu:
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage:** [Transformers](/usage/transformers),
- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers),
[Training models](/usage/training)
- **API:** [`Transformer`](/api/transformer),
[`TransformerData`](/api/transformer#transformerdata),
@ -59,13 +59,13 @@ menu:
### New built-in pipeline components {#features-pipeline-components}
| Name | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
| Name | Description |
| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
<Infobox title="Details & Documentation" emoji="📖" list>
@ -140,22 +140,20 @@ in your config and see validation errors if the argument values don't match.
The following methods, attributes and commands are new in spaCy v3.0.
| Name | Description |
| ------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. |
| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. |
| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| Name | Description |
| ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
| [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
## Backwards Incompatibilities {#incompat}
@ -420,15 +418,20 @@ $ python -m spacy convert ./training.json ./output
#### Training config {#migrating-training-config}
The easiest way to get started with a training config is to use the
[`init config`](/api/cli#init-config) command. You can start off with a blank
config for a new model, copy the config from an existing model, or auto-fill a
partial config like a starter config generated by our
[quickstart widget](/usage/training#quickstart).
[`init config`](/api/cli#init-config) command or the
[quickstart widget](/usage/training#quickstart). You can define your
requirements, and it will auto-generate a starter config with the best-matching
default settings.
```bash
python -m spacy init-config ./config.cfg --lang en --pipeline tagger,parser
$ python -m spacy init config ./config.cfg --lang en --pipeline tagger,parser
```
If you've exported a starter config from our
[quickstart widget](/usage/training#quickstart), you can use the
[`init fill-config`](/api/cli#init-fill-config) command to fill it with all default
values. You can then use the auto-generated `config.cfg` for training:
```diff
### {wrap="true"}
- python -m spacy train en ./output ./train.json ./dev.json --pipeline tagger,parser --cnn-window 1 --bilstm-depth 0

View File

@ -1,224 +0,0 @@
---
title: Vectors and Embeddings
menu:
- ["What's a Word Vector?", 'whats-a-vector']
- ['Word Vectors', 'vectors']
- ['Other Embeddings', 'embeddings']
next: /usage/transformers
---
An old idea in linguistics is that you can "know a word by the company it
keeps": that is, word meanings can be understood relationally, based on their
patterns of usage. This idea inspired a branch of NLP research known as
"distributional semantics" that has aimed to compute databases of lexical
knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec)
family of algorithms are a key milestone in this line of research. For
simplicity, we will refer to a distributional word representation as a "word
vector", and algorithms that compute word vectors (such as
[GloVe](https://nlp.stanford.edu/projects/glove/),
[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms".
Word vector tables are included in some of the spaCy [model packages](/models)
we distribute, and you can easily create your own model packages with word
vectors you train or download yourself. In some cases you can also add word
vectors to an existing pipeline, although each pipeline can only have a single
word vectors table, and a model package that already has word vectors is
unlikely to work correctly if you replace the vectors with new ones.
## What's a word vector? {#whats-a-vector}
For spaCy's purposes, a "word vector" is a 1-dimensional slice from a
2-dimensional **vectors table**, with a deterministic mapping from word types to
rows in the table.
```python
def what_is_a_word_vector(
word_id: int,
key2row: Dict[int, int],
vectors_table: Floats2d,
*,
default_row: int=0
) -> Floats1d:
return vectors_table[key2row.get(word_id, default_row)]
```
Word2vec algorithms try to produce vectors tables that let you estimate useful
relationships between words using simple linear algebra operations. For
instance, you can often find close synonyms of a word by finding the vectors
closest to it by cosine distance, and then finding the words that are mapped to
those neighboring vectors. Word vectors can also be useful as features in
statistical models.
### Word vectors vs. contextual language models {#vectors-vs-language-models}
The key difference between word vectors and contextual language models such as
ElMo, BERT and GPT-2 is that word vectors model **lexical types**, rather than
_tokens_. If you have a list of terms with no context around them, a model like
BERT can't really help you. BERT is designed to understand language **in
context**, which isn't what you have. A word vectors table will be a much better
fit for your task. However, if you do have words in context — whole sentences or
paragraphs of running text — word vectors will only provide a very rough
approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a
vector with a single indexing operation. Word vectors are therefore useful as a
way to **improve the accuracy** of neural network models, especially models that
are small or have received little or no pretraining. In spaCy, word vector
tables are only used as **static features**. spaCy does not backpropagate
gradients to the pretrained word vectors table. The static vectors table is
usually used in combination with a smaller table of learned task-specific
embeddings.
## Using word vectors directly {#vectors}
spaCy stores word vector information in the
[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole
vectors table from most spaCy objects. You can also access the vector for a
[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or
[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or
`Span` has multiple tokens, the average of the word vectors will be returned,
excluding any "out of vocabulary" entries that have no vector available. If none
of the words have a vector, a zeroed vector will be returned.
The `vector` attribute is a **read-only** numpy or cupy array (depending on
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
array is read-only so that spaCy can avoid unnecessary copy operations where
possible. You can modify the vectors via the `Vocab` or `Vectors` table.
### Converting word vectors for use in spaCy
Custom word vectors can be trained using a number of open-source libraries, such
as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc),
or Tomas Mikolov's original
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
word vector libraries output an easy-to-read text-based format, where each line
consists of the word followed by its vector. For everyday use, we want to
convert the vectors model into a binary format that loads faster and takes up
less space on disk. The easiest way to do this is the
[`init-model`](/api/cli#init-model) command-line utility:
```bash
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
```
This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`,
giving you access to some nice Latin vectors 😉 You can then pass the directory
path to [`spacy.load()`](/api/top-level#spacy.load).
```python
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
doc1 = nlp_latin("Caecilius est in horto")
doc2 = nlp_latin("servus est in atrio")
doc1.similarity(doc2)
```
The model directory will have a `/vocab` directory with the strings, lexical
entries and word vectors from the input vectors model. The
[`init-model`](/api/cli#init-model) command supports a number of archive formats
for the word vectors: the vectors can be in plain text (`.txt`), zipped
(`.zip`), or tarred and zipped (`.tgz`).
### Optimizing vector coverage {#custom-vectors-coverage new="2"}
To help you strike a good balance between coverage and memory usage, spaCy's
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
row** of the table. If you're using the
[`spacy init-model`](/api/cli#init-model) command to create a vocabulary,
pruning the vectors will be taken care of automatically if you set the
`--prune-vectors` flag. You can also do it manually in the following steps:
1. Start with a **word vectors model** that covers a huge vocabulary. For
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
model provides 300-dimensional GloVe vectors for over 1 million terms of
English.
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
lexemes will be sorted by descending probability to determine which vectors
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
vectors you want to keep.
```python
nlp = spacy.load('en_vectors_web_lg')
n_vectors = 105000 # number of vectors to keep
removed_words = nlp.vocab.prune_vectors(n_vectors)
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
```
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
table to a given number of unique entries, and returns a dictionary containing
the removed words, mapped to `(string, score)` tuples, where `string` is the
entry the removed word was mapped to, and `score` the similarity score between
the two words.
```python
### Removed words
{
"Shore": ("coast", 0.732257),
"Precautionary": ("caution", 0.490973),
"hopelessness": ("sadness", 0.742366),
"Continous": ("continuous", 0.732549),
"Disemboweled": ("corpse", 0.499432),
"biostatistician": ("scientist", 0.339724),
"somewheres": ("somewheres", 0.402736),
"observing": ("observe", 0.823096),
"Leaving": ("leaving", 1.0),
}
```
In the example above, the vector for "Shore" was removed and remapped to the
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
the vector of "leaving", which is identical. If you're using the
[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors`
option to easily reduce the size of the vectors as you add them to a spaCy
model:
```bash
$ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
```
This will create a spaCy model with vectors for the first 10,000 words in the
vectors model. All other words in the vectors model are mapped to the closest
vector among those retained.
### Adding vectors {#adding-vectors}
```python
### Adding vectors
from spacy.vocab import Vocab
vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
"cat": numpy.random.uniform(-1, 1, (300,)),
"orange": numpy.random.uniform(-1, 1, (300,))}
vocab = Vocab()
for word, vector in vector_data.items():
vocab.set_vector(word, vector)
```
### Using custom similarity methods {#custom-similarity}
By default, [`Token.vector`](/api/token#vector) returns the vector for its
underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and
[`Span.vector`](/api/span#vector) return an average of the vectors of their
tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
`doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
<Infobox title="Custom user hooks" emoji="📖">
For more details on **adding hooks** and **overwriting** the built-in `Doc`,
`Span` and `Token` methods, see the usage guide on
[user hooks](/usage/processing-pipelines#custom-components-user-hooks).
</Infobox>
<!-- TODO:
### Storing vectors on a GPU {#gpu}
-->
## Other embeddings {#embeddings}
<!-- TODO: something about other embeddings -->

View File

@ -18,8 +18,11 @@
{ "text": "Linguistic Features", "url": "/usage/linguistic-features" },
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
{ "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" },
{ "text": "Transformers", "url": "/usage/transformers", "tag": "new" },
{
"text": "Embeddings & Transformers",
"url": "/usage/embeddings-transformers",
"tag": "new"
},
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },

View File

@ -62,11 +62,12 @@ function linkType(el, showLink = true) {
export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
// Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting
const TMP_DOT = ''
const TMP_DOT = '۔'
const code = Array.isArray(children) ? children.join('') : children || ''
const rawStr = code.replace('.', TMP_DOT)
const [rawText, meta] = code.split(/(?= \(.+\)$)/)
const rawStr = rawText.replace(/\./g, TMP_DOT)
const rawHtml = lang === 'none' || !code ? code : highlightCode(lang, rawStr)
const html = rawHtml.replace(TMP_DOT, '.').replace(/\n/g, ' ')
const html = rawHtml.replace(new RegExp(TMP_DOT, 'g'), '.').replace(/\n/g, ' ')
const result = htmlToReact(html)
const elements = Array.isArray(result) ? result : [result]
const annotClassNames = classNames(
@ -83,6 +84,7 @@ export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
{elements.map((el, i) => (
<Fragment key={i}>{linkType(el, !!link)}</Fragment>
))}
{meta && <span className={classes.typeAnnotationMeta}>{meta}</span>}
</code>
)
}

View File

@ -37,7 +37,7 @@ function isDividerRow(children) {
}
function isFootRow(children) {
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
if (children.length && children[0].props.name === 'td') {
const cellChildren = children[0].props.children
if (

View File

@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util'
import classes from '../styles/typography.module.sass'
export const H1 = ({ Component = 'h1', className, ...props }) => (
<Headline Component={Component} className={classNames(classes.h1, className)} {...props} />
<Headline
Component={Component}
className={classNames(classes.h1, className)}
permalink={false}
{...props}
/>
)
export const H2 = ({ className, ...props }) => (
<Headline Component="h2" className={classNames(classes.h2, className)} {...props} />
@ -90,6 +95,7 @@ const Headline = ({
source,
hidden,
action,
permalink = true,
className,
children,
}) => {
@ -102,7 +108,7 @@ const Headline = ({
const tags = tag ? tag.split(',').map(t => t.trim()) : []
return (
<Component id={id} name={name} className={headingClassNames}>
<Permalink id={id}>{children} </Permalink>
<Permalink id={permalink ? id : null}>{children} </Permalink>
{tags.map((tag, i) => (
<Tag spaced key={i}>
{tag}

View File

@ -88,6 +88,10 @@
text-transform: uppercase
margin-right: 5px
.type-annotation-meta
font-size: 90%
color: var(--color-subtle-dark)
.wrap
white-space: pre-wrap
word-wrap: anywhere