mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge remote-tracking branch 'upstream/develop' into feature/more-v3-docs
This commit is contained in:
commit
f7b76d2d83
|
@ -36,11 +36,11 @@ redirects = [
|
|||
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
||||
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
||||
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
|
||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true},
|
||||
{from = "/tutorials", to = "/usage/examples", force = true},
|
||||
# Old documentation pages (v2.x)
|
||||
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
|
||||
{from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true},
|
||||
{from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true},
|
||||
{from = "/api/goldparse", to = "/api/top-level", force = true},
|
||||
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
|
||||
{from = "/api/annotation", to = "/api/data-formats", force = true},
|
||||
|
|
|
@ -3,7 +3,7 @@ from pathlib import Path
|
|||
from collections import Counter
|
||||
import sys
|
||||
import srsly
|
||||
from wasabi import Printer, MESSAGES, msg, diff_strings
|
||||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
|
@ -32,8 +32,6 @@ def debug_config_cli(
|
|||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
|
||||
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
|
||||
# fmt: on
|
||||
):
|
||||
"""Debug a config.cfg file and show validation errors. The command will
|
||||
|
@ -49,18 +47,8 @@ def debug_config_cli(
|
|||
import_code(code_path)
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
||||
if auto_fill:
|
||||
orig_config = config.to_str()
|
||||
filled_config = nlp.config.to_str()
|
||||
if orig_config == filled_config:
|
||||
msg.good("Original config is valid, no values were auto-filled")
|
||||
else:
|
||||
msg.good("Auto-filled config is valid")
|
||||
if diff:
|
||||
print(diff_strings(config.to_str(), nlp.config.to_str()))
|
||||
else:
|
||||
msg.good("Original config is valid")
|
||||
nlp, _ = util.load_model_from_config(config)
|
||||
msg.good("Original config is valid")
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
|
|
|
@ -249,7 +249,16 @@ def load_model_from_package(
|
|||
disable: Iterable[str] = tuple(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from an installed package."""
|
||||
"""Load a model from an installed package.
|
||||
|
||||
name (str): The package name.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
cls = importlib.import_module(name)
|
||||
return cls.load(vocab=vocab, disable=disable, config=config)
|
||||
|
||||
|
@ -263,7 +272,17 @@ def load_model_from_path(
|
|||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from a data directory path. Creates Language class with
|
||||
pipeline from config.cfg and then calls from_disk() with path."""
|
||||
pipeline from config.cfg and then calls from_disk() with path.
|
||||
|
||||
name (str): Package name or model path.
|
||||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
if not model_path.exists():
|
||||
raise IOError(Errors.E052.format(path=model_path))
|
||||
if not meta:
|
||||
|
@ -284,6 +303,15 @@ def load_model_from_config(
|
|||
) -> Tuple["Language", Config]:
|
||||
"""Create an nlp object from a config. Expects the full config file including
|
||||
a section "nlp" containing the settings for the nlp object.
|
||||
|
||||
name (str): Package name or model path.
|
||||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
auto_fill (bool): Whether to auto-fill config with missing defaults.
|
||||
validate (bool): Whether to show config validation errors.
|
||||
RETURNS (Tuple[Language, Config]): The loaded nlp object and config.
|
||||
"""
|
||||
if "nlp" not in config:
|
||||
raise ValueError(Errors.E985.format(config=config))
|
||||
|
@ -308,6 +336,13 @@ def load_model_from_init_py(
|
|||
) -> "Language":
|
||||
"""Helper function to use in the `load()` method of a model package's
|
||||
__init__.py.
|
||||
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
model_path = Path(init_file).parent
|
||||
meta = get_model_meta(model_path)
|
||||
|
@ -325,7 +360,14 @@ def load_config(
|
|||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
interpolate: bool = False,
|
||||
) -> Config:
|
||||
"""Load a config file. Takes care of path validation and section order."""
|
||||
"""Load a config file. Takes care of path validation and section order.
|
||||
|
||||
path (Union[str, Path]): Path to the config file.
|
||||
overrides (Dict[str, Any]): Config overrides as nested dict or
|
||||
dict keyed by section values in dot notation.
|
||||
interpolate (bool): Whether to interpolate and resolve variables.
|
||||
RETURNS (Config): The loaded config.
|
||||
"""
|
||||
config_path = ensure_path(path)
|
||||
if not config_path.exists() or not config_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||
|
@ -337,7 +379,12 @@ def load_config(
|
|||
def load_config_from_str(
|
||||
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
|
||||
):
|
||||
"""Load a full config from a string."""
|
||||
"""Load a full config from a string. Wrapper around Thinc's Config.from_str.
|
||||
|
||||
text (str): The string config to load.
|
||||
interpolate (bool): Whether to interpolate and resolve variables.
|
||||
RETURNS (Config): The loaded config.
|
||||
"""
|
||||
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
||||
text, overrides=overrides, interpolate=interpolate,
|
||||
)
|
||||
|
@ -435,19 +482,18 @@ def get_base_version(version: str) -> str:
|
|||
return Version(version).base_version
|
||||
|
||||
|
||||
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
||||
"""Get model meta.json from a directory path and validate its contents.
|
||||
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
||||
"""Load a model meta.json from a path and validate its contents.
|
||||
|
||||
path (str / Path): Path to model directory.
|
||||
RETURNS (Dict[str, Any]): The model's meta data.
|
||||
path (Union[str, Path]): Path to meta.json.
|
||||
RETURNS (Dict[str, Any]): The loaded meta.
|
||||
"""
|
||||
model_path = ensure_path(path)
|
||||
if not model_path.exists():
|
||||
raise IOError(Errors.E052.format(path=model_path))
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
|
||||
meta = srsly.read_json(meta_path)
|
||||
path = ensure_path(path)
|
||||
if not path.parent.exists():
|
||||
raise IOError(Errors.E052.format(path=path.parent))
|
||||
if not path.exists() or not path.is_file():
|
||||
raise IOError(Errors.E053.format(path=path, name="meta.json"))
|
||||
meta = srsly.read_json(path)
|
||||
for setting in ["lang", "name", "version"]:
|
||||
if setting not in meta or not meta[setting]:
|
||||
raise ValueError(Errors.E054.format(setting=setting))
|
||||
|
@ -471,6 +517,16 @@ def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
|||
return meta
|
||||
|
||||
|
||||
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
||||
"""Get model meta.json from a directory path and validate its contents.
|
||||
|
||||
path (str / Path): Path to model directory.
|
||||
RETURNS (Dict[str, Any]): The model's meta data.
|
||||
"""
|
||||
model_path = ensure_path(path)
|
||||
return load_meta(model_path / "meta.json")
|
||||
|
||||
|
||||
def is_package(name: str) -> bool:
|
||||
"""Check if string maps to a package installed via pip.
|
||||
|
||||
|
|
|
@ -243,11 +243,15 @@ Encode context using bidirectional LSTM layers. Requires
|
|||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||
|
||||
### spacy.StaticVectors.v1 {#StaticVectors}
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
|
||||
|
||||
The following architectures are provided by the package
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the
|
||||
[usage documentation](/usage/transformers) for how to integrate the
|
||||
[usage documentation](/usage/embeddings-transformers) for how to integrate the
|
||||
architectures into your training config.
|
||||
|
||||
### spacy-transformers.TransformerModel.v1 {#TransformerModel}
|
||||
|
|
|
@ -3,17 +3,17 @@ title: Command Line Interface
|
|||
teaser: Download, train and package models, and debug spaCy
|
||||
source: spacy/cli
|
||||
menu:
|
||||
- ['Download', 'download']
|
||||
- ['Info', 'info']
|
||||
- ['Validate', 'validate']
|
||||
- ['Init', 'init']
|
||||
- ['Convert', 'convert']
|
||||
- ['Debug', 'debug']
|
||||
- ['Train', 'train']
|
||||
- ['Pretrain', 'pretrain']
|
||||
- ['Evaluate', 'evaluate']
|
||||
- ['Package', 'package']
|
||||
- ['Project', 'project']
|
||||
- ['download', 'download']
|
||||
- ['info', 'info']
|
||||
- ['validate', 'validate']
|
||||
- ['init', 'init']
|
||||
- ['convert', 'convert']
|
||||
- ['debug', 'debug']
|
||||
- ['train', 'train']
|
||||
- ['pretrain', 'pretrain']
|
||||
- ['evaluate', 'evaluate']
|
||||
- ['package', 'package']
|
||||
- ['project', 'project']
|
||||
---
|
||||
|
||||
spaCy's CLI provides a range of helpful commands for downloading and training
|
||||
|
@ -22,7 +22,7 @@ list of available commands, you can type `python -m spacy --help`. You can also
|
|||
add the `--help` flag to any command or subcommand to see the description,
|
||||
available arguments and usage.
|
||||
|
||||
## Download {#download}
|
||||
## download {#download tag="command"}
|
||||
|
||||
Download [models](/usage/models) for spaCy. The downloader finds the
|
||||
best-matching compatible version and uses `pip install` to download the model as
|
||||
|
@ -43,15 +43,15 @@ the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
|
|||
$ python -m spacy download [model] [--direct] [pip args]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | positional | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). |
|
||||
| `--direct`, `-d` | flag | Force direct download of exact model version. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| pip args <Tag variant="new">2.1</Tag> | option / flag | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
|
||||
| **CREATES** | directory | The installed model package in your `site-packages` directory. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
|
||||
| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The installed model package in your `site-packages` directory. |
|
||||
|
||||
## Info {#info}
|
||||
## info {#info tag="command"}
|
||||
|
||||
Print information about your spaCy installation, models and local setup, and
|
||||
generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to
|
||||
|
@ -65,15 +65,15 @@ $ python -m spacy info [--markdown] [--silent]
|
|||
$ python -m spacy info [model] [--markdown] [--silent]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------ | ---------- | ---------------------------------------------- |
|
||||
| `model` | positional | A model, i.e. package name or path (optional). |
|
||||
| `--markdown`, `-md` | flag | Print information as Markdown. |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | flag | Don't print anything, just return the values. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **PRINTS** | `stdout` | Information about your spaCy installation. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | ------------------------------------------------------------------------------ |
|
||||
| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Information about your spaCy installation. |
|
||||
|
||||
## Validate {#validate new="2"}
|
||||
## validate {#validate new="2" tag="command"}
|
||||
|
||||
Find all models installed in the current environment and check whether they are
|
||||
compatible with the currently installed version of spaCy. Should be run after
|
||||
|
@ -92,16 +92,16 @@ and command for updating are shown.
|
|||
$ python -m spacy validate
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ---------- | -------- | --------------------------------------------------------- |
|
||||
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------------------- |
|
||||
| **PRINTS** | Details about the compatibility of your installed models. |
|
||||
|
||||
## Init {#init new="3"}
|
||||
## init {#init new="3"}
|
||||
|
||||
The `spacy init` CLI includes helpful commands for initializing training config
|
||||
files and model directories.
|
||||
|
||||
### init config {#init-config new="3"}
|
||||
### init config {#init-config new="3" tag="command"}
|
||||
|
||||
Initialize and save a [`config.cfg` file](/usage/training#config) using the
|
||||
**recommended settings** for your use case. It works just like the
|
||||
|
@ -121,15 +121,15 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline]
|
|||
[--optimize] [--cpu]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||
| `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. |
|
||||
| `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. |
|
||||
| `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. |
|
||||
| `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | file | The config file for training. |
|
||||
| Name | Description |
|
||||
| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
|
||||
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
|
||||
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
||||
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | The config file for training. |
|
||||
|
||||
### init fill-config {#init-fill-config new="3"}
|
||||
|
||||
|
@ -152,24 +152,22 @@ validation error with more details.
|
|||
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). |
|
||||
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||
| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | file | Complete and auto-filled config file for training. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
|
||||
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
|
||||
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Complete and auto-filled config file for training. |
|
||||
|
||||
### init model {#init-model new="2"}
|
||||
|
||||
<!-- TODO: update for v3 -->
|
||||
### init model {#init-model new="2" tag="command"}
|
||||
|
||||
Create a new model directory from raw data, like word frequencies, Brown
|
||||
clusters and word vectors. This command is similar to the `spacy model` command
|
||||
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
||||
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
||||
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
||||
Just loading in vectors will not automatically populate the vocab.
|
||||
clusters and word vectors. Note that in order to populate the model's vocab, you
|
||||
need to pass in a JSONL-formatted
|
||||
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
|
||||
`id` values that correspond to the vectors table. Just loading in vectors will
|
||||
not automatically populate the vocab.
|
||||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
|
||||
|
@ -182,19 +180,19 @@ $ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
|||
[--prune-vectors]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
|
||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
|
||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
||||
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
|
||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy model containing the vocab and vectors. |
|
||||
|
||||
## Convert {#convert}
|
||||
## convert {#convert tag="command"}
|
||||
|
||||
Convert files into spaCy's
|
||||
[binary training data format](/api/data-formats#binary-training), a serialized
|
||||
|
@ -208,22 +206,22 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
|
|||
[--merge-subtokens] [--ner-map] [--lang]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `input_file` | positional | Input file. |
|
||||
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
|
||||
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
|
||||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. |
|
||||
| `--n-sents`, `-n` | option | Number of sentences per document. |
|
||||
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`) |
|
||||
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`) |
|
||||
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
|
||||
| `--ner-map`, `-nm` | option | NER tag mapping (as JSON-encoded dict of entity types). |
|
||||
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `input_file` | Input file. ~~Path (positional)~~ |
|
||||
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ |
|
||||
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
|
||||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
|
||||
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
|
||||
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
|
||||
| `--model`, `-b` <Tag variant="new">2.2</Tag>     | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str] \(option)~~                                             |
|
||||
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
|
||||
| `--ner-map`, `-nm`                               | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path] \(option)~~                                                      |
|
||||
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
|
||||
|
||||
### Converters
|
||||
### Converters {#converters}
|
||||
|
||||
| ID | Description |
|
||||
| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
|
@ -233,12 +231,12 @@ $ python -m spacy convert [input_file] [output_dir] [--converter]
|
|||
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||
|
||||
## Debug {#debug new="3"}
|
||||
## debug {#debug new="3"}
|
||||
|
||||
The `spacy debug` CLI includes helpful commands for debugging and profiling your
|
||||
configs, data and implementations.
|
||||
|
||||
### debug config {#debug-config}
|
||||
### debug config {#debug-config new="3" tag="command"}
|
||||
|
||||
Debug a [`config.cfg` file](/usage/training#config) and show validation errors.
|
||||
The command will create all objects in the tree and validate them. Note that
|
||||
|
@ -246,10 +244,10 @@ some config validation errors are blocking and will prevent the rest of the
|
|||
config from being resolved. This means that you may not see all validation
|
||||
errors at once and some issues are only shown once previous errors have been
|
||||
fixed. To auto-fill a partial config and save the result, you can use the
|
||||
[`init config`](/api/cli#init-config) command.
|
||||
[`init fill-config`](/api/cli#init-fill-config) command.
|
||||
|
||||
```bash
|
||||
$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides]
|
||||
$ python -m spacy debug config [config_path] [--code-path] [overrides]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -277,18 +275,15 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
| --------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code_path`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--auto_fill`, `-F` | option | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
|
||||
| `--output_path`, `-o` | option | Output path where the filled config can be stored. Use '-' for standard output. |
|
||||
| `--diff`, `-D` | option | Show a visual diff if config was auto-filled. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
| **PRINTS** | stdout | Config validation errors, if available. |
|
||||
| Name | Description |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **PRINTS** | Config validation errors, if available. |
|
||||
|
||||
### debug data {#debug-data}
|
||||
### debug data {#debug-data tag="command"}
|
||||
|
||||
Analyze, debug, and validate your training and development data. Get useful
|
||||
stats, and find problems like invalid entity annotations, cyclic dependencies,
|
||||
|
@ -453,18 +448,18 @@ will not be available.
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
|
||||
| `--verbose`, `-V` | flag | Print additional information and explanations. |
|
||||
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
| **PRINTS** | stdout | Debugging information. |
|
||||
| Name | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
|
||||
### debug profile {#debug-profile}
|
||||
### debug profile {#debug-profile tag="command"}
|
||||
|
||||
Profile which functions take the most time in a spaCy pipeline. Input should be
|
||||
formatted as one JSON object per line with a key `"text"`. It can either be
|
||||
|
@ -482,15 +477,15 @@ The `profile` command is now available as a subcommand of `spacy debug`.
|
|||
$ python -m spacy debug profile [model] [inputs] [--n-texts]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | ---------- | ----------------------------------------------------------------- |
|
||||
| `model` | positional | A loadable spaCy model. |
|
||||
| `inputs` | positional | Optional path to input file, or `-` for standard input. |
|
||||
| `--n-texts`, `-n` | option | Maximum number of texts to use if available. Defaults to `10000`. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **PRINTS** | stdout | Profiling information for the model. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------- |
|
||||
| `model` | A loadable spaCy model. ~~str (positional)~~ |
|
||||
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
||||
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Profiling information for the model. |
|
||||
|
||||
### debug model {#debug-model}
|
||||
### debug model {#debug-model new="3" tag="command"}
|
||||
|
||||
Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
|
||||
sample text and checking how it updates its internal weights and parameters.
|
||||
|
@ -596,23 +591,24 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `component` | positional | Name of the pipeline component of which the model should be analyzed. |
|
||||
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. |
|
||||
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. |
|
||||
| `--parameters`, `-PAR` | option | Show parameters of each layer. |
|
||||
| `--gradients`, `-GRAD` | option | Show gradients of each layer. |
|
||||
| `--attributes`, `-ATTR` | option | Show attributes of each layer. |
|
||||
| `--print-step0`, `-P0` | option | Print model before training. |
|
||||
| `--print-step1`, `-P1` | option | Print model after initialization. |
|
||||
| `--print-step2`, `-P2` | option | Print model after training. |
|
||||
| `--print-step3`, `-P3` | option | Print final predictions. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **PRINTS** | stdout | Debugging information. |
|
||||
| Name | Description |
|
||||
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
|
||||
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
|
||||
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
|
||||
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
|
||||
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
|
||||
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
|
||||
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
|
||||
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
|
||||
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
|
||||
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
|
||||
## Train {#train}
|
||||
## train {#train tag="command"}
|
||||
|
||||
Train a model. Expects data in spaCy's
|
||||
[binary format](/api/data-formats#training) and a
|
||||
|
@ -640,17 +636,17 @@ in the section `[paths]`.
|
|||
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||
| **CREATES** | model | The final model and the best model. |
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The final model and the best model. |
|
||||
|
||||
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
||||
## pretrain {#pretrain new="2.1" tag="command,experimental"}
|
||||
|
||||
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
|
||||
components on [raw text](/api/data-formats#pretrain), using an approximate
|
||||
|
@ -678,19 +674,19 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
|
|||
[--code] [--resume-path] [--epoch-resume] [overrides]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------------- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
|
||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
|
||||
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
| Name | Description |
|
||||
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
|
||||
## Evaluate {#evaluate new="2"}
|
||||
## evaluate {#evaluate new="2" tag="command"}
|
||||
|
||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
||||
[binary `.spacy` format](/api/data-formats#binary-training). The
|
||||
|
@ -707,19 +703,19 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]
|
|||
[--gpu-id] [--displacy-path] [--displacy-limit]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
|
||||
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
|
||||
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
|
||||
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
|
||||
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
|
||||
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
|
||||
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
|
||||
| Name | Description |
|
||||
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
|
||||
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
|
||||
## Package {#package}
|
||||
## package {#package tag="command"}
|
||||
|
||||
Generate an installable
|
||||
[model Python package](/usage/training#models-generating) from an existing model
|
||||
|
@ -750,25 +746,25 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
|||
> pip install dist/en_model-0.0.0.tar.gz
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `input_dir` | positional | Path to directory containing model data. |
|
||||
| `output_dir` | positional | Directory to create package folder in. |
|
||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
|
||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
|
||||
| `--no-sdist`, `-NS`, | flag | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. |
|
||||
| `--version`, `-v` <Tag variant="new">3</Tag> | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. |
|
||||
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | directory | A Python package containing the spaCy model. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to `meta.json` file (optional). ~~Optional[Path] \(option)~~ |
|
||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
||||
| `--no-sdist`, `-NS` | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
|
||||
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
|
||||
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A Python package containing the spaCy model. |
|
||||
|
||||
## Project {#project new="3"}
|
||||
## project {#project new="3"}
|
||||
|
||||
The `spacy project` CLI includes subcommands for working with
|
||||
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
||||
deploying custom spaCy models.
|
||||
|
||||
### project clone {#project-clone}
|
||||
### project clone {#project-clone tag="command"}
|
||||
|
||||
Clone a project template from a Git repository. Calls into `git` under the hood
|
||||
and uses the sparse checkout feature, so you're only downloading what you need.
|
||||
|
@ -795,15 +791,15 @@ $ python -m spacy project clone [name] [dest] [--repo]
|
|||
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
|
||||
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
|
||||
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
|
||||
| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | The cloned [project directory](/usage/projects#project-files). |
|
||||
|
||||
### project assets {#project-assets}
|
||||
### project assets {#project-assets tag="command"}
|
||||
|
||||
Fetch project assets like datasets and pretrained weights. Assets are defined in
|
||||
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
|
||||
|
@ -824,13 +820,13 @@ $ python -m spacy project assets [project_dir]
|
|||
> $ python -m spacy project assets
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------- | ---------- | ----------------------------------------------------------------- |
|
||||
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
|
||||
| Name | Description |
|
||||
| -------------- | --------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||
|
||||
### project run {#project-run}
|
||||
### project run {#project-run tag="command"}
|
||||
|
||||
Run a named command or workflow defined in the
|
||||
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
|
||||
|
@ -849,16 +845,16 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
|
|||
> $ python -m spacy project run train
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| --------------- | ---------- | ----------------------------------------------------------------- |
|
||||
| `subcommand` | positional | Name of the command or workflow to run. |
|
||||
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
|
||||
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **EXECUTES** | script | The command defined in the `project.yml`. |
|
||||
| Name | Description |
|
||||
| --------------- | --------------------------------------------------------------------------------------- |
|
||||
| `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ |
|
||||
| `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **EXECUTES** | The command defined in the `project.yml`. |
|
||||
|
||||
### project dvc {#project-dvc}
|
||||
### project dvc {#project-dvc tag="command"}
|
||||
|
||||
Auto-generate a [Data Version Control](https://dvc.org) (DVC) config file. Calls
|
||||
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
|
||||
|
@ -890,11 +886,11 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
|||
> python -m spacy project dvc all
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | ---------- | --------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
|
||||
| `--force`, `-F` | flag | Force-updating config file. |
|
||||
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | file | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
| Name | Description |
|
||||
| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
|
||||
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
|
|
|
@ -40,7 +40,7 @@ Initialize a `Language` object.
|
|||
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
|
||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
||||
|
||||
## Language.from_config {#from_config tag="classmethod"}
|
||||
## Language.from_config {#from_config tag="classmethod" new="3"}
|
||||
|
||||
Create a `Language` object from a loaded config. Will set up the tokenizer and
|
||||
language data, add pipeline components based on the pipeline and components
|
||||
|
|
|
@ -70,7 +70,7 @@ Create a blank model of a given language class. This function is the twin of
|
|||
| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
|
||||
| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
|
||||
|
||||
#### spacy.info {#spacy.info tag="function"}
|
||||
### spacy.info {#spacy.info tag="function"}
|
||||
|
||||
The same as the [`info` command](/api/cli#info). Pretty-print information about
|
||||
your installation, models and local setup from within spaCy. To get the model
|
||||
|
@ -316,7 +316,7 @@ factories.
|
|||
The following registries are added by the
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package.
|
||||
See the [`Transformer`](/api/transformer) API reference and
|
||||
[usage docs](/usage/transformers) for details.
|
||||
[usage docs](/usage/embeddings-transformers) for details.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -585,20 +585,40 @@ A helper function to use in the `load()` method of a model package's
|
|||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
|
||||
### util.get_model_meta {#util.get_model_meta tag="function" new="2"}
|
||||
### util.load_config {#util.load_config tag="function" new="3"}
|
||||
|
||||
Get a model's meta.json from a directory path and validate its contents.
|
||||
Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The
|
||||
config typically includes details about the model pipeline and how its
|
||||
components are created, as well as all training settings and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> meta = util.get_model_meta("/path/to/model")
|
||||
> config = util.load_config("/path/to/model/config.cfg")
|
||||
> print(config.to_str())
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| `path` | Path to model directory. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
|
||||
| Name | Description |
|
||||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
|
||||
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
|
||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | The model's config. ~~Config~~ |
|
||||
|
||||
### util.load_meta {#util.load_meta tag="function" new="3"}
|
||||
|
||||
Get a model's `meta.json` from a file path and validate its contents.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> meta = util.load_meta("/path/to/model/meta.json")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
|
||||
|
||||
### util.is_package {#util.is_package tag="function"}
|
||||
|
||||
|
|
|
@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values,
|
|||
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
|
||||
package also adds the function registries [`@span_getters`](#span_getters) and
|
||||
[`@annotation_setters`](#annotation_setters) with several built-in registered
|
||||
functions. For more details, see the [usage documentation](/usage/transformers).
|
||||
functions. For more details, see the
|
||||
[usage documentation](/usage/embeddings-transformers).
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
|
|
@ -77,12 +77,14 @@ or flagging duplicates. For example, you can suggest a user content that's
|
|||
similar to what they're currently looking at, or label a support ticket as a
|
||||
duplicate if it's very similar to an already existing one.
|
||||
|
||||
Each `Doc`, `Span` and `Token` comes with a
|
||||
[`.similarity()`](/api/token#similarity) method that lets you compare it with
|
||||
another object, and determine the similarity. Of course similarity is always
|
||||
subjective – whether "dog" and "cat" are similar really depends on how you're
|
||||
looking at it. spaCy's similarity model usually assumes a pretty general-purpose
|
||||
definition of similarity.
|
||||
Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and
|
||||
[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity)
|
||||
method that lets you compare it with another object, and determine the
|
||||
similarity. Of course similarity is always subjective – whether "dog" and "cat"
|
||||
are similar really depends on how you're looking at it. spaCy's similarity model
|
||||
usually assumes a pretty general-purpose definition of similarity.
|
||||
|
||||
<!-- TODO: use better example here -->
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
|
|
459
website/docs/usage/embeddings-transformers.md
Normal file
459
website/docs/usage/embeddings-transformers.md
Normal file
|
@ -0,0 +1,459 @@
|
|||
---
|
||||
title: Embeddings, Transformers and Transfer Learning
|
||||
teaser: Using transformer embeddings like BERT in spaCy
|
||||
menu:
|
||||
- ['Embedding Layers', 'embedding-layers']
|
||||
- ['Transformers', 'transformers']
|
||||
- ['Static Vectors', 'static-vectors']
|
||||
- ['Pretraining', 'pretraining']
|
||||
next: /usage/training
|
||||
---
|
||||
|
||||
<!-- TODO: intro, short explanation of embeddings/transformers, point user to processing pipelines docs for intro -->
|
||||
|
||||
## Shared embedding layers {#embedding-layers}
|
||||
|
||||
<!-- TODO: write: `Tok2Vec` and `Transformer` components -->
|
||||
|
||||
<Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">
|
||||
|
||||
The key difference between [word vectors](#word-vectors) and contextual language
|
||||
models such as [transformers](#transformers) is that word vectors model
|
||||
**lexical types**, rather than _tokens_. If you have a list of terms with no
|
||||
context around them, a transformer model like BERT can't really help you. BERT
|
||||
is designed to understand language **in context**, which isn't what you have. A
|
||||
word vectors table will be a much better fit for your task. However, if you do
|
||||
have words in context — whole sentences or paragraphs of running text — word
|
||||
vectors will only provide a very rough approximation of what the text is about.
|
||||
|
||||
Word vectors are also very computationally efficient, as they map a word to a
|
||||
vector with a single indexing operation. Word vectors are therefore useful as a
|
||||
way to **improve the accuracy** of neural network models, especially models that
|
||||
are small or have received little or no pretraining. In spaCy, word vector
|
||||
tables are only used as **static features**. spaCy does not backpropagate
|
||||
gradients to the pretrained word vectors table. The static vectors table is
|
||||
usually used in combination with a smaller table of learned task-specific
|
||||
embeddings.
|
||||
|
||||
</Accordion>
|
||||
|
||||
<Accordion title="When should I add word vectors to my model?">
|
||||
|
||||
Word vectors are not compatible with most [transformer models](#transformers),
|
||||
but if you're training another type of NLP network, it's almost always worth
|
||||
adding word vectors to your model. As well as improving your final accuracy,
|
||||
word vectors often make experiments more consistent, as the accuracy you reach
|
||||
will be less sensitive to how the network is randomly initialized. High variance
|
||||
due to random chance can slow down your progress significantly, as you need to
|
||||
run many experiments to filter the signal from the noise.
|
||||
|
||||
Word vector features need to be enabled prior to training, and the same word
|
||||
vectors table will need to be available at runtime as well. You cannot add word
|
||||
vector features once the model has already been trained, and you usually cannot
|
||||
replace one word vectors table with another without causing a significant loss
|
||||
of performance.
|
||||
|
||||
</Accordion>
|
||||
|
||||
## Using transformer models {#transformers}
|
||||
|
||||
Transformers are a family of neural network architectures that compute **dense,
|
||||
context-sensitive representations** for the tokens in your documents. Downstream
|
||||
models in your pipeline can then use these representations as input features to
|
||||
**improve their predictions**. You can connect multiple components to a single
|
||||
transformer model, with any or all of those components giving feedback to the
|
||||
transformer to fine-tune it to your tasks. spaCy's transformer support
|
||||
interoperates with [PyTorch](https://pytorch.org) and the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers/) library,
|
||||
giving you access to thousands of pretrained models for your pipelines. There
|
||||
are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
|
||||
transformer models, but for practical purposes, you can simply think of them as
|
||||
a drop-in replacement that lets you achieve **higher accuracy** in exchange for
|
||||
**higher training and runtime costs**.
|
||||
|
||||
### Setup and installation {#transformers-installation}
|
||||
|
||||
> #### System requirements
|
||||
>
|
||||
> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to
|
||||
> work with transformer models. Make sure your GPU drivers are up to date and
|
||||
> you have **CUDA v9+** installed.
|
||||
|
||||
> The exact requirements will depend on the transformer model. Training a
|
||||
> transformer-based model without a GPU will be too slow for most practical
|
||||
> purposes.
|
||||
>
|
||||
> Provisioning a new machine will require about **5GB** of data to be
|
||||
> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB
|
||||
> spaCy and dependencies.
|
||||
|
||||
Once you have CUDA installed, you'll need to install two pip packages,
|
||||
[`cupy`](https://docs.cupy.dev/en/stable/install.html) and
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
|
||||
is just like `numpy`, but for GPU. The best way to install it is to choose a
|
||||
wheel that matches the version of CUDA you're using. You may also need to set
|
||||
the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
|
||||
non-standard location. Putting it all together, if you had installed CUDA 10.2
|
||||
in `/opt/nvidia/cuda`, you would run:
|
||||
|
||||
```bash
|
||||
### Installation with CUDA
|
||||
export CUDA_PATH="/opt/nvidia/cuda"
|
||||
pip install cupy-cuda102
|
||||
pip install spacy-transformers
|
||||
```
|
||||
|
||||
### Runtime usage {#transformers-runtime}
|
||||
|
||||
Transformer models can be used as **drop-in replacements** for other types of
|
||||
neural networks, so your spaCy pipeline can include them in a way that's
|
||||
completely invisible to the user. Users will download, load and use the model in
|
||||
the standard way, like any other spaCy pipeline. Instead of using the
|
||||
transformers as subnetworks directly, you can also use them via the
|
||||
[`Transformer`](/api/transformer) pipeline component.
|
||||
|
||||
![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
|
||||
|
||||
The `Transformer` component sets the
|
||||
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
||||
which lets you access the transformer outputs at runtime.
|
||||
|
||||
```bash
|
||||
$ python -m spacy download en_core_trf_lg
|
||||
```
|
||||
|
||||
```python
|
||||
### Example
|
||||
import spacy
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
|
||||
# Use the GPU, with memory allocations directed via PyTorch.
|
||||
# This prevents out-of-memory errors that would otherwise occur from competing
|
||||
# memory pools.
|
||||
use_pytorch_for_gpu_memory()
|
||||
require_gpu(0)
|
||||
|
||||
nlp = spacy.load("en_core_trf_lg")
|
||||
for doc in nlp.pipe(["some text", "some other text"]):
|
||||
tokvecs = doc._.trf_data.tensors[-1]
|
||||
```
|
||||
|
||||
You can also customize how the [`Transformer`](/api/transformer) component sets
|
||||
annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`.
|
||||
This callback will be called with the raw input and output data for the whole
|
||||
batch, along with the batch of `Doc` objects, allowing you to implement whatever
|
||||
you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
|
||||
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
|
||||
containing the transformer data for the batch.
|
||||
|
||||
```python
|
||||
def custom_annotation_setter(docs, trf_data):
|
||||
# TODO:
|
||||
...
|
||||
|
||||
nlp = spacy.load("en_core_trf_lg")
|
||||
nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
|
||||
doc = nlp("This is a text")
|
||||
print() # TODO:
|
||||
```
|
||||
|
||||
### Training usage {#transformers-training}
|
||||
|
||||
The recommended workflow for training is to use spaCy's
|
||||
[config system](/usage/training#config), usually via the
|
||||
[`spacy train`](/api/cli#train) command. The training config defines all
|
||||
component settings and hyperparameters in one place and lets you describe a tree
|
||||
of objects by referring to creation functions, including functions you register
|
||||
yourself. For details on how to get started with training your own model, check
|
||||
out the [training quickstart](/usage/training#quickstart).
|
||||
|
||||
<Project id="en_core_bert">
|
||||
|
||||
The easiest way to get started is to clone a transformers-based project
|
||||
template. Swap in your data, edit the settings and hyperparameters and train,
|
||||
evaluate, package and visualize your model.
|
||||
|
||||
</Project>
|
||||
|
||||
The `[components]` section in the [`config.cfg`](/api/data-formats#config)
|
||||
describes the pipeline components and the settings used to construct them,
|
||||
including their model implementation. Here's a config snippet for the
|
||||
[`Transformer`](/api/transformer) component, along with matching Python code. In
|
||||
this case, the `[components.transformer]` block describes the `transformer`
|
||||
component:
|
||||
|
||||
> #### Python equivalent
|
||||
>
|
||||
> ```python
|
||||
> from spacy_transformers import Transformer, TransformerModel
|
||||
> from spacy_transformers.annotation_setters import null_annotation_setter
|
||||
> from spacy_transformers.span_getters import get_doc_spans
|
||||
>
|
||||
> trf = Transformer(
|
||||
> nlp.vocab,
|
||||
> TransformerModel(
|
||||
> "bert-base-cased",
|
||||
> get_spans=get_doc_spans,
|
||||
> tokenizer_config={"use_fast": True},
|
||||
> ),
|
||||
> annotation_setter=null_annotation_setter,
|
||||
> max_batch_items=4096,
|
||||
> )
|
||||
> ```
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
[components.transformer]
|
||||
factory = "transformer"
|
||||
max_batch_items = 4096
|
||||
|
||||
[components.transformer.model]
|
||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
||||
name = "bert-base-cased"
|
||||
tokenizer_config = {"use_fast": true}
|
||||
|
||||
[components.transformer.model.get_spans]
|
||||
@span_getters = "doc_spans.v1"
|
||||
|
||||
[components.transformer.annotation_setter]
|
||||
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
|
||||
|
||||
```
|
||||
|
||||
The `[components.transformer.model]` block describes the `model` argument passed
|
||||
to the transformer component. It's a Thinc
|
||||
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
|
||||
component. Here, it references the function
|
||||
[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
|
||||
registered in the [`architectures` registry](/api/top-level#registry). If a key
|
||||
in a block starts with `@`, it's **resolved to a function** and all other
|
||||
settings are passed to the function as arguments. In this case, `name`,
|
||||
`tokenizer_config` and `get_spans`.
|
||||
|
||||
`get_spans` is a function that takes a batch of `Doc` objects and returns lists
|
||||
of potentially overlapping `Span` objects to be processed by the transformer. Several
|
||||
[built-in functions](/api/transformer#span-getters) are available – for example,
|
||||
to process the whole document or individual sentences. When the config is
|
||||
resolved, the function is created and passed into the model as an argument.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Remember that the `config.cfg` used for training should contain **no missing
|
||||
values** and requires all settings to be defined. You don't want any hidden
|
||||
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||
are missing, and you can run
|
||||
[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
|
||||
all defaults.
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Customizing the settings {#transformers-training-custom-settings}
|
||||
|
||||
To change any of the settings, you can edit the `config.cfg` and re-run the
|
||||
training. To change any of the functions, like the span getter, you can replace
|
||||
the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to
|
||||
process sentences. You can also register your own functions using the
|
||||
`span_getters` registry:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [components.transformer.model.get_spans]
|
||||
> @span_getters = "custom_sent_spans"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### code.py
|
||||
import spacy_transformers
|
||||
|
||||
@spacy_transformers.registry.span_getters("custom_sent_spans")
|
||||
def configure_custom_sent_spans():
|
||||
# TODO: write custom example
|
||||
def get_sent_spans(docs):
|
||||
return [list(doc.sents) for doc in docs]
|
||||
|
||||
return get_sent_spans
|
||||
```
|
||||
|
||||
To resolve the config during training, spaCy needs to know about your custom
|
||||
function. You can make it available via the `--code` argument that can point to
|
||||
a Python file. For more details on training with custom code, see the
|
||||
[training documentation](/usage/training#custom-code).
|
||||
|
||||
```bash
|
||||
$ python -m spacy train ./config.cfg --code ./code.py
|
||||
```
|
||||
|
||||
### Customizing the model implementations {#training-custom-model}
|
||||
|
||||
The [`Transformer`](/api/transformer) component expects a Thinc
|
||||
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
|
||||
argument. You're not limited to the implementation provided by
|
||||
`spacy-transformers` – the only requirement is that your registered function
|
||||
must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
|
||||
is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
|
||||
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
|
||||
transformer data.
|
||||
|
||||
> #### Model type annotations
|
||||
>
|
||||
> In the documentation and code base, you may come across type annotations and
|
||||
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
|
||||
> List[Floats2d]]~~. This so-called generic type describes the layer and its
|
||||
> input and output type – in this case, it takes a list of `Doc` objects as the
|
||||
> input and list of 2-dimensional arrays of floats as the output. You can read
|
||||
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
|
||||
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) guide for
|
||||
> how to enable linting in your editor to see live feedback if your inputs and
|
||||
> outputs don't match.
|
||||
|
||||
The same idea applies to task models that power the **downstream components**.
|
||||
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
|
||||
which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This
|
||||
is where we'll plug in our transformer model, using the
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily
|
||||
delegates to the `Transformer` pipeline component.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt) {highlight="12"}
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.ner.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
```
|
||||
|
||||
The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
|
||||
[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument
|
||||
`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This layer
|
||||
determines how the vector for each spaCy token will be computed from the zero or
|
||||
more source rows the token is aligned against. Here we use the
|
||||
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
|
||||
averages the wordpiece rows. We could instead use
|
||||
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
|
||||
function you write yourself.
|
||||
|
||||
You can have multiple components all listening to the same transformer model,
|
||||
and all passing gradients back to it. By default, all of the gradients will be
|
||||
**equally weighted**. You can control this with the `grad_factor` setting, which
|
||||
lets you reweight the gradients from the different listeners. For instance,
|
||||
setting `grad_factor = 0` would disable gradients from one of the listeners,
|
||||
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
|
||||
custom learning rate for each component. Instead of a constant, you can also
|
||||
provide a schedule, allowing you to freeze the shared parameters at the start of
|
||||
training.
|
||||
|
||||
## Static vectors {#static-vectors}
|
||||
|
||||
<!-- TODO: write -->
|
||||
|
||||
### Using word vectors in your models {#word-vectors-models}
|
||||
|
||||
Many neural network models are able to use word vector tables as additional
|
||||
features, which sometimes results in significant improvements in accuracy.
|
||||
spaCy's built-in embedding layer,
|
||||
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
|
||||
word vector tables using the `also_use_static_vectors` flag. This setting is
|
||||
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
|
||||
layer, which builds the default token-to-vector encoding architecture.
|
||||
|
||||
```ini
|
||||
[tagger.model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 128
|
||||
rows = 7000
|
||||
also_embed_subwords = true
|
||||
also_use_static_vectors = true
|
||||
```
|
||||
|
||||
<Infobox title="How it works" emoji="💡">
|
||||
|
||||
The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in
|
||||
the `architectures` [registry](/api/top-level#registry), and call the returned
|
||||
object with the rest of the arguments from the block. This will result in a call
|
||||
to the
|
||||
[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py)
|
||||
function, which will return a [Thinc](https://thinc.ai) model object with the
|
||||
type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer
|
||||
takes a list of `Doc` objects as input, it does not need to store a copy of the
|
||||
vectors table. The vectors will be retrieved from the `Doc` objects that are
|
||||
passed in, via the `doc.vocab.vectors` attribute. This part of the process is
|
||||
handled by the [StaticVectors](/api/architectures#StaticVectors) layer.
|
||||
|
||||
</Infobox>
|
||||
|
||||
#### Creating a custom embedding layer {#custom-embedding-layer}
|
||||
|
||||
The [MultiHashEmbed](/api/architectures#MultiHashEmbed) layer is spaCy's
|
||||
recommended strategy for constructing initial word representations for your
|
||||
neural network models, but you can also implement your own. You can register any
|
||||
function to a string name, and then reference that function within your config
|
||||
(see the [training docs](/usage/training) for more details). To try this out,
|
||||
you can save the following little example to a new Python file:
|
||||
|
||||
```python
|
||||
from spacy.ml.staticvectors import StaticVectors
|
||||
from spacy.util import registry
|
||||
|
||||
print("I was imported!")
|
||||
|
||||
@registry.architectures("my_example.MyEmbedding.v1")
|
||||
def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||
print("I was called!")
|
||||
return StaticVectors(nO=output_width)
|
||||
```
|
||||
|
||||
If you pass the path to your file to the [`spacy train`](/api/cli#train) command
|
||||
using the `--code` argument, your file will be imported, which means the
|
||||
decorator registering the function will be run. Your function is now on equal
|
||||
footing with any of spaCy's built-ins, so you can drop it in instead of any
|
||||
other model with the same input and output signature. For instance, you could
|
||||
use it in the tagger model as follows:
|
||||
|
||||
```ini
|
||||
[tagger.model.tok2vec.embed]
|
||||
@architectures = "my_example.MyEmbedding.v1"
|
||||
output_width = 128
|
||||
```
|
||||
|
||||
Now that you have a custom function wired into the network, you can start
|
||||
implementing the logic you're interested in. For example, let's say you want to
|
||||
try a relatively simple embedding strategy that makes use of static word
|
||||
vectors, but combines them via summation with a smaller table of learned
|
||||
embeddings.
|
||||
|
||||
```python
|
||||
from thinc.api import add, chain, remap_ids, Embed
|
||||
from spacy.ml.staticvectors import StaticVectors
|
||||
|
||||
@registry.architectures("my_example.MyEmbedding.v1")
|
||||
def MyCustomVectors(
|
||||
output_width: int,
|
||||
vector_width: int,
|
||||
embed_rows: int,
|
||||
key2row: Dict[int, int]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
return add(
|
||||
StaticVectors(nO=output_width),
|
||||
chain(
|
||||
FeatureExtractor(["ORTH"]),
|
||||
remap_ids(key2row),
|
||||
Embed(nO=output_width, nV=embed_rows)
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## Pretraining {#pretraining}
|
||||
|
||||
<!-- TODO: write -->
|
|
@ -9,6 +9,7 @@ menu:
|
|||
- ['Tokenization', 'tokenization']
|
||||
- ['Merging & Splitting', 'retokenization']
|
||||
- ['Sentence Segmentation', 'sbd']
|
||||
- ['Vectors & Similarity', 'vectors-similarity']
|
||||
- ['Language data', 'language-data']
|
||||
---
|
||||
|
||||
|
@ -1024,10 +1025,10 @@ produced by the tokenizer.
|
|||
>
|
||||
> If you're working with transformer models like BERT, check out the
|
||||
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
|
||||
> extension package and [documentation](/usage/transformers). It includes a
|
||||
> pipeline component for using pretrained transformer weights and **training
|
||||
> transformer models** in spaCy, as well as helpful utilities for aligning word
|
||||
> pieces to linguistic tokenization.
|
||||
> extension package and [documentation](/usage/embeddings-transformers). It
|
||||
> includes a pipeline component for using pretrained transformer weights and
|
||||
> **training transformer models** in spaCy, as well as helpful utilities for
|
||||
> aligning word pieces to linguistic tokenization.
|
||||
|
||||
```python
|
||||
### Custom BERT word piece tokenizer
|
||||
|
@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
|||
</Infobox>
|
||||
|
||||
Here's an example of a component that implements a pre-processing rule for
|
||||
splitting on `'...'` tokens. The component is added before the parser, which is
|
||||
splitting on `"..."` tokens. The component is added before the parser, which is
|
||||
then used to further segment the text. That's possible, because `is_sent_start`
|
||||
is only set to `True` for some of the tokens – all others still specify `None`
|
||||
for unset sentence boundaries. This approach can be useful if you want to
|
||||
|
@ -1540,6 +1541,152 @@ doc = nlp(text)
|
|||
print("After:", [sent.text for sent in doc.sents])
|
||||
```
|
||||
|
||||
## Word vectors and semantic similarity {#vectors-similarity}
|
||||
|
||||
import Vectors101 from 'usage/101/\_vectors-similarity.md'
|
||||
|
||||
<Vectors101 />
|
||||
|
||||
<Infobox title="What to expect from similarity results" variant="warning">
|
||||
|
||||
Computing similarity scores can be helpful in many situations, but it's also
|
||||
important to maintain **realistic expectations** about what information it can
|
||||
provide. Words can be related to each other in many ways, so a single
|
||||
"similarity" score will always be a **mix of different signals**, and vectors
|
||||
trained on different data can produce very different results that may not be
|
||||
useful for your purpose.
|
||||
|
||||
Also note that the similarity of `Doc` or `Span` objects defaults to the
|
||||
**average** of the token vectors. This means it's insensitive to the order of
|
||||
the words. Two documents expressing the same meaning with dissimilar wording
|
||||
will return a lower similarity score than two documents that happen to contain
|
||||
the same words while expressing different meanings.
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Adding word vectors {#adding-vectors}
|
||||
|
||||
Custom word vectors can be trained using a number of open-source libraries, such
|
||||
as [Gensim](https://radimrehurek.com/gensim), [FastText](https://fasttext.cc),
|
||||
or Tomas Mikolov's original
|
||||
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
||||
word vector libraries output an easy-to-read text-based format, where each line
|
||||
consists of the word followed by its vector. For everyday use, we want to
|
||||
convert the vectors model into a binary format that loads faster and takes up
|
||||
less space on disk. The easiest way to do this is the
|
||||
[`init model`](/api/cli#init-model) command-line utility. This will output a
|
||||
spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to
|
||||
some nice Latin vectors. You can then pass the directory path to
|
||||
[`spacy.load`](/api/top-level#spacy.load).
|
||||
|
||||
> #### Usage example
|
||||
>
|
||||
> ```python
|
||||
> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
|
||||
> doc1 = nlp_latin("Caecilius est in horto")
|
||||
> doc2 = nlp_latin("servus est in atrio")
|
||||
> doc1.similarity(doc2)
|
||||
> ```
|
||||
|
||||
```bash
|
||||
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
||||
python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
||||
```
|
||||
|
||||
<Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
|
||||
|
||||
To help you strike a good balance between coverage and memory usage, spaCy's
|
||||
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
||||
row** of the table. If you're using the
|
||||
[`spacy init model`](/api/cli#init-model) command to create a vocabulary,
|
||||
pruning the vectors will be taken care of automatically if you set the
|
||||
`--prune-vectors` flag. You can also do it manually in the following steps:
|
||||
|
||||
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
||||
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
||||
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
||||
English.
|
||||
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
||||
lexemes will be sorted by descending probability to determine which vectors
|
||||
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
|
||||
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
|
||||
vectors you want to keep.
|
||||
|
||||
```python
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
n_vectors = 105000 # number of vectors to keep
|
||||
removed_words = nlp.vocab.prune_vectors(n_vectors)
|
||||
|
||||
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
|
||||
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
|
||||
```
|
||||
|
||||
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
|
||||
table to a given number of unique entries, and returns a dictionary containing
|
||||
the removed words, mapped to `(string, score)` tuples, where `string` is the
|
||||
entry the removed word was mapped to, and `score` the similarity score between
|
||||
the two words.
|
||||
|
||||
```python
|
||||
### Removed words
|
||||
{
|
||||
"Shore": ("coast", 0.732257),
|
||||
"Precautionary": ("caution", 0.490973),
|
||||
"hopelessness": ("sadness", 0.742366),
|
||||
"Continous": ("continuous", 0.732549),
|
||||
"Disemboweled": ("corpse", 0.499432),
|
||||
"biostatistician": ("scientist", 0.339724),
|
||||
"somewheres": ("somewheres", 0.402736),
|
||||
"observing": ("observe", 0.823096),
|
||||
"Leaving": ("leaving", 1.0),
|
||||
}
|
||||
```
|
||||
|
||||
In the example above, the vector for "Shore" was removed and remapped to the
|
||||
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
||||
the vector of "leaving", which is identical. If you're using the
|
||||
[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors`
|
||||
option to easily reduce the size of the vectors as you add them to a spaCy
|
||||
model:
|
||||
|
||||
```bash
|
||||
$ python -m spacy init model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
||||
```
|
||||
|
||||
This will create a spaCy model with vectors for the first 10,000 words in the
|
||||
vectors model. All other words in the vectors model are mapped to the closest
|
||||
vector among those retained.
|
||||
|
||||
</Accordion>
|
||||
|
||||
### Adding vectors individually {#adding-individual-vectors}
|
||||
|
||||
The `vector` attribute is a **read-only** numpy or cupy array (depending on
|
||||
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
|
||||
array is read-only so that spaCy can avoid unnecessary copy operations where
|
||||
possible. You can modify the vectors via the [`Vocab`](/api/vocab) or
|
||||
[`Vectors`](/api/vectors) table. Using the
|
||||
[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach
|
||||
if you have vectors in an arbitrary format, as you can read in the vectors with
|
||||
your own logic, and just set them with a simple loop. This method is likely to
|
||||
be slower than approaches that work with the whole vectors table at once, but
|
||||
it's a great approach for once-off conversions before you save out your model to
|
||||
disk.
|
||||
|
||||
```python
|
||||
### Adding vectors
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
vector_data = {
|
||||
"dog": numpy.random.uniform(-1, 1, (300,)),
|
||||
"cat": numpy.random.uniform(-1, 1, (300,)),
|
||||
"orange": numpy.random.uniform(-1, 1, (300,))
|
||||
}
|
||||
vocab = Vocab()
|
||||
for word, vector in vector_data.items():
|
||||
vocab.set_vector(word, vector)
|
||||
```
|
||||
|
||||
## Language data {#language-data}
|
||||
|
||||
import LanguageData101 from 'usage/101/\_language-data.md'
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
title: Language Processing Pipelines
|
||||
next: /usage/vectors-embeddings
|
||||
next: /usage/embeddings-transformers
|
||||
menu:
|
||||
- ['Processing Text', 'processing']
|
||||
- ['How Pipelines Work', 'pipelines']
|
||||
|
@ -324,9 +324,9 @@ pretrained components and new components trained on your data.
|
|||
|
||||
When reusing components across models, keep in mind that the **vocabulary**,
|
||||
**vectors** and model settings **must match**. If a pretrained model includes
|
||||
[word vectors](/usage/vectors-embeddings) and the component uses them as
|
||||
features, the model you copy it to needs to have the _same_ vectors available –
|
||||
otherwise, it won't be able to make the same predictions.
|
||||
[word vectors](/usage/linguistic-features#vectors-similarity) and the component
|
||||
uses them as features, the model you copy it to needs to have the _same_ vectors
|
||||
available – otherwise, it won't be able to make the same predictions.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -1202,7 +1202,7 @@ document similarity method.
|
|||
Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token`
|
||||
objects by adding a component to the pipeline. For instance, to customize the
|
||||
[`Doc.similarity`](/api/doc#similarity) method, you can add a component that
|
||||
sets a custom function to `doc.user_hooks['similarity']`. The built-in
|
||||
sets a custom function to `doc.user_hooks["similarity"]`. The built-in
|
||||
`Doc.similarity` method will check the `user_hooks` dict, and delegate to your
|
||||
function if you've set one. Similar results can be achieved by setting functions
|
||||
to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
|
||||
|
|
|
@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md'
|
|||
|
||||
To learn more about word vectors, how to **customize them** and how to load
|
||||
**your own vectors** into spaCy, see the usage guide on
|
||||
[using word vectors and semantic similarities](/usage/vectors-embeddings).
|
||||
[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity).
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ ready-to-use spaCy models.
|
|||
|
||||
</Infobox>
|
||||
|
||||
## Quickstart {#quickstart}
|
||||
## Quickstart {#quickstart tag="new"}
|
||||
|
||||
The recommended way to train your spaCy models is via the
|
||||
[`spacy train`](/api/cli#train) command on the command line. It only needs a
|
||||
|
@ -131,7 +131,7 @@ Some of the main advantages and features of spaCy's training config are:
|
|||
multiple components, define them once and reference them as
|
||||
[variables](#config-interpolation).
|
||||
- **Reproducibility with no hidden defaults.** The config file is the "single
|
||||
source of truth" and includes all settings. <!-- TODO: explain this better -->
|
||||
source of truth" and includes all settings.
|
||||
- **Automated checks and validation.** When you load a config, spaCy checks if
|
||||
the settings are complete and if all values have the correct types. This lets
|
||||
you catch potential mistakes early. In your custom architectures, you can use
|
||||
|
@ -667,7 +667,7 @@ visualize your model.
|
|||
|
||||
For more details on how to integrate transformer models into your training
|
||||
config and customize the implementations, see the usage guide on
|
||||
[training transformers](/usage/transformers#training).
|
||||
[training transformers](/usage/embeddings-transformers#transformers-training).
|
||||
|
||||
### Pretraining with spaCy {#pretraining}
|
||||
|
||||
|
|
|
@ -218,7 +218,7 @@ available via `token.orth`.
|
|||
|
||||
The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors
|
||||
assigned to strings, and lets you assign vectors individually, or
|
||||
[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a
|
||||
[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a
|
||||
directory. To help you strike a good balance between coverage and memory usage,
|
||||
the `Vectors` class lets you map **multiple keys** to the **same row** of the
|
||||
table. If you're using the [`spacy init-model`](/api/cli#init-model) command to
|
||||
|
|
|
@ -30,7 +30,7 @@ menu:
|
|||
|
||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||
|
||||
- **Usage:** [Transformers](/usage/transformers),
|
||||
- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers),
|
||||
[Training models](/usage/training)
|
||||
- **API:** [`Transformer`](/api/transformer),
|
||||
[`TransformerData`](/api/transformer#transformerdata),
|
||||
|
@ -59,13 +59,13 @@ menu:
|
|||
|
||||
### New built-in pipeline components {#features-pipeline-components}
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
|
||||
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
|
||||
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
|
||||
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
|
||||
| Name | Description |
|
||||
| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
|
||||
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
|
||||
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
|
||||
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
|
||||
|
||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||
|
||||
|
@ -140,22 +140,20 @@ in your config and see validation errors if the argument values don't match.
|
|||
|
||||
The following methods, attributes and commands are new in spaCy v3.0.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
|
||||
| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
|
||||
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
|
||||
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
|
||||
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
|
||||
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class.s |
|
||||
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
|
||||
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
|
||||
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
||||
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
||||
| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. |
|
||||
| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. |
|
||||
| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
|
||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
||||
| Name | Description |
|
||||
| ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
|
||||
| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
|
||||
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
|
||||
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
|
||||
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
|
||||
| [`Language.has_factory`](/api/language#has_factory)                                                                           | Check whether a component factory is registered on a language class.                                                                                                                             |
|
||||
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_pipe_meta)        | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name.                                                                                       |
|
||||
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
|
||||
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
||||
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
||||
| [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
|
||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
||||
|
||||
## Backwards Incompatibilities {#incompat}
|
||||
|
||||
|
@ -420,15 +418,20 @@ $ python -m spacy convert ./training.json ./output
|
|||
#### Training config {#migrating-training-config}
|
||||
|
||||
The easiest way to get started with a training config is to use the
|
||||
[`init config`](/api/cli#init-config) command. You can start off with a blank
|
||||
config for a new model, copy the config from an existing model, or auto-fill a
|
||||
partial config like a starter config generated by our
|
||||
[quickstart widget](/usage/training#quickstart).
|
||||
[`init config`](/api/cli#init-config) command or the
|
||||
[quickstart widget](/usage/training#quickstart). You can define your
|
||||
requirements, and it will auto-generate a starter config with the best-matching
|
||||
default settings.
|
||||
|
||||
```bash
|
||||
python -m spacy init-config ./config.cfg --lang en --pipeline tagger,parser
|
||||
$ python -m spacy init config ./config.cfg --lang en --pipeline tagger,parser
|
||||
```
|
||||
|
||||
If you've exported a starter config from our
|
||||
[quickstart widget](/usage/training#quickstart), you can use the
|
||||
[`init fill-config`](/api/cli#init-fill-config) command to fill it with all default
|
||||
values. You can then use the auto-generated `config.cfg` for training:
|
||||
|
||||
```diff
|
||||
### {wrap="true"}
|
||||
- python -m spacy train en ./output ./train.json ./dev.json --pipeline tagger,parser --cnn-window 1 --bilstm-depth 0
|
||||
|
|
|
@ -1,224 +0,0 @@
|
|||
---
|
||||
title: Vectors and Embeddings
|
||||
menu:
|
||||
- ["What's a Word Vector?", 'whats-a-vector']
|
||||
- ['Word Vectors', 'vectors']
|
||||
- ['Other Embeddings', 'embeddings']
|
||||
next: /usage/transformers
|
||||
---
|
||||
|
||||
An old idea in linguistics is that you can "know a word by the company it
|
||||
keeps": that is, word meanings can be understood relationally, based on their
|
||||
patterns of usage. This idea inspired a branch of NLP research known as
|
||||
"distributional semantics" that has aimed to compute databases of lexical
|
||||
knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec)
|
||||
family of algorithms is a key milestone in this line of research. For
|
||||
simplicity, we will refer to a distributional word representation as a "word
|
||||
vector", and algorithms that compute word vectors (such as
|
||||
[GloVe](https://nlp.stanford.edu/projects/glove/),
|
||||
[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms".
|
||||
|
||||
Word vector tables are included in some of the spaCy [model packages](/models)
|
||||
we distribute, and you can easily create your own model packages with word
|
||||
vectors you train or download yourself. In some cases you can also add word
|
||||
vectors to an existing pipeline, although each pipeline can only have a single
|
||||
word vectors table, and a model package that already has word vectors is
|
||||
unlikely to work correctly if you replace the vectors with new ones.
|
||||
|
||||
## What's a word vector? {#whats-a-vector}
|
||||
|
||||
For spaCy's purposes, a "word vector" is a 1-dimensional slice from a
|
||||
2-dimensional **vectors table**, with a deterministic mapping from word types to
|
||||
rows in the table.
|
||||
|
||||
```python
|
||||
def what_is_a_word_vector(
|
||||
word_id: int,
|
||||
key2row: Dict[int, int],
|
||||
vectors_table: Floats2d,
|
||||
*,
|
||||
default_row: int=0
|
||||
) -> Floats1d:
|
||||
return vectors_table[key2row.get(word_id, default_row)]
|
||||
```
|
||||
|
||||
Word2vec algorithms try to produce vectors tables that let you estimate useful
|
||||
relationships between words using simple linear algebra operations. For
|
||||
instance, you can often find close synonyms of a word by finding the vectors
|
||||
closest to it by cosine distance, and then finding the words that are mapped to
|
||||
those neighboring vectors. Word vectors can also be useful as features in
|
||||
statistical models.
|
||||
|
||||
### Word vectors vs. contextual language models {#vectors-vs-language-models}
|
||||
|
||||
The key difference between word vectors and contextual language models such as
|
||||
ElMo, BERT and GPT-2 is that word vectors model **lexical types**, rather than
|
||||
_tokens_. If you have a list of terms with no context around them, a model like
|
||||
BERT can't really help you. BERT is designed to understand language **in
|
||||
context**, which isn't what you have. A word vectors table will be a much better
|
||||
fit for your task. However, if you do have words in context — whole sentences or
|
||||
paragraphs of running text — word vectors will only provide a very rough
|
||||
approximation of what the text is about.
|
||||
|
||||
Word vectors are also very computationally efficient, as they map a word to a
|
||||
vector with a single indexing operation. Word vectors are therefore useful as a
|
||||
way to **improve the accuracy** of neural network models, especially models that
|
||||
are small or have received little or no pretraining. In spaCy, word vector
|
||||
tables are only used as **static features**. spaCy does not backpropagate
|
||||
gradients to the pretrained word vectors table. The static vectors table is
|
||||
usually used in combination with a smaller table of learned task-specific
|
||||
embeddings.
|
||||
|
||||
## Using word vectors directly {#vectors}
|
||||
|
||||
spaCy stores word vector information in the
|
||||
[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole
|
||||
vectors table from most spaCy objects. You can also access the vector for a
|
||||
[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or
|
||||
[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or
|
||||
`Span` has multiple tokens, the average of the word vectors will be returned,
|
||||
excluding any "out of vocabulary" entries that have no vector available. If none
|
||||
of the words have a vector, a zeroed vector will be returned.
|
||||
|
||||
The `vector` attribute is a **read-only** numpy or cupy array (depending on
|
||||
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
|
||||
array is read-only so that spaCy can avoid unnecessary copy operations where
|
||||
possible. You can modify the vectors via the `Vocab` or `Vectors` table.
|
||||
|
||||
### Converting word vectors for use in spaCy
|
||||
|
||||
Custom word vectors can be trained using a number of open-source libraries, such
|
||||
as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc),
|
||||
or Tomas Mikolov's original
|
||||
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
||||
word vector libraries output an easy-to-read text-based format, where each line
|
||||
consists of the word followed by its vector. For everyday use, we want to
|
||||
convert the vectors model into a binary format that loads faster and takes up
|
||||
less space on disk. The easiest way to do this is the
|
||||
[`init-model`](/api/cli#init-model) command-line utility:
|
||||
|
||||
```bash
|
||||
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
||||
python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
||||
```
|
||||
|
||||
This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`,
|
||||
giving you access to some nice Latin vectors 😉 You can then pass the directory
|
||||
path to [`spacy.load()`](/api/top-level#spacy.load).
|
||||
|
||||
```python
|
||||
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
|
||||
doc1 = nlp_latin("Caecilius est in horto")
|
||||
doc2 = nlp_latin("servus est in atrio")
|
||||
doc1.similarity(doc2)
|
||||
```
|
||||
|
||||
The model directory will have a `/vocab` directory with the strings, lexical
|
||||
entries and word vectors from the input vectors model. The
|
||||
[`init-model`](/api/cli#init-model) command supports a number of archive formats
|
||||
for the word vectors: the vectors can be in plain text (`.txt`), zipped
|
||||
(`.zip`), or tarred and zipped (`.tgz`).
|
||||
|
||||
### Optimizing vector coverage {#custom-vectors-coverage new="2"}
|
||||
|
||||
To help you strike a good balance between coverage and memory usage, spaCy's
|
||||
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
||||
row** of the table. If you're using the
|
||||
[`spacy init-model`](/api/cli#init-model) command to create a vocabulary,
|
||||
pruning the vectors will be taken care of automatically if you set the
|
||||
`--prune-vectors` flag. You can also do it manually in the following steps:
|
||||
|
||||
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
||||
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
||||
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
||||
English.
|
||||
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
||||
lexemes will be sorted by descending probability to determine which vectors
|
||||
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
|
||||
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
|
||||
vectors you want to keep.
|
||||
|
||||
```python
|
||||
nlp = spacy.load('en_vectors_web_lg')
|
||||
n_vectors = 105000 # number of vectors to keep
|
||||
removed_words = nlp.vocab.prune_vectors(n_vectors)
|
||||
|
||||
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
|
||||
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
|
||||
```
|
||||
|
||||
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
|
||||
table to a given number of unique entries, and returns a dictionary containing
|
||||
the removed words, mapped to `(string, score)` tuples, where `string` is the
|
||||
entry the removed word was mapped to, and `score` the similarity score between
|
||||
the two words.
|
||||
|
||||
```python
|
||||
### Removed words
|
||||
{
|
||||
"Shore": ("coast", 0.732257),
|
||||
"Precautionary": ("caution", 0.490973),
|
||||
"hopelessness": ("sadness", 0.742366),
|
||||
"Continous": ("continuous", 0.732549),
|
||||
"Disemboweled": ("corpse", 0.499432),
|
||||
"biostatistician": ("scientist", 0.339724),
|
||||
"somewheres": ("somewheres", 0.402736),
|
||||
"observing": ("observe", 0.823096),
|
||||
"Leaving": ("leaving", 1.0),
|
||||
}
|
||||
```
|
||||
|
||||
In the example above, the vector for "Shore" was removed and remapped to the
|
||||
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
||||
the vector of "leaving", which is identical. If you're using the
|
||||
[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors`
|
||||
option to easily reduce the size of the vectors as you add them to a spaCy
|
||||
model:
|
||||
|
||||
```bash
|
||||
$ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
||||
```
|
||||
|
||||
This will create a spaCy model with vectors for the first 10,000 words in the
|
||||
vectors model. All other words in the vectors model are mapped to the closest
|
||||
vector among those retained.
|
||||
|
||||
### Adding vectors {#adding-vectors}
|
||||
|
||||
```python
|
||||
### Adding vectors
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
|
||||
"cat": numpy.random.uniform(-1, 1, (300,)),
|
||||
"orange": numpy.random.uniform(-1, 1, (300,))}
|
||||
vocab = Vocab()
|
||||
for word, vector in vector_data.items():
|
||||
vocab.set_vector(word, vector)
|
||||
```
|
||||
|
||||
### Using custom similarity methods {#custom-similarity}
|
||||
|
||||
By default, [`Token.vector`](/api/token#vector) returns the vector for its
|
||||
underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and
|
||||
[`Span.vector`](/api/span#vector) return an average of the vectors of their
|
||||
tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
|
||||
`doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
|
||||
|
||||
<Infobox title="Custom user hooks" emoji="📖">
|
||||
|
||||
For more details on **adding hooks** and **overwriting** the built-in `Doc`,
|
||||
`Span` and `Token` methods, see the usage guide on
|
||||
[user hooks](/usage/processing-pipelines#custom-components-user-hooks).
|
||||
|
||||
</Infobox>
|
||||
|
||||
<!-- TODO:
|
||||
|
||||
### Storing vectors on a GPU {#gpu}
|
||||
|
||||
-->
|
||||
|
||||
## Other embeddings {#embeddings}
|
||||
|
||||
<!-- TODO: something about other embeddings -->
|
|
@ -18,8 +18,11 @@
|
|||
{ "text": "Linguistic Features", "url": "/usage/linguistic-features" },
|
||||
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
|
||||
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
|
||||
{ "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" },
|
||||
{ "text": "Transformers", "url": "/usage/transformers", "tag": "new" },
|
||||
{
|
||||
"text": "Embeddings & Transformers",
|
||||
"url": "/usage/embeddings-transformers",
|
||||
"tag": "new"
|
||||
},
|
||||
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
|
||||
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
|
||||
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },
|
||||
|
|
|
@ -62,11 +62,12 @@ function linkType(el, showLink = true) {
|
|||
|
||||
export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
|
||||
// Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting
|
||||
const TMP_DOT = '•'
|
||||
const TMP_DOT = '۔'
|
||||
const code = Array.isArray(children) ? children.join('') : children || ''
|
||||
const rawStr = code.replace('.', TMP_DOT)
|
||||
const [rawText, meta] = code.split(/(?= \(.+\)$)/)
|
||||
const rawStr = rawText.replace(/\./g, TMP_DOT)
|
||||
const rawHtml = lang === 'none' || !code ? code : highlightCode(lang, rawStr)
|
||||
const html = rawHtml.replace(TMP_DOT, '.').replace(/\n/g, ' ')
|
||||
const html = rawHtml.replace(new RegExp(TMP_DOT, 'g'), '.').replace(/\n/g, ' ')
|
||||
const result = htmlToReact(html)
|
||||
const elements = Array.isArray(result) ? result : [result]
|
||||
const annotClassNames = classNames(
|
||||
|
@ -83,6 +84,7 @@ export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
|
|||
{elements.map((el, i) => (
|
||||
<Fragment key={i}>{linkType(el, !!link)}</Fragment>
|
||||
))}
|
||||
{meta && <span className={classes.typeAnnotationMeta}>{meta}</span>}
|
||||
</code>
|
||||
)
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ function isDividerRow(children) {
|
|||
}
|
||||
|
||||
function isFootRow(children) {
|
||||
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/
|
||||
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
|
||||
if (children.length && children[0].props.name === 'td') {
|
||||
const cellChildren = children[0].props.children
|
||||
if (
|
||||
|
|
|
@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util'
|
|||
import classes from '../styles/typography.module.sass'
|
||||
|
||||
export const H1 = ({ Component = 'h1', className, ...props }) => (
|
||||
<Headline Component={Component} className={classNames(classes.h1, className)} {...props} />
|
||||
<Headline
|
||||
Component={Component}
|
||||
className={classNames(classes.h1, className)}
|
||||
permalink={false}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
export const H2 = ({ className, ...props }) => (
|
||||
<Headline Component="h2" className={classNames(classes.h2, className)} {...props} />
|
||||
|
@ -90,6 +95,7 @@ const Headline = ({
|
|||
source,
|
||||
hidden,
|
||||
action,
|
||||
permalink = true,
|
||||
className,
|
||||
children,
|
||||
}) => {
|
||||
|
@ -102,7 +108,7 @@ const Headline = ({
|
|||
const tags = tag ? tag.split(',').map(t => t.trim()) : []
|
||||
return (
|
||||
<Component id={id} name={name} className={headingClassNames}>
|
||||
<Permalink id={id}>{children} </Permalink>
|
||||
<Permalink id={permalink ? id : null}>{children} </Permalink>
|
||||
{tags.map((tag, i) => (
|
||||
<Tag spaced key={i}>
|
||||
{tag}
|
||||
|
|
|
@ -88,6 +88,10 @@
|
|||
text-transform: uppercase
|
||||
margin-right: 5px
|
||||
|
||||
.type-annotation-meta
|
||||
font-size: 90%
|
||||
color: var(--color-subtle-dark)
|
||||
|
||||
.wrap
|
||||
white-space: pre-wrap
|
||||
word-wrap: anywhere
|
||||
|
|
Loading…
Reference in New Issue
Block a user