Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2020-09-30 16:52:42 +02:00
commit c379a4274a
22 changed files with 356 additions and 173 deletions


@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
endif
ifndef PYVER


@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a41,<8.0.0a50",
"thinc>=8.0.0a42,<8.0.0a50",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"


@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0


@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
@ -65,7 +65,7 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data==0.4.0.dev0
spacy_lookups_data==1.0.0rc0
cuda =
cupy>=5.0.0b4,<9.0.0
cuda80 =


@ -16,6 +16,7 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ENV_VARS
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@ -65,7 +65,7 @@ def setup_cli() -> None:
def parse_config_overrides(
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]:
"""Generate a dictionary of config overrides based on the extra arguments
provided on the CLI, e.g. --training.batch_size to override
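
For illustration, a minimal sketch of the behavior this function implements (the override values below are made up; the parsing behavior matches the CLI tests further down in this commit):

```python
from spacy.cli._util import parse_config_overrides

# Dotted section.key arguments become dictionary keys; values are parsed
# from their string form, so numbers and booleans come back typed.
overrides = parse_config_overrides(
    ["--training.batch_size", "128", "--paths.train", "corpus/train.spacy"]
)
assert overrides == {"training.batch_size": 128, "paths.train": "corpus/train.spacy"}
```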


@ -27,7 +27,7 @@ def init_vectors_cli(
you can use in the [initialize.vocab] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
@ -55,14 +55,14 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
nlp = init_nlp(config, use_gpu=use_gpu)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
@ -81,9 +81,12 @@ def init_labels_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate a JSON file for labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir()
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
@ -93,7 +96,8 @@ def init_labels_cli(
nlp = init_nlp(config, use_gpu=use_gpu)
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
srsly.write_json(output_path / f"{name}.json", component.label_data)
msg.good(f"Saving {name} labels to {output_path}/{name}.json")
output_file = output_path / f"{name}.json"
srsly.write_json(output_file, component.label_data)
msg.good(f"Saving {name} labels to {output_file}")
else:
msg.info(f"No labels found for {name}")


@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = ""
dev = ""
train = null
dev = null
[system]
{% if use_transformer -%}


@ -40,7 +40,7 @@ def train_cli(
DOCS: https://nightly.spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
verify_cli_args(config_path, output_path)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
@ -50,6 +50,7 @@ def train_cli(
msg.divider("Initializing pipeline")
with show_validation_error(config_path, hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, silent=False)


@ -1,6 +1,6 @@
[paths]
train = ""
dev = ""
train = null
dev = null
vectors = null
vocab_data = null
init_tok2vec = null


@ -477,6 +477,8 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")


@ -36,7 +36,7 @@ cdef class Pipe:
@property
def labels(self) -> Optional[Tuple[str]]:
return []
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate
@ -207,7 +207,7 @@ cdef class Pipe:
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
pass
def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"):


@ -14,8 +14,8 @@ from ..util import make_tempdir
nlp_config_string = """
[paths]
train = ""
dev = ""
train = null
dev = null
[corpora]
@ -309,7 +309,7 @@ def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["corpora"]["train"]["path"] == ""
assert interpolated["corpora"]["train"]["path"] is None
nlp = English.from_config(config)
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
@ -317,10 +317,10 @@ def test_config_interpolation():
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
assert interpolated2["corpora"]["train"]["path"] == ""
assert interpolated2["corpora"]["train"]["path"] is None
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated)
assert nlp2.config["corpora"]["train"]["path"] == ""
assert nlp2.config["corpora"]["train"]["path"] is None
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342


@ -3,10 +3,11 @@ from click import NoSuchOption
from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.util import ENV_VARS
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from spacy.cli._util import string_to_list
from thinc.api import ConfigValidationError
import srsly
import os
@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):
def test_parse_cli_overrides():
os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
result = parse_config_overrides([])
assert len(result) == 4
assert result["x.foo"] == "bar"
assert result["x.bar"] == 12
assert result["x.baz"] is False
assert result["y.foo"] == "hello"
os.environ[OVERRIDES_ENV_VAR] = "--x"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
assert parse_config_overrides([], env_var=None) == {}
with pytest.raises(SystemExit):
parse_config_overrides([])
os.environ[OVERRIDES_ENV_VAR] = "hello world"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
with pytest.raises(SystemExit):
parse_config_overrides([])
del os.environ[OVERRIDES_ENV_VAR]
del os.environ[ENV_VARS.CONFIG_OVERRIDES]
@pytest.mark.parametrize("lang", ["en", "nl"])


@ -7,7 +7,7 @@ import srsly
from .. import util
from .augment import dont_augment
from .example import Example
from ..errors import Warnings
from ..errors import Warnings, Errors
from ..tokens import DocBin, Doc
from ..vocab import Vocab
@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
@util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader(
path: Path,
path: Optional[Path],
gold_preproc: bool,
max_length: int = 0,
limit: int = 0,
augmenter: Optional[Callable] = None,
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}")
return Corpus(
path,


@ -67,10 +67,14 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
# fmt: on
logging.basicConfig()
logging.basicConfig(format="%(message)s")
logger = logging.getLogger("spacy")
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
architectures = catalogue.create("spacy", "architectures", entry_points=True)
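
A small sketch of how the new `ENV_VARS.CONFIG_OVERRIDES` constant added above ties into the override parsing (the override string here is illustrative; see the test changes below):

```python
import os

from spacy.cli._util import parse_config_overrides
from spacy.util import ENV_VARS

# Overrides set via the environment variable behave like extra CLI arguments.
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--training.max_epochs 3 --system.seed 0"
assert parse_config_overrides([]) == {"training.max_epochs": 3, "system.seed": 0}
del os.environ[ENV_VARS.CONFIG_OVERRIDES]
```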


@ -32,14 +32,16 @@ streaming.
> gold_preproc = false
> max_length = 0
> limit = 0
> augmenter = null
> ```
| Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/training/corpus.py
@ -74,7 +76,7 @@ train/test skew.
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
## Corpus.\_\_call\_\_ {#call tag="method"}


@ -191,16 +191,16 @@ browser. Will run a simple web server.
> displacy.serve([doc1, doc2], style="dep")
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| Name | Description |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
### displacy.render {#displacy.render tag="method" new="2"}
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ |
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
| Name | Description |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
@ -498,12 +498,13 @@ the [`Corpus`](/api/corpus) class.
> limit = 0
> ```
| Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
### JsonlReader {#jsonlreader}
@ -935,7 +936,7 @@ Compile a sequence of prefix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -952,7 +953,7 @@ Compile a sequence of suffix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -969,7 +970,7 @@ Compile a sequence of infix rules into a regex object.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.minibatch {#util.minibatch tag="function" new="2"}

File diff suppressed because one or more lines are too long (new image, 83 KiB)


@ -32,7 +32,7 @@ the [config](/usage/training#config):
```ini
[nlp]
pipeline = ["tagger", "parser", "ner"]
pipeline = ["tok2vec", "tagger", "parser", "ner"]
```
import Accordion from 'components/accordion.js'


@ -167,8 +167,8 @@ the binary data:
```python
### spacy.load under the hood
lang = "en"
pipeline = ["tagger", "parser", "ner"]
data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
pipeline = ["tok2vec", "tagger", "parser", "ner"]
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
nlp = cls() # 2. Initialize it
@ -197,9 +197,9 @@ list of human-readable component names.
```python
print(nlp.pipeline)
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
print(nlp.pipe_names)
# ['tagger', 'parser', 'ner']
# ['tok2vec', 'tagger', 'parser', 'ner']
```
### Built-in pipeline components {#built-in}
@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.
| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
| Name | Description |
| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
<Infobox title="Custom trainable components and models" emoji="📖">


@ -6,8 +6,9 @@ menu:
- ['Introduction', 'basics']
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] -->
- ['Custom Training', 'config-custom']
- ['Custom Functions', 'custom-functions']
- ['Data Utilities', 'data']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@ -122,7 +123,7 @@ treebank.
</Project>
## Training config {#config}
## Training config system {#config}
Training config files include all **settings and hyperparameters** for training
your pipeline. Instead of providing lots of arguments on the command line, you
@ -177,6 +178,7 @@ sections of a config file are:
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
| `initialize` | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). |
<Infobox title="Config format and settings" emoji="📖">
@ -190,6 +192,20 @@ available for the different architectures are documented with the
</Infobox>
### Config lifecycle at runtime and training {#config-lifecycle}
A pipeline's `config.cfg` is considered the "single source of truth", both at
**training** and **runtime**. Under the hood,
[`Language.from_config`](/api/language#from_config) takes care of constructing
the `nlp` object using the settings defined in the config. An `nlp` object's
config is available as [`nlp.config`](/api/language#config) and it includes all
information about the pipeline, as well as the settings used to train and
initialize it.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<!-- TODO: explain lifecycle and initialization -->
### Overwriting config settings on the command line {#config-overrides}
The config system means that you can define all settings **in one place** and in
@ -233,6 +249,61 @@ defined in the config file.
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
```
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
`${section.value}` syntax. In this example, the value of `seed` is reused within
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
[system]
seed = 0
[training]
seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[pretraining]
optimizer = ${training.optimizer}
```
You can also use variables inside strings. In that case, it works just like
f-strings in Python. If the value of a variable is not a string, it's converted
to a string.
```ini
[paths]
version = 5
root = "/Users/you/data"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
If you need to change certain values between training runs, you can define them
once, reference them as variables and then [override](#config-overrides) them on
the CLI. For example, `--paths.root /other/root` will change the value of `root`
in the block `[paths]` and the change will be reflected across all other values
that reference this variable.
</Infobox>
## Customizing the pipeline and training {#config-custom}
### Defining pipeline components {#config-components}
You typically train a [pipeline](/usage/processing-pipelines) of **one or more
@ -353,59 +424,6 @@ stop = 1000
compound = 1.001
```
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
`${section.value}` syntax. In this example, the value of `seed` is reused within
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
[system]
seed = 0
[training]
seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[pretraining]
optimizer = ${training.optimizer}
```
You can also use variables inside strings. In that case, it works just like
f-strings in Python. If the value of a variable is not a string, it's converted
to a string.
```ini
[paths]
version = 5
root = "/Users/you/data"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
If you need to change certain values between training runs, you can define them
once, reference them as variables and then [override](#config-overrides) them on
the CLI. For example, `--paths.root /other/root` will change the value of `root`
in the block `[paths]` and the change will be reflected across all other values
that reference this variable.
</Infobox>
### Model architectures {#model-architectures}
> #### 💡 Model type annotations
@ -506,17 +524,7 @@ still look good.
</Accordion>
<!--
## Data Utilities {#data-utilities}
* spacy convert
* The [corpora] block
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Custom Functions {#custom-functions}
## Custom functions {#custom-functions}
Registered functions in the training config files can refer to built-in
implementations, but you can also plug in fully **custom implementations**. All
@ -763,7 +771,96 @@ start = 2
factor = 1.005
```
#### Example: Custom data reading and batching {#custom-code-readers-batchers}
### Defining custom architectures {#custom-architectures}
Built-in pipeline components such as the tagger or named entity recognizer are
constructed with default neural network [models](/api/architectures). You can
change the model architecture entirely by implementing your own custom models
and providing those in the config when creating the pipeline component. See the
documentation on [layers and model architectures](/usage/layers-architectures)
for more details.
> ```ini
> ### config.cfg
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "custom_neural_network.v1"
> output_width = 512
> ```
```python
### functions.py
from typing import List
from thinc.types import Floats2d
from thinc.api import Model
import spacy
from spacy.tokens import Doc
@spacy.registry.architectures("custom_neural_network.v1")
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train from your
own data. If you have training data in a standard format like `.conll` or
`.conllu`, the easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
```cli
$ python -m spacy convert ./train.gold.conll ./corpus
```
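
The command writes binary `.spacy` files (serialized [`DocBin`](/api/docbin) objects) to the output directory. A hedged sketch of inspecting the result, assuming the output file is named after the input:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Hypothetical path: the converted file sits in the chosen output directory
doc_bin = DocBin().from_disk("./corpus/train.gold.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
print(f"{len(docs)} docs with gold annotations")
```
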
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
Training workflows often consist of multiple steps, from preprocessing the data
all the way to packaging and deploying the trained model.
[spaCy projects](/usage/projects) let you define all steps in one file, manage
data assets, track changes and share your end-to-end processes with your team.
</Infobox>
### Working with corpora {#data-corpora}
> #### Example
>
> ```ini
> [corpora]
>
> [corpora.train]
> @readers = "spacy.Corpus.v1"
> path = ${paths.train}
> gold_preproc = false
> max_length = 0
> limit = 0
> augmenter = null
>
> [training]
> train_corpus = "corpora.train"
> ```
The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
you define **data resources** to use for training, evaluation, pretraining or
any other custom workflows. `corpora.train` and `corpora.dev` are used as
conventions within spaCy's default configs, but you can also define any other
custom blocks. Each section in the corpora config should resolve to a
[`Corpus`](/api/corpus), for example using spaCy's built-in
[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
file. The `train_corpus` and `dev_corpus` fields in the
[`[training]`](/api/data-formats#config-training) block specify where to find
the corpus in your config. This makes it easy to **swap out** different corpora
by only changing a single config setting.
Instead of making `[corpora]` a block with multiple subsections for each portion
of the data, you can also use a single function that returns a dictionary of
corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
especially useful if you need to split a single file into corpora for training
and evaluation, without loading the same file twice.
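
A minimal sketch of that pattern, assuming a hypothetical registry name `my_corpora.v1` (the split logic is illustrative and reads the annotations lazily per corpus):

```python
### functions.py (illustrative)
from pathlib import Path
from typing import Callable, Dict, Iterable

import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy.training import Example


@spacy.registry.readers("my_corpora.v1")  # hypothetical name, not a built-in reader
def create_corpora(
    path: Path, dev_fraction: float = 0.2
) -> Dict[str, Callable[[Language], Iterable[Example]]]:
    """Read a single .spacy file and return train and dev corpora, keyed by name."""

    def read_split(nlp: Language, dev: bool) -> Iterable[Example]:
        docs = list(DocBin().from_disk(path).get_docs(nlp.vocab))
        n_dev = int(len(docs) * dev_fraction)
        for reference in docs[:n_dev] if dev else docs[n_dev:]:
            # The annotated doc is the reference, a freshly tokenized doc the prediction
            yield Example(nlp.make_doc(reference.text), reference)

    return {
        "train": lambda nlp: read_split(nlp, dev=False),
        "dev": lambda nlp: read_split(nlp, dev=True),
    }
```

In the config, `[corpora]` would then point at this function via `@readers = "my_corpora.v1"` and a `path` setting, while `[training]` keeps `train_corpus = "corpora.train"` and `dev_corpus = "corpora.dev"`.
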
### Custom data reading and batching {#custom-code-readers-batchers}
Some use-cases require **streaming in data** or manipulating datasets on the
fly, rather than generating all data beforehand and storing it to file. Instead
@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
return create_filtered_batches
```
### Defining custom architectures {#custom-architectures}
Built-in pipeline components such as the tagger or named entity recognizer are
constructed with default neural network [models](/api/architectures). You can
change the model architecture entirely by implementing your own custom models
and providing those in the config when creating the pipeline component. See the
documentation on [layers and model architectures](/usage/layers-architectures)
for more details.
> ```ini
> ### config.cfg
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "custom_neural_network.v1"
> output_width = 512
> ```
```python
### functions.py
from typing import List
from thinc.types import Floats2d
from thinc.api import Model
import spacy
from spacy.tokens import Doc
@spacy.registry.architectures("custom_neural_network.v1")
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
<!-- TODO:
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Parallel & distributed training with Ray {#parallel-training}


@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
and track changes. You can use the
[quickstart widget](/usage/training#quickstart) or the `init config` command to
get started. Instead of providing lots of arguments on the command line, you
only need to pass your `config.cfg` file to `spacy train`.
only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
Training config files include all **settings and hyperparameters** for training
your pipeline. Some settings can also be registered **functions** that you can
swap out and customize, making it easy to implement your own custom models and
architectures.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage:** [Training pipelines and models](/usage/training)
@ -723,7 +724,7 @@ nlp = spacy.blank("en")
Because pipeline components are now added using their string names, you won't
have to instantiate the [component classes](/api/#architecture-pipeline)
directly anynore. To configure the component, you can now use the `config`
directly anymore. To configure the component, you can now use the `config`
argument on [`nlp.add_pipe`](/api/language#add_pipe).
> #### config.cfg (excerpt)