Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit c379a4274a by Matthew Honnibal, 2020-09-30 16:52:42 +02:00
22 changed files with 356 additions and 173 deletions

View File

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 ifndef PYVER

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a42,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a41,<8.0.0a50 thinc>=8.0.0a42,<8.0.0a50
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0 ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -65,7 +65,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =

View File

@ -16,6 +16,7 @@ import os
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ENV_VARS
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import Pathy # noqa: F401
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
and custom model implementations. and custom model implementations.
""" """
INIT_HELP = """Commands for initializing configs and pipeline packages.""" INIT_HELP = """Commands for initializing configs and pipeline packages."""
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
# Wrappers for Typer's annotations. Initially created to set defaults and to # Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment. # keep the names short, but not needed at the moment.
@ -65,7 +65,7 @@ def setup_cli() -> None:
def parse_config_overrides( def parse_config_overrides(
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Generate a dictionary of config overrides based on the extra arguments """Generate a dictionary of config overrides based on the extra arguments
provided on the CLI, e.g. --training.batch_size to override provided on the CLI, e.g. --training.batch_size to override

View File

@@ -27,7 +27,7 @@ def init_vectors_cli(
     you can use in the [initialize.vocab] block of your config to initialize
     a model with vectors.
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
@@ -55,14 +55,14 @@ def init_pipeline_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+        nlp = init_nlp(config, use_gpu=use_gpu)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
@@ -81,9 +81,12 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
+    """Generate a JSON file for labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
         output_path.mkdir()
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -93,7 +96,8 @@ def init_labels_cli(
     nlp = init_nlp(config, use_gpu=use_gpu)
     for name, component in nlp.pipeline:
         if getattr(component, "label_data", None) is not None:
-            srsly.write_json(output_path / f"{name}.json", component.label_data)
-            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
         else:
             msg.info(f"No labels found for {name}")

View File

@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [system]
 {% if use_transformer -%}

View File

@ -40,7 +40,7 @@ def train_cli(
DOCS: https://nightly.spacy.io/api/cli#train DOCS: https://nightly.spacy.io/api/cli#train
""" """
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
verify_cli_args(config_path, output_path) verify_cli_args(config_path, output_path)
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code(code_path)
@ -50,6 +50,7 @@ def train_cli(
msg.divider("Initializing pipeline") msg.divider("Initializing pipeline")
with show_validation_error(config_path, hint_fill=False): with show_validation_error(config_path, hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu) nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline") msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, silent=False) train(nlp, output_path, use_gpu=use_gpu, silent=False)

View File

@@ -1,6 +1,6 @@
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 vectors = null
 vocab_data = null
 init_tok2vec = null

View File

@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
     # TODO: fix numbering after merging develop into master
+    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
+            "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
             "return the nlp object but got: {value}. Maybe you forgot to return "
             "the modified object in your function?")

View File

@@ -207,7 +207,7 @@ cdef class Pipe:
         DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
-        raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
+        pass

     def _ensure_examples(self, get_examples):
         if get_examples is None or not hasattr(get_examples, "__call__"):
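With the base `initialize` now a harmless no-op instead of raising, subclasses without trainable state no longer have to override it just to survive `nlp.initialize()`. A minimal sketch (hypothetical component, not part of this commit):

```python
from spacy.pipeline import Pipe

class PassThrough(Pipe):
    """Hypothetical stateless component; relies on the inherited
    no-op initialize() instead of overriding it to dodge E931."""

    def __init__(self, name: str = "pass_through"):
        self.name = name

    def __call__(self, doc):
        # No model and no annotations to set: just return the Doc.
        return doc
```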

View File

@ -14,8 +14,8 @@ from ..util import make_tempdir
nlp_config_string = """ nlp_config_string = """
[paths] [paths]
train = "" train = null
dev = "" dev = null
[corpora] [corpora]
@ -309,7 +309,7 @@ def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False) config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}" assert config["corpora"]["train"]["path"] == "${paths.train}"
interpolated = config.interpolate() interpolated = config.interpolate()
assert interpolated["corpora"]["train"]["path"] == "" assert interpolated["corpora"]["train"]["path"] is None
nlp = English.from_config(config) nlp = English.from_config(config)
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config # Ensure that variables are preserved in nlp config
@ -317,10 +317,10 @@ def test_config_interpolation():
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate() interpolated2 = nlp.config.interpolate()
assert interpolated2["corpora"]["train"]["path"] == "" assert interpolated2["corpora"]["train"]["path"] is None
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated) nlp2 = English.from_config(interpolated)
assert nlp2.config["corpora"]["train"]["path"] == "" assert nlp2.config["corpora"]["train"]["path"] is None
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342

View File

@@ -3,10 +3,11 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.util import ENV_VARS
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
+from spacy.cli._util import string_to_list
 from thinc.api import ConfigValidationError
 import srsly
 import os
@@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):
 def test_parse_cli_overrides():
-    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
     result = parse_config_overrides([])
     assert len(result) == 4
     assert result["x.foo"] == "bar"
     assert result["x.bar"] == 12
     assert result["x.baz"] is False
     assert result["y.foo"] == "hello"
-    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
     assert parse_config_overrides([], env_var=None) == {}
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    del os.environ[OVERRIDES_ENV_VAR]
+    del os.environ[ENV_VARS.CONFIG_OVERRIDES]

 @pytest.mark.parametrize("lang", ["en", "nl"])

View File

@ -7,7 +7,7 @@ import srsly
from .. import util from .. import util
from .augment import dont_augment from .augment import dont_augment
from .example import Example from .example import Example
from ..errors import Warnings from ..errors import Warnings, Errors
from ..tokens import DocBin, Doc from ..tokens import DocBin, Doc
from ..vocab import Vocab from ..vocab import Vocab
@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
@util.registry.readers("spacy.Corpus.v1") @util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader( def create_docbin_reader(
path: Path, path: Optional[Path],
gold_preproc: bool, gold_preproc: bool,
max_length: int = 0, max_length: int = 0,
limit: int = 0, limit: int = 0,
augmenter: Optional[Callable] = None, augmenter: Optional[Callable] = None,
) -> Callable[["Language"], Iterable[Example]]: ) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}") util.logger.debug(f"Loading corpus from path: {path}")
return Corpus( return Corpus(
path, path,
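Combined with the `train = null` / `dev = null` defaults introduced above, a config that never sets its corpus paths now fails fast with E913 instead of trying to read from an empty string. The intended pattern is to supply the paths as CLI overrides, e.g. (paths illustrative):

```
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
```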

View File

@@ -67,10 +67,14 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 # fmt: on

-logging.basicConfig()
+logging.basicConfig(format="%(message)s")
 logger = logging.getLogger("spacy")

+class ENV_VARS:
+    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
+

 class registry(thinc.registry):
     languages = catalogue.create("spacy", "languages", entry_points=True)
     architectures = catalogue.create("spacy", "architectures", entry_points=True)
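The new `ENV_VARS` container gives every call site one canonical home for environment-variable names. A minimal usage sketch, mirroring what the updated test above does:

```python
import os
from spacy.util import ENV_VARS
from spacy.cli._util import parse_config_overrides

os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--training.max_epochs 3"
# With no explicit env_var argument, the env var is picked up by default.
overrides = parse_config_overrides([])
assert overrides == {"training.max_epochs": 3}
```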

View File

@ -32,14 +32,16 @@ streaming.
> gold_preproc = false > gold_preproc = false
> max_length = 0 > max_length = 0
> limit = 0 > limit = 0
> augmenter = null
> ``` > ```
| Name | Description | | Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ | | `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
```python ```python
%%GITHUB_SPACY/spacy/training/corpus.py %%GITHUB_SPACY/spacy/training/corpus.py
@ -74,7 +76,7 @@ train/test skew.
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
## Corpus.\_\_call\_\_ {#call tag="method"} ## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@ -192,7 +192,7 @@ browser. Will run a simple web server.
> ``` > ```
| Name | Description | | Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | | `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
@ -499,11 +499,12 @@ the [`Corpus`](/api/corpus) class.
> ``` > ```
| Name | Description | | Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ | | `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
### JsonlReader {#jsonlreader} ### JsonlReader {#jsonlreader}

File diff suppressed because one or more lines are too long (new image, 83 KiB)

View File

@@ -32,7 +32,7 @@ the [config](/usage/training#config):

 ```ini
 [nlp]
-pipeline = ["tagger", "parser", "ner"]
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
 ```

 import Accordion from 'components/accordion.js'

View File

@@ -167,8 +167,8 @@ the binary data:

 ```python
 ### spacy.load under the hood
 lang = "en"
-pipeline = ["tagger", "parser", "ner"]
-data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
+data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"

 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
@@ -197,9 +197,9 @@ list of human-readable component names.

 ```python
 print(nlp.pipeline)
-# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
 print(nlp.pipe_names)
-# ['tagger', 'parser', 'ner']
+# ['tok2vec', 'tagger', 'parser', 'ner']
 ```

 ### Built-in pipeline components {#built-in}
@@ -1127,9 +1127,9 @@ customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.

 | Name | Description |
-| ------------------------------------ | ------------------------------------------------------------------ |
+| ------------------------------------ | ------------------------------------------------------------------------------------------ |
 | [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
 | [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
 | [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |

View File

@ -6,8 +6,9 @@ menu:
- ['Introduction', 'basics'] - ['Introduction', 'basics']
- ['Quickstart', 'quickstart'] - ['Quickstart', 'quickstart']
- ['Config System', 'config'] - ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] --> - ['Custom Training', 'config-custom']
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
- ['Data Utilities', 'data']
- ['Parallel Training', 'parallel-training'] - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
--- ---
@ -122,7 +123,7 @@ treebank.
</Project> </Project>
## Training config {#config} ## Training config system {#config}
Training config files include all **settings and hyperparameters** for training Training config files include all **settings and hyperparameters** for training
your pipeline. Instead of providing lots of arguments on the command line, you your pipeline. Instead of providing lots of arguments on the command line, you
@ -177,6 +178,7 @@ sections of a config file are:
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. | | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. | | `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). | | `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
| `initialize` | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). |
<Infobox title="Config format and settings" emoji="📖"> <Infobox title="Config format and settings" emoji="📖">
@ -190,6 +192,20 @@ available for the different architectures are documented with the
</Infobox> </Infobox>
### Config lifecycle at runtime and training {#config-lifecycle}
A pipeline's `config.cfg` is considered the "single source of truth", both at
**training** and **runtime**. Under the hood,
[`Language.from_config`](/api/language#from_config) takes care of constructing
the `nlp` object using the settings defined in the config. An `nlp` object's
config is available as [`nlp.config`](/api/language#config) and it includes all
information about the pipeline, as well as the settings used to train and
initialize it.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<!-- TODO: explain lifecycle and initialization -->
### Overwriting config settings on the command line {#config-overrides} ### Overwriting config settings on the command line {#config-overrides}
The config system means that you can define all settings **in one place** and in The config system means that you can define all settings **in one place** and in
@ -233,6 +249,61 @@ defined in the config file.
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh $ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
``` ```
+### Using variable interpolation {#config-interpolation}
+
+Another very useful feature of the config system is that it supports variable
+interpolation for both **values and sections**. This means that you only need to
+define a setting once and can reference it across your config using the
+`${section.value}` syntax. In this example, the value of `seed` is reused within
+the `[training]` block, and the whole block of `[training.optimizer]` is reused
+in `[pretraining]` and will become `pretraining.optimizer`.
+
+```ini
+### config.cfg (excerpt) {highlight="5,18"}
+[system]
+seed = 0
+
+[training]
+seed = ${system.seed}
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+
+[pretraining]
+optimizer = ${training.optimizer}
+```
+
+You can also use variables inside strings. In that case, it works just like
+f-strings in Python. If the value of a variable is not a string, it's converted
+to a string.
+
+```ini
+[paths]
+version = 5
+root = "/Users/you/data"
+train = "${paths.root}/train_${paths.version}.spacy"
+# Result: /Users/you/data/train_5.spacy
+```
+
+<Infobox title="Tip: Override variables on the CLI" emoji="💡">
+
+If you need to change certain values between training runs, you can define them
+once, reference them as variables and then [override](#config-overrides) them on
+the CLI. For example, `--paths.root /other/root` will change the value of `root`
+in the block `[paths]` and the change will be reflected across all other values
+that reference this variable.
+
+</Infobox>
+
+## Customizing the pipeline and training {#config-custom}
+
 ### Defining pipeline components {#config-components}

 You typically train a [pipeline](/usage/processing-pipelines) of **one or more
@@ -353,59 +424,6 @@ stop = 1000
 compound = 1.001
 ```

-### Using variable interpolation {#config-interpolation}
-
-Another very useful feature of the config system is that it supports variable
-interpolation for both **values and sections**. This means that you only need to
-define a setting once and can reference it across your config using the
-`${section.value}` syntax. In this example, the value of `seed` is reused within
-the `[training]` block, and the whole block of `[training.optimizer]` is reused
-in `[pretraining]` and will become `pretraining.optimizer`.
-
-```ini
-### config.cfg (excerpt) {highlight="5,18"}
-[system]
-seed = 0
-
-[training]
-seed = ${system.seed}
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 1e-8
-
-[pretraining]
-optimizer = ${training.optimizer}
-```
-
-You can also use variables inside strings. In that case, it works just like
-f-strings in Python. If the value of a variable is not a string, it's converted
-to a string.
-
-```ini
-[paths]
-version = 5
-root = "/Users/you/data"
-train = "${paths.root}/train_${paths.version}.spacy"
-# Result: /Users/you/data/train_5.spacy
-```
-
-<Infobox title="Tip: Override variables on the CLI" emoji="💡">
-
-If you need to change certain values between training runs, you can define them
-once, reference them as variables and then [override](#config-overrides) them on
-the CLI. For example, `--paths.root /other/root` will change the value of `root`
-in the block `[paths]` and the change will be reflected across all other values
-that reference this variable.
-
-</Infobox>
-
 ### Model architectures {#model-architectures}

 > #### 💡 Model type annotations
@@ -506,17 +524,7 @@ still look good.
 </Accordion>

-<!-- ## Custom functions {#custom-functions}
-
-## Data Utilities {#data-utilities}
-
-* spacy convert
-* The [corpora] block
-* Custom corpus class
-* Minibatching
-* Data augmentation
-
--->
+## Custom Functions {#custom-functions}

 Registered functions in the training config files can refer to built-in
 implementations, but you can also plug in fully **custom implementations**. All
@@ -763,7 +771,96 @@ start = 2
 factor = 1.005
 ```

-#### Example: Custom data reading and batching {#custom-code-readers-batchers}
+### Defining custom architectures {#custom-architectures}
+
+Built-in pipeline components such as the tagger or named entity recognizer are
+constructed with default neural network [models](/api/architectures). You can
+change the model architecture entirely by implementing your own custom models
+and providing those in the config when creating the pipeline component. See the
+documentation on [layers and model architectures](/usage/layers-architectures)
+for more details.
+
+> ```ini
+> ### config.cfg
+> [components.tagger]
+> factory = "tagger"
+>
+> [components.tagger.model]
+> @architectures = "custom_neural_network.v1"
+> output_width = 512
+> ```
+
+```python
+### functions.py
+from typing import List
+from thinc.types import Floats2d
+from thinc.api import Model
+import spacy
+from spacy.tokens import Doc
+
+@spacy.registry.architectures("custom_neural_network.v1")
+def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
+    return create_model(output_width)
+```
+
+## Data utilities {#data}
+
+spaCy includes various features and utilities to make it easy to train from your
+own data. If you have training data in a standard format like `.conll` or
+`.conllu`, the easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+
+```cli
+$ python -m spacy convert ./train.gold.conll ./corpus
+```
+
+<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
+
+Training workflows often consist of multiple steps, from preprocessing the data
+all the way to packaging and deploying the trained model.
+[spaCy projects](/usage/projects) let you define all steps in one file, manage
+data assets, track changes and share your end-to-end processes with your team.
+
+</Infobox>
+
+### Working with corpora {#data-corpora}
+
+> #### Example
+>
+> ```ini
+> [corpora]
+>
+> [corpora.train]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.train}
+> gold_preproc = false
+> max_length = 0
+> limit = 0
+> augmenter = null
+>
+> [training]
+> train_corpus = "corpora.train"
+> ```
+
+The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
+you define **data resources** to use for training, evaluation, pretraining or
+any other custom workflows. `corpora.train` and `corpora.dev` are used as
+conventions within spaCy's default configs, but you can also define any other
+custom blocks. Each section in the corpora config should resolve to a
+[`Corpus`](/api/corpus), for example, using spaCy's built-in
+[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
+file. The `train_corpus` and `dev_corpus` fields in the
+[`[training]`](/api/data-formats#config-training) block specify where to find
+the corpus in your config. This makes it easy to **swap out** different corpora
+by only changing a single config setting.
+
+Instead of making `[corpora]` a block with multiple subsections for each portion
+of the data, you can also use a single function that returns a dictionary of
+corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
+especially useful if you need to split a single file into corpora for training
+and evaluation, without loading the same file twice.
+
+### Custom data reading and batching {#custom-code-readers-batchers}

 Some use-cases require **streaming in data** or manipulating datasets on the
 fly, rather than generating all data beforehand and storing it to file. Instead
@@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
     return create_filtered_batches
 ```

-### Defining custom architectures {#custom-architectures}
-
-Built-in pipeline components such as the tagger or named entity recognizer are
-constructed with default neural network [models](/api/architectures). You can
-change the model architecture entirely by implementing your own custom models
-and providing those in the config when creating the pipeline component. See the
-documentation on [layers and model architectures](/usage/layers-architectures)
-for more details.
-
-> ```ini
-> ### config.cfg
-> [components.tagger]
-> factory = "tagger"
->
-> [components.tagger.model]
-> @architectures = "custom_neural_network.v1"
-> output_width = 512
-> ```
-
-```python
-### functions.py
-from typing import List
-from thinc.types import Floats2d
-from thinc.api import Model
-import spacy
-from spacy.tokens import Doc
-
-@spacy.registry.architectures("custom_neural_network.v1")
-def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
-    return create_model(output_width)
-```
+<!-- TODO:
+* Custom corpus class
+* Minibatching
+* Data augmentation
+-->

 ## Parallel & distributed training with Ray {#parallel-training}
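The "single function returning a dictionary of corpora" option described in the Working with corpora section above ships without a code sample; a sketch of what such a registered reader could look like (the registered name and the split logic are illustrative, not part of this commit):

```python
from typing import Callable, Dict, Iterable
import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy.training import Example

@spacy.registry.readers("split_corpus.v1")  # hypothetical registered name
def create_split_corpora(path: str, dev_fraction: float = 0.2) -> Dict[str, Callable]:
    """Read a single .spacy file and split it into train/dev corpora."""

    def read(nlp: Language, dev: bool) -> Iterable[Example]:
        docs = list(DocBin().from_disk(path).get_docs(nlp.vocab))
        n_dev = int(len(docs) * dev_fraction)
        selected = docs[:n_dev] if dev else docs[n_dev:]
        for doc in selected:
            # Pair an unannotated prediction Doc with the gold reference.
            yield Example(nlp.make_doc(doc.text), doc)

    return {
        "train": lambda nlp: read(nlp, dev=False),
        "dev": lambda nlp: read(nlp, dev=True),
    }
```

In the config, the whole `[corpora]` block would then point at this function via `@readers = "split_corpus.v1"`, while `train_corpus` and `dev_corpus` in `[training]` keep referring to `corpora.train` and `corpora.dev` as before.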

View File

@@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
 and track changes. You can use the
 [quickstart widget](/usage/training#quickstart) or the `init config` command to
 get started. Instead of providing lots of arguments on the command line, you
-only need to pass your `config.cfg` file to `spacy train`.
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).

 Training config files include all **settings and hyperparameters** for training
 your pipeline. Some settings can also be registered **functions** that you can
 swap out and customize, making it easy to implement your own custom models and
 architectures.

+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>

 - **Usage:** [Training pipelines and models](/usage/training)
@@ -723,7 +724,7 @@ nlp = spacy.blank("en")

 Because pipeline components are now added using their string names, you won't
 have to instantiate the [component classes](/api/#architecture-pipeline)
-directly anynore. To configure the component, you can now use the `config`
+directly anymore. To configure the component, you can now use the `config`
 argument on [`nlp.add_pipe`](/api/language#add_pipe).

 > #### config.cfg (excerpt)