mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c379a4274a
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
ifndef SPACY_EXTRAS
|
ifndef SPACY_EXTRAS
|
||||||
override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef PYVER
|
ifndef PYVER
|
||||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a41,<8.0.0a50",
|
"thinc>=8.0.0a42,<8.0.0a50",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"pathy"
|
"pathy"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a41,<8.0.0a50
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets==0.2.0a0
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a41,<8.0.0a50
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a41,<8.0.0a50
|
thinc>=8.0.0a42,<8.0.0a50
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
@ -65,7 +65,7 @@ console_scripts =
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data==0.4.0.dev0
|
spacy_lookups_data==1.0.0rc0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4,<9.0.0
|
cupy>=5.0.0b4,<9.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
|
|
|
@ -16,6 +16,7 @@ import os
|
||||||
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
|
from ..util import ENV_VARS
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import Pathy # noqa: F401
|
||||||
|
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
|
||||||
and custom model implementations.
|
and custom model implementations.
|
||||||
"""
|
"""
|
||||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||||
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
|
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
@ -65,7 +65,7 @@ def setup_cli() -> None:
|
||||||
|
|
||||||
|
|
||||||
def parse_config_overrides(
|
def parse_config_overrides(
|
||||||
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
|
args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Generate a dictionary of config overrides based on the extra arguments
|
"""Generate a dictionary of config overrides based on the extra arguments
|
||||||
provided on the CLI, e.g. --training.batch_size to override
|
provided on the CLI, e.g. --training.batch_size to override
|
||||||
|
|
|
@ -27,7 +27,7 @@ def init_vectors_cli(
|
||||||
you can use in the [initialize.vocab] block of your config to initialize
|
you can use in the [initialize.vocab] block of your config to initialize
|
||||||
a model with vectors.
|
a model with vectors.
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||||
|
@ -55,14 +55,14 @@ def init_pipeline_cli(
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
setup_gpu(use_gpu)
|
setup_gpu(use_gpu)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(config_path, overrides=overrides)
|
config = util.load_config(config_path, overrides=overrides)
|
||||||
with show_validation_error(hint_fill=False):
|
with show_validation_error(hint_fill=False):
|
||||||
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
nlp.to_disk(output_path)
|
nlp.to_disk(output_path)
|
||||||
msg.good(f"Saved initialized pipeline to {output_path}")
|
msg.good(f"Saved initialized pipeline to {output_path}")
|
||||||
|
|
||||||
|
@ -81,9 +81,12 @@ def init_labels_cli(
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
|
"""Generate a JSON file for labels in the data. This helps speed up the
|
||||||
|
training process, since spaCy won't have to preprocess the data to
|
||||||
|
extract the labels."""
|
||||||
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
setup_gpu(use_gpu)
|
setup_gpu(use_gpu)
|
||||||
|
@ -93,7 +96,8 @@ def init_labels_cli(
|
||||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if getattr(component, "label_data", None) is not None:
|
if getattr(component, "label_data", None) is not None:
|
||||||
srsly.write_json(output_path / f"{name}.json", component.label_data)
|
output_file = output_path / f"{name}.json"
|
||||||
msg.good(f"Saving {name} labels to {output_path}/{name}.json")
|
srsly.write_json(output_file, component.label_data)
|
||||||
|
msg.good(f"Saving {name} labels to {output_file}")
|
||||||
else:
|
else:
|
||||||
msg.info(f"No labels found for {name}")
|
msg.info(f"No labels found for {name}")
|
||||||
|
|
|
@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
|
||||||
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
|
|
|
@ -40,7 +40,7 @@ def train_cli(
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#train
|
DOCS: https://nightly.spacy.io/api/cli#train
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
verify_cli_args(config_path, output_path)
|
verify_cli_args(config_path, output_path)
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
|
@ -50,6 +50,7 @@ def train_cli(
|
||||||
msg.divider("Initializing pipeline")
|
msg.divider("Initializing pipeline")
|
||||||
with show_validation_error(config_path, hint_fill=False):
|
with show_validation_error(config_path, hint_fill=False):
|
||||||
nlp = init_nlp(config, use_gpu=use_gpu)
|
nlp = init_nlp(config, use_gpu=use_gpu)
|
||||||
|
msg.good("Initialized pipeline")
|
||||||
msg.divider("Training pipeline")
|
msg.divider("Training pipeline")
|
||||||
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
vectors = null
|
vectors = null
|
||||||
vocab_data = null
|
vocab_data = null
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
|
|
|
@ -477,6 +477,8 @@ class Errors:
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
|
"config.cfg or override it on the CLI?")
|
||||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
"return the nlp object but got: {value}. Maybe you forgot to return "
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
"the modified object in your function?")
|
"the modified object in your function?")
|
||||||
|
|
|
@ -207,7 +207,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
|
pass
|
||||||
|
|
||||||
def _ensure_examples(self, get_examples):
|
def _ensure_examples(self, get_examples):
|
||||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||||
|
|
|
@ -14,8 +14,8 @@ from ..util import make_tempdir
|
||||||
|
|
||||||
nlp_config_string = """
|
nlp_config_string = """
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = null
|
||||||
dev = ""
|
dev = null
|
||||||
|
|
||||||
[corpora]
|
[corpora]
|
||||||
|
|
||||||
|
@ -309,7 +309,7 @@ def test_config_interpolation():
|
||||||
config = Config().from_str(nlp_config_string, interpolate=False)
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
assert config["corpora"]["train"]["path"] == "${paths.train}"
|
assert config["corpora"]["train"]["path"] == "${paths.train}"
|
||||||
interpolated = config.interpolate()
|
interpolated = config.interpolate()
|
||||||
assert interpolated["corpora"]["train"]["path"] == ""
|
assert interpolated["corpora"]["train"]["path"] is None
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
|
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
|
||||||
# Ensure that variables are preserved in nlp config
|
# Ensure that variables are preserved in nlp config
|
||||||
|
@ -317,10 +317,10 @@ def test_config_interpolation():
|
||||||
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
interpolated2 = nlp.config.interpolate()
|
interpolated2 = nlp.config.interpolate()
|
||||||
assert interpolated2["corpora"]["train"]["path"] == ""
|
assert interpolated2["corpora"]["train"]["path"] is None
|
||||||
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
nlp2 = English.from_config(interpolated)
|
nlp2 = English.from_config(interpolated)
|
||||||
assert nlp2.config["corpora"]["train"]["path"] == ""
|
assert nlp2.config["corpora"]["train"]["path"] is None
|
||||||
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,11 @@ from click import NoSuchOption
|
||||||
from spacy.training import docs_to_json, offsets_to_biluo_tags
|
from spacy.training import docs_to_json, offsets_to_biluo_tags
|
||||||
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
||||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||||
|
from spacy.util import ENV_VARS
|
||||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||||
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
from spacy.cli._util import string_to_list
|
||||||
from thinc.api import ConfigValidationError
|
from thinc.api import ConfigValidationError
|
||||||
import srsly
|
import srsly
|
||||||
import os
|
import os
|
||||||
|
@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):
|
||||||
|
|
||||||
|
|
||||||
def test_parse_cli_overrides():
|
def test_parse_cli_overrides():
|
||||||
os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
|
overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
|
||||||
|
os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
|
||||||
result = parse_config_overrides([])
|
result = parse_config_overrides([])
|
||||||
assert len(result) == 4
|
assert len(result) == 4
|
||||||
assert result["x.foo"] == "bar"
|
assert result["x.foo"] == "bar"
|
||||||
assert result["x.bar"] == 12
|
assert result["x.bar"] == 12
|
||||||
assert result["x.baz"] is False
|
assert result["x.baz"] is False
|
||||||
assert result["y.foo"] == "hello"
|
assert result["y.foo"] == "hello"
|
||||||
os.environ[OVERRIDES_ENV_VAR] = "--x"
|
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
|
||||||
assert parse_config_overrides([], env_var=None) == {}
|
assert parse_config_overrides([], env_var=None) == {}
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
parse_config_overrides([])
|
parse_config_overrides([])
|
||||||
os.environ[OVERRIDES_ENV_VAR] = "hello world"
|
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
parse_config_overrides([])
|
parse_config_overrides([])
|
||||||
del os.environ[OVERRIDES_ENV_VAR]
|
del os.environ[ENV_VARS.CONFIG_OVERRIDES]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||||
|
|
|
@ -7,7 +7,7 @@ import srsly
|
||||||
from .. import util
|
from .. import util
|
||||||
from .augment import dont_augment
|
from .augment import dont_augment
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from ..errors import Warnings
|
from ..errors import Warnings, Errors
|
||||||
from ..tokens import DocBin, Doc
|
from ..tokens import DocBin, Doc
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
|
||||||
|
@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
|
||||||
|
|
||||||
@util.registry.readers("spacy.Corpus.v1")
|
@util.registry.readers("spacy.Corpus.v1")
|
||||||
def create_docbin_reader(
|
def create_docbin_reader(
|
||||||
path: Path,
|
path: Optional[Path],
|
||||||
gold_preproc: bool,
|
gold_preproc: bool,
|
||||||
max_length: int = 0,
|
max_length: int = 0,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
augmenter: Optional[Callable] = None,
|
augmenter: Optional[Callable] = None,
|
||||||
) -> Callable[["Language"], Iterable[Example]]:
|
) -> Callable[["Language"], Iterable[Example]]:
|
||||||
|
if path is None:
|
||||||
|
raise ValueError(Errors.E913)
|
||||||
util.logger.debug(f"Loading corpus from path: {path}")
|
util.logger.debug(f"Loading corpus from path: {path}")
|
||||||
return Corpus(
|
return Corpus(
|
||||||
path,
|
path,
|
||||||
|
|
|
@ -67,10 +67,14 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig(format="%(message)s")
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
|
|
||||||
|
|
||||||
|
class ENV_VARS:
|
||||||
|
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
|
||||||
|
|
||||||
|
|
||||||
class registry(thinc.registry):
|
class registry(thinc.registry):
|
||||||
languages = catalogue.create("spacy", "languages", entry_points=True)
|
languages = catalogue.create("spacy", "languages", entry_points=True)
|
||||||
architectures = catalogue.create("spacy", "architectures", entry_points=True)
|
architectures = catalogue.create("spacy", "architectures", entry_points=True)
|
||||||
|
|
|
@ -32,14 +32,16 @@ streaming.
|
||||||
> gold_preproc = false
|
> gold_preproc = false
|
||||||
> max_length = 0
|
> max_length = 0
|
||||||
> limit = 0
|
> limit = 0
|
||||||
|
> augmenter = null
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|
||||||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
|
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
|
||||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||||
|
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/training/corpus.py
|
%%GITHUB_SPACY/spacy/training/corpus.py
|
||||||
|
@ -74,7 +76,7 @@ train/test skew.
|
||||||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
|
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
|
||||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||||
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
|
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
|
||||||
|
|
||||||
## Corpus.\_\_call\_\_ {#call tag="method"}
|
## Corpus.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -192,7 +192,7 @@ browser. Will run a simple web server.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||||
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
|
@ -499,11 +499,12 @@ the [`Corpus`](/api/corpus) class.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|
||||||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
|
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
|
||||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||||
|
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
### JsonlReader {#jsonlreader}
|
### JsonlReader {#jsonlreader}
|
||||||
|
|
||||||
|
|
93
website/docs/images/lifecycle.svg
Normal file
93
website/docs/images/lifecycle.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 83 KiB |
|
@ -32,7 +32,7 @@ the [config](/usage/training#config):
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
[nlp]
|
[nlp]
|
||||||
pipeline = ["tagger", "parser", "ner"]
|
pipeline = ["tok2vec", "tagger", "parser", "ner"]
|
||||||
```
|
```
|
||||||
|
|
||||||
import Accordion from 'components/accordion.js'
|
import Accordion from 'components/accordion.js'
|
||||||
|
|
|
@ -167,8 +167,8 @@ the binary data:
|
||||||
```python
|
```python
|
||||||
### spacy.load under the hood
|
### spacy.load under the hood
|
||||||
lang = "en"
|
lang = "en"
|
||||||
pipeline = ["tagger", "parser", "ner"]
|
pipeline = ["tok2vec", "tagger", "parser", "ner"]
|
||||||
data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
|
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
|
||||||
|
|
||||||
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
|
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
|
||||||
nlp = cls() # 2. Initialize it
|
nlp = cls() # 2. Initialize it
|
||||||
|
@ -197,9 +197,9 @@ list of human-readable component names.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
print(nlp.pipeline)
|
print(nlp.pipeline)
|
||||||
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
|
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
|
||||||
print(nlp.pipe_names)
|
print(nlp.pipe_names)
|
||||||
# ['tagger', 'parser', 'ner']
|
# ['tok2vec', 'tagger', 'parser', 'ner']
|
||||||
```
|
```
|
||||||
|
|
||||||
### Built-in pipeline components {#built-in}
|
### Built-in pipeline components {#built-in}
|
||||||
|
@ -1127,9 +1127,9 @@ customize how the model is updated from examples, how it's initialized, how the
|
||||||
loss is calculated and to add evaluation scores to the training output.
|
loss is calculated and to add evaluation scores to the training output.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
|
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
|
||||||
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
|
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
|
||||||
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
|
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
|
||||||
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
|
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
|
||||||
|
|
||||||
|
|
|
@ -6,8 +6,9 @@ menu:
|
||||||
- ['Introduction', 'basics']
|
- ['Introduction', 'basics']
|
||||||
- ['Quickstart', 'quickstart']
|
- ['Quickstart', 'quickstart']
|
||||||
- ['Config System', 'config']
|
- ['Config System', 'config']
|
||||||
<!-- - ['Data Utilities', 'data'] -->
|
- ['Custom Training', 'config-custom']
|
||||||
- ['Custom Functions', 'custom-functions']
|
- ['Custom Functions', 'custom-functions']
|
||||||
|
- ['Data Utilities', 'data']
|
||||||
- ['Parallel Training', 'parallel-training']
|
- ['Parallel Training', 'parallel-training']
|
||||||
- ['Internal API', 'api']
|
- ['Internal API', 'api']
|
||||||
---
|
---
|
||||||
|
@ -122,7 +123,7 @@ treebank.
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
||||||
## Training config {#config}
|
## Training config system {#config}
|
||||||
|
|
||||||
Training config files include all **settings and hyperparameters** for training
|
Training config files include all **settings and hyperparameters** for training
|
||||||
your pipeline. Instead of providing lots of arguments on the command line, you
|
your pipeline. Instead of providing lots of arguments on the command line, you
|
||||||
|
@ -177,6 +178,7 @@ sections of a config file are:
|
||||||
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
|
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
|
||||||
| `training` | Settings and controls for the training and evaluation process. |
|
| `training` | Settings and controls for the training and evaluation process. |
|
||||||
| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
|
| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
|
||||||
|
| `initialize` | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). |
|
||||||
|
|
||||||
<Infobox title="Config format and settings" emoji="📖">
|
<Infobox title="Config format and settings" emoji="📖">
|
||||||
|
|
||||||
|
@ -190,6 +192,20 @@ available for the different architectures are documented with the
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
### Config lifecycle at runtime and training {#config-lifecycle}
|
||||||
|
|
||||||
|
A pipeline's `config.cfg` is considered the "single source of truth", both at
|
||||||
|
**training** and **runtime**. Under the hood,
|
||||||
|
[`Language.from_config`](/api/language#from_config) takes care of constructing
|
||||||
|
the `nlp` object using the settings defined in the config. An `nlp` object's
|
||||||
|
config is available as [`nlp.config`](/api/language#config) and it includes all
|
||||||
|
information about the pipeline, as well as the settings used to train and
|
||||||
|
initialize it.
|
||||||
|
|
||||||
|
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
|
||||||
|
|
||||||
|
<!-- TODO: explain lifecycle and initialization -->
|
||||||
|
|
||||||
### Overwriting config settings on the command line {#config-overrides}
|
### Overwriting config settings on the command line {#config-overrides}
|
||||||
|
|
||||||
The config system means that you can define all settings **in one place** and in
|
The config system means that you can define all settings **in one place** and in
|
||||||
|
@ -233,6 +249,61 @@ defined in the config file.
|
||||||
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
|
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Using variable interpolation {#config-interpolation}
|
||||||
|
|
||||||
|
Another very useful feature of the config system is that it supports variable
|
||||||
|
interpolation for both **values and sections**. This means that you only need to
|
||||||
|
define a setting once and can reference it across your config using the
|
||||||
|
`${section.value}` syntax. In this example, the value of `seed` is reused within
|
||||||
|
the `[training]` block, and the whole block of `[training.optimizer]` is reused
|
||||||
|
in `[pretraining]` and will become `pretraining.optimizer`.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt) {highlight="5,18"}
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
|
||||||
|
[training]
|
||||||
|
seed = ${system.seed}
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 1e-8
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
optimizer = ${training.optimizer}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also use variables inside strings. In that case, it works just like
|
||||||
|
f-strings in Python. If the value of a variable is not a string, it's converted
|
||||||
|
to a string.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[paths]
|
||||||
|
version = 5
|
||||||
|
root = "/Users/you/data"
|
||||||
|
train = "${paths.root}/train_${paths.version}.spacy"
|
||||||
|
# Result: /Users/you/data/train_5.spacy
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
|
||||||
|
|
||||||
|
If you need to change certain values between training runs, you can define them
|
||||||
|
once, reference them as variables and then [override](#config-overrides) them on
|
||||||
|
the CLI. For example, `--paths.root /other/root` will change the value of `root`
|
||||||
|
in the block `[paths]` and the change will be reflected across all other values
|
||||||
|
that reference this variable.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
## Customizing the pipeline and training {#config-custom}
|
||||||
|
|
||||||
### Defining pipeline components {#config-components}
|
### Defining pipeline components {#config-components}
|
||||||
|
|
||||||
You typically train a [pipeline](/usage/processing-pipelines) of **one or more
|
You typically train a [pipeline](/usage/processing-pipelines) of **one or more
|
||||||
|
@ -353,59 +424,6 @@ stop = 1000
|
||||||
compound = 1.001
|
compound = 1.001
|
||||||
```
|
```
|
||||||
|
|
||||||
### Using variable interpolation {#config-interpolation}
|
|
||||||
|
|
||||||
Another very useful feature of the config system is that it supports variable
|
|
||||||
interpolation for both **values and sections**. This means that you only need to
|
|
||||||
define a setting once and can reference it across your config using the
|
|
||||||
`${section.value}` syntax. In this example, the value of `seed` is reused within
|
|
||||||
the `[training]` block, and the whole block of `[training.optimizer]` is reused
|
|
||||||
in `[pretraining]` and will become `pretraining.optimizer`.
|
|
||||||
|
|
||||||
```ini
|
|
||||||
### config.cfg (excerpt) {highlight="5,18"}
|
|
||||||
[system]
|
|
||||||
seed = 0
|
|
||||||
|
|
||||||
[training]
|
|
||||||
seed = ${system.seed}
|
|
||||||
|
|
||||||
[training.optimizer]
|
|
||||||
@optimizers = "Adam.v1"
|
|
||||||
beta1 = 0.9
|
|
||||||
beta2 = 0.999
|
|
||||||
L2_is_weight_decay = true
|
|
||||||
L2 = 0.01
|
|
||||||
grad_clip = 1.0
|
|
||||||
use_averages = false
|
|
||||||
eps = 1e-8
|
|
||||||
|
|
||||||
[pretraining]
|
|
||||||
optimizer = ${training.optimizer}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also use variables inside strings. In that case, it works just like
|
|
||||||
f-strings in Python. If the value of a variable is not a string, it's converted
|
|
||||||
to a string.
|
|
||||||
|
|
||||||
```ini
|
|
||||||
[paths]
|
|
||||||
version = 5
|
|
||||||
root = "/Users/you/data"
|
|
||||||
train = "${paths.root}/train_${paths.version}.spacy"
|
|
||||||
# Result: /Users/you/data/train_5.spacy
|
|
||||||
```
|
|
||||||
|
|
||||||
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
|
|
||||||
|
|
||||||
If you need to change certain values between training runs, you can define them
|
|
||||||
once, reference them as variables and then [override](#config-overrides) them on
|
|
||||||
the CLI. For example, `--paths.root /other/root` will change the value of `root`
|
|
||||||
in the block `[paths]` and the change will be reflected across all other values
|
|
||||||
that reference this variable.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
### Model architectures {#model-architectures}
|
### Model architectures {#model-architectures}
|
||||||
|
|
||||||
> #### 💡 Model type annotations
|
> #### 💡 Model type annotations
|
||||||
|
@ -506,17 +524,7 @@ still look good.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
<!--
|
## Custom functions {#custom-functions}
|
||||||
## Data Utilities {#data-utilities}
|
|
||||||
|
|
||||||
* spacy convert
|
|
||||||
* The [corpora] block
|
|
||||||
* Custom corpus class
|
|
||||||
* Minibatching
|
|
||||||
* Data augmentation
|
|
||||||
-->
|
|
||||||
|
|
||||||
## Custom Functions {#custom-functions}
|
|
||||||
|
|
||||||
Registered functions in the training config files can refer to built-in
|
Registered functions in the training config files can refer to built-in
|
||||||
implementations, but you can also plug in fully **custom implementations**. All
|
implementations, but you can also plug in fully **custom implementations**. All
|
||||||
|
@ -763,7 +771,96 @@ start = 2
|
||||||
factor = 1.005
|
factor = 1.005
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Example: Custom data reading and batching {#custom-code-readers-batchers}
|
### Defining custom architectures {#custom-architectures}
|
||||||
|
|
||||||
|
Built-in pipeline components such as the tagger or named entity recognizer are
|
||||||
|
constructed with default neural network [models](/api/architectures). You can
|
||||||
|
change the model architecture entirely by implementing your own custom models
|
||||||
|
and providing those in the config when creating the pipeline component. See the
|
||||||
|
documentation on [layers and model architectures](/usage/layers-architectures)
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
> ```ini
|
||||||
|
> ### config.cfg
|
||||||
|
> [components.tagger]
|
||||||
|
> factory = "tagger"
|
||||||
|
>
|
||||||
|
> [components.tagger.model]
|
||||||
|
> @architectures = "custom_neural_network.v1"
|
||||||
|
> output_width = 512
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### functions.py
|
||||||
|
from typing import List
|
||||||
|
from thinc.types import Floats2d
|
||||||
|
from thinc.api import Model
|
||||||
|
import spacy
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
@spacy.registry.architectures("custom_neural_network.v1")
|
||||||
|
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
return create_model(output_width)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data utilities {#data}
|
||||||
|
|
||||||
|
spaCy includes various features and utilities to make it easy to train from your
|
||||||
|
own data. If you have training data in a standard format like `.conll` or
|
||||||
|
`.conllu`, the easiest way to convert it for use with spaCy is to run
|
||||||
|
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy convert ./train.gold.conll ./corpus
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
|
||||||
|
|
||||||
|
Training workflows often consist of multiple steps, from preprocessing the data
|
||||||
|
all the way to packaging and deploying the trained model.
|
||||||
|
[spaCy projects](/usage/projects) let you define all steps in one file, manage
|
||||||
|
data assets, track changes and share your end-to-end processes with your team.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Working with corpora {#data-corpora}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora]
|
||||||
|
>
|
||||||
|
> [corpora.train]
|
||||||
|
> @readers = "spacy.Corpus.v1"
|
||||||
|
> path = ${paths.train}
|
||||||
|
> gold_preproc = false
|
||||||
|
> max_length = 0
|
||||||
|
> limit = 0
|
||||||
|
> augmenter = null
|
||||||
|
>
|
||||||
|
> [training]
|
||||||
|
> train_corpus = "corpora.train"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
|
||||||
|
you define **data resources** to use for training, evaluation, pretraining or
|
||||||
|
any other custom workflows. `corpora.train` and `corpora.dev` are used as
|
||||||
|
conventions within spaCy's default configs, but you can also define any other
|
||||||
|
custom blocks. Each section in the corpora config should resolve to a
|
||||||
|
[`Corpus`](/api/corpus) – for example, using spaCy's built-in
|
||||||
|
[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
|
||||||
|
file. The `train_corpus` and `dev_corpus` fields in the
|
||||||
|
[`[training]`](/api/data-formats#config-training) block specify where to find
|
||||||
|
the corpus in your config. This makes it easy to **swap out** different corpora
|
||||||
|
by only changing a single config setting.
|
||||||
|
|
||||||
|
Instead of making `[corpora]` a block with multiple subsections for each portion
|
||||||
|
of the data, you can also use a single function that returns a dictionary of
|
||||||
|
corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
|
||||||
|
especially useful if you need to split a single file into corpora for training
|
||||||
|
and evaluation, without loading the same file twice.
|
||||||
|
|
||||||
|
### Custom data reading and batching {#custom-code-readers-batchers}
|
||||||
|
|
||||||
Some use-cases require **streaming in data** or manipulating datasets on the
|
Some use-cases require **streaming in data** or manipulating datasets on the
|
||||||
fly, rather than generating all data beforehand and storing it to file. Instead
|
fly, rather than generating all data beforehand and storing it to file. Instead
|
||||||
|
@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
|
||||||
return create_filtered_batches
|
return create_filtered_batches
|
||||||
```
|
```
|
||||||
|
|
||||||
### Defining custom architectures {#custom-architectures}
|
<!-- TODO:
|
||||||
|
* Custom corpus class
|
||||||
Built-in pipeline components such as the tagger or named entity recognizer are
|
* Minibatching
|
||||||
constructed with default neural network [models](/api/architectures). You can
|
* Data augmentation
|
||||||
change the model architecture entirely by implementing your own custom models
|
-->
|
||||||
and providing those in the config when creating the pipeline component. See the
|
|
||||||
documentation on [layers and model architectures](/usage/layers-architectures)
|
|
||||||
for more details.
|
|
||||||
|
|
||||||
> ```ini
|
|
||||||
> ### config.cfg
|
|
||||||
> [components.tagger]
|
|
||||||
> factory = "tagger"
|
|
||||||
>
|
|
||||||
> [components.tagger.model]
|
|
||||||
> @architectures = "custom_neural_network.v1"
|
|
||||||
> output_width = 512
|
|
||||||
> ```
|
|
||||||
|
|
||||||
```python
|
|
||||||
### functions.py
|
|
||||||
from typing import List
|
|
||||||
from thinc.types import Floats2d
|
|
||||||
from thinc.api import Model
|
|
||||||
import spacy
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
@spacy.registry.architectures("custom_neural_network.v1")
|
|
||||||
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|
||||||
return create_model(output_width)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Parallel & distributed training with Ray {#parallel-training}
|
## Parallel & distributed training with Ray {#parallel-training}
|
||||||
|
|
||||||
|
|
|
@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
|
||||||
and track changes. You can use the
|
and track changes. You can use the
|
||||||
[quickstart widget](/usage/training#quickstart) or the `init config` command to
|
[quickstart widget](/usage/training#quickstart) or the `init config` command to
|
||||||
get started. Instead of providing lots of arguments on the command line, you
|
get started. Instead of providing lots of arguments on the command line, you
|
||||||
only need to pass your `config.cfg` file to `spacy train`.
|
only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
Training config files include all **settings and hyperparameters** for training
|
Training config files include all **settings and hyperparameters** for training
|
||||||
your pipeline. Some settings can also be registered **functions** that you can
|
your pipeline. Some settings can also be registered **functions** that you can
|
||||||
swap out and customize, making it easy to implement your own custom models and
|
swap out and customize, making it easy to implement your own custom models and
|
||||||
architectures.
|
architectures.
|
||||||
|
|
||||||
|
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
|
||||||
|
|
||||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||||
|
|
||||||
- **Usage:** [Training pipelines and models](/usage/training)
|
- **Usage:** [Training pipelines and models](/usage/training)
|
||||||
|
@ -723,7 +724,7 @@ nlp = spacy.blank("en")
|
||||||
|
|
||||||
Because pipeline components are now added using their string names, you won't
|
Because pipeline components are now added using their string names, you won't
|
||||||
have to instantiate the [component classes](/api/#architecture-pipeline)
|
have to instantiate the [component classes](/api/#architecture-pipeline)
|
||||||
directly anynore. To configure the component, you can now use the `config`
|
directly anymore. To configure the component, you can now use the `config`
|
||||||
argument on [`nlp.add_pipe`](/api/language#add_pipe).
|
argument on [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
> #### config.cfg (excerpt)
|
> #### config.cfg (excerpt)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user