Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)
Merge pull request #6023 from explosion/ux/model-terminology-consistency [ci skip]
This commit is contained in: commit 896caf45e3
@@ -24,7 +24,7 @@ redirects = [
     {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
     {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
     {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
-    {from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
+    {from = "/docs/usage/training-ner", to = "/usage/training", force = true},
     {from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
     {from = "/docs/usage/data-model", to = "/api", force = true},
     {from = "/docs/usage/cli", to = "/api/cli", force = true},
@@ -29,9 +29,9 @@ from .project.document import project_document # noqa: F401
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 def link(*args, **kwargs):
-    """As of spaCy v3.0, model symlinks are deprecated. You can load models
-    using their full names or from a directory path."""
+    """As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
+    pipeline packages using their full names or from a directory path."""
     msg.warn(
-        "As of spaCy v3.0, model symlinks are deprecated. You can load models "
-        "using their full names or from a directory path."
+        "As of spaCy v3.0, model symlinks are deprecated. You can load trained "
+        "pipeline packages using their full names or from a directory path."
     )
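The practical upshot of the deprecation above: pipelines are loaded by package name or directory path, never by shortcut link. A minimal sketch of the two supported patterns (assuming the `en_core_web_sm` package is installed and the path exists):

```python
import spacy

# Load a trained pipeline package installed via pip, by its full name.
nlp = spacy.load("en_core_web_sm")

# Or load from a directory path, e.g. a pipeline saved with nlp.to_disk().
nlp = spacy.load("/path/to/en_core_web_sm")
```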
@@ -36,7 +36,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
-INIT_HELP = """Commands for initializing configs and models."""
+INIT_HELP = """Commands for initializing configs and pipeline packages."""

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -44,7 +44,7 @@ def convert_cli(
     file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
     n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
     seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
-    model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
+    model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
     morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
     merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
     converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
@@ -18,7 +18,7 @@ from .. import util
 NEW_LABEL_THRESHOLD = 50
 # Minimum number of expected occurrences of dependency labels
 DEP_LABEL_THRESHOLD = 20
-# Minimum number of expected examples to train a blank model
+# Minimum number of expected examples to train a new pipeline
 BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000
@@ -148,7 +148,7 @@ def debug_data(
     msg.text(f"Language: {config['nlp']['lang']}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
-        msg.text(f"Components from other models: {', '.join(resume_components)}")
+        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
     if frozen_components:
         msg.text(f"Frozen components: {', '.join(frozen_components)}")
     msg.text(f"{len(train_dataset)} training docs")
@@ -164,9 +164,7 @@ def debug_data(
     # TODO: make this feedback more fine-grained and report on updated
     # components vs. blank components
     if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
-        text = (
-            f"Low number of examples to train from a blank model ({len(train_dataset)})"
-        )
+        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
         if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
             msg.fail(text)
         else:
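To make the gating concrete, here is a toy re-run of the threshold logic above; the `else` branch is cut off by the hunk, so the warning there is an assumption:

```python
BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000

n_examples = 150  # pretend training-set size
if n_examples < BLANK_MODEL_THRESHOLD:
    text = f"Low number of examples to train a new pipeline ({n_examples})"
    if n_examples < BLANK_MODEL_MIN_THRESHOLD:
        print("FAIL:", text)  # hard failure below the minimum
    else:
        print("WARN:", text)  # assumed: the truncated else branch warns
```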
@@ -214,7 +212,7 @@ def debug_data(
             show=verbose,
         )
     else:
-        msg.info("No word vectors present in the model")
+        msg.info("No word vectors present in the package")

     if "ner" in factory_names:
         # Get all unique NER labels present in the data
@@ -17,16 +17,19 @@ from ..errors import OLD_MODEL_SHORTCUTS
 def download_cli(
     # fmt: off
     ctx: typer.Context,
-    model: str = Arg(..., help="Name of model to download"),
+    model: str = Arg(..., help="Name of pipeline package to download"),
     direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
     # fmt: on
 ):
     """
-    Download compatible model from default download path using pip. If --direct
-    flag is set, the command expects the full model name with version.
-    For direct downloads, the compatibility check will be skipped. All
+    Download compatible trained pipeline from the default download path using
+    pip. If --direct flag is set, the command expects the full package name with
+    version. For direct downloads, the compatibility check will be skipped. All
     additional arguments provided to this command will be passed to `pip install`
-    on model installation.
+    on package installation.
+
+    DOCS: https://spacy.io/api/cli#download
+    AVAILABLE PACKAGES: https://spacy.io/models
     """
     download(model, direct, *ctx.args)
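Because `download` (shown in the next hunk) is a plain function re-exported from `spacy.cli`, the same flow can be scripted; a minimal sketch, assuming the package name is valid for your spaCy version:

```python
from spacy.cli import download

# Equivalent of `python -m spacy download en_core_web_sm`: resolves the best
# compatible version from the compatibility table and pip-installs it.
download("en_core_web_sm")

# Equivalent of the --direct flag: full package name with pinned version,
# compatibility check skipped.
download("en_core_web_sm-2.2.0", True)
```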
@@ -34,11 +37,11 @@ def download_cli(
 def download(model: str, direct: bool = False, *pip_args) -> None:
     if not is_package("spacy") and "--no-deps" not in pip_args:
         msg.warn(
-            "Skipping model package dependencies and setting `--no-deps`. "
+            "Skipping pipeline package dependencies and setting `--no-deps`. "
             "You don't seem to have the spaCy package itself installed "
             "(maybe because you've built from source?), so installing the "
-            "model dependencies would cause spaCy to be downloaded, which "
-            "probably isn't what you want. If the model package has other "
+            "package dependencies would cause spaCy to be downloaded, which "
+            "probably isn't what you want. If the pipeline package has other "
             "dependencies, you'll have to install them manually."
         )
         pip_args = pip_args + ("--no-deps",)
@@ -53,7 +56,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     if model in OLD_MODEL_SHORTCUTS:
         msg.warn(
             f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-            f"use the full model name '{OLD_MODEL_SHORTCUTS[model]}' instead."
+            f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
         )
         model_name = OLD_MODEL_SHORTCUTS[model]
     compatibility = get_compatibility()
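For context, `OLD_MODEL_SHORTCUTS` (imported from `spacy.errors`, per the first hunk header of this file) maps retired v2-style shortcuts to full package names. A hypothetical excerpt to show the resolution step:

```python
# Hypothetical excerpt of the real table in spacy/errors.py.
OLD_MODEL_SHORTCUTS = {"en": "en_core_web_sm", "de": "de_core_news_sm"}

model = "en"
if model in OLD_MODEL_SHORTCUTS:
    # The CLI warns, then transparently resolves the full package name.
    model_name = OLD_MODEL_SHORTCUTS[model]  # -> "en_core_web_sm"
```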
@@ -61,7 +64,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     download_model(dl_tpl.format(m=model_name, v=version), pip_args)
     msg.good(
         "Download and installation successful",
-        f"You can now load the model via spacy.load('{model_name}')",
+        f"You can now load the package via spacy.load('{model_name}')",
     )
@@ -71,7 +74,7 @@ def get_compatibility() -> dict:
     if r.status_code != 200:
         msg.fail(
             f"Server error ({r.status_code})",
-            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
+            f"Couldn't fetch compatibility table. Please find a package for your spaCy "
             f"installation (v{about.__version__}), and download it manually. "
             f"For more details, see the documentation: "
             f"https://spacy.io/usage/models",
@@ -80,7 +83,7 @@ def get_compatibility() -> dict:
     comp_table = r.json()
     comp = comp_table["spacy"]
     if version not in comp:
-        msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
+        msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
     return comp[version]
@@ -88,7 +91,7 @@ def get_version(model: str, comp: dict) -> str:
     model = get_base_version(model)
     if model not in comp:
         msg.fail(
-            f"No compatible model found for '{model}' (spaCy v{about.__version__})",
+            f"No compatible package found for '{model}' (spaCy v{about.__version__})",
             exits=1,
         )
     return comp[model][0]
@@ -26,8 +26,8 @@ def evaluate_cli(
     # fmt: on
 ):
     """
-    Evaluate a model. Expects a loadable spaCy model and evaluation data in the
-    binary .spacy format. The --gold-preproc option sets up the evaluation
+    Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
+    data in the binary .spacy format. The --gold-preproc option sets up the evaluation
     examples with gold-standard sentences and tokens for the predictions. Gold
     preprocessing helps the annotations align to the tokenization, and may
     result in sequences of more consistent length. However, it may reduce
@@ -12,14 +12,14 @@ from .. import about
 @app.command("info")
 def info_cli(
     # fmt: off
-    model: Optional[str] = Arg(None, help="Optional model name"),
+    model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
     markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
     silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
     # fmt: on
 ):
     """
-    Print info about spaCy installation. If a model is specified as an argument,
-    print model information. Flag --markdown prints details in Markdown for easy
+    Print info about spaCy installation. If a pipeline is specified as an argument,
+    print its meta information. Flag --markdown prints details in Markdown for easy
     copy-pasting to GitHub issues.
     """
     info(model, markdown=markdown, silent=silent)
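`info` is also callable from Python (its signature appears in the next hunk); a small sketch, where the pipeline argument assumes `en_core_web_sm` is installed:

```python
from spacy.cli.info import info

# Data about the spaCy installation itself; silent=True suppresses printing.
data = info(silent=True)

# Meta information about one trained pipeline, rendered as Markdown
# for pasting into a GitHub issue.
md = info("en_core_web_sm", markdown=True, silent=True)
```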
@@ -30,14 +30,16 @@ def info(
 ) -> Union[str, dict]:
     msg = Printer(no_print=silent, pretty=not silent)
     if model:
-        title = f"Info about model '{model}'"
+        title = f"Info about pipeline '{model}'"
         data = info_model(model, silent=silent)
     else:
         title = "Info about spaCy"
         data = info_spacy()
     raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
-    if "Models" in data and isinstance(data["Models"], dict):
-        data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
+    if "Pipelines" in data and isinstance(data["Pipelines"], dict):
+        data["Pipelines"] = ", ".join(
+            f"{n} ({v})" for n, v in data["Pipelines"].items()
+        )
     markdown_data = get_markdown(data, title=title)
     if markdown:
         if not silent:
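The reflowed join is behavior-identical; a quick illustration of the string it builds, with made-up versions:

```python
pipelines = {"en_core_web_sm": "3.0.0", "de_core_news_sm": "3.0.0"}
formatted = ", ".join(f"{n} ({v})" for n, v in pipelines.items())
print(formatted)  # en_core_web_sm (3.0.0), de_core_news_sm (3.0.0)
```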
@@ -63,7 +65,7 @@ def info_spacy() -> Dict[str, Any]:
         "Location": str(Path(__file__).parent.parent),
         "Platform": platform.platform(),
         "Python version": platform.python_version(),
-        "Models": all_models,
+        "Pipelines": all_models,
     }
@@ -81,7 +83,7 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         model_path = model
     meta_path = model_path / "meta.json"
     if not meta_path.is_file():
-        msg.fail("Can't find model meta.json", meta_path, exits=1)
+        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     if model_path.resolve() != model_path:
         meta["source"] = str(model_path.resolve())
@@ -27,7 +27,7 @@ def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
     lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
-    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
+    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     # fmt: on
@@ -168,7 +168,7 @@ def save_config(
     output_file.parent.mkdir(parents=True)
     config.to_disk(output_file, interpolate=False)
     msg.good("Saved config", output_file)
-    msg.text("You can now add your data and train your model:")
+    msg.text("You can now add your data and train your pipeline:")
     variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
     if not no_print:
         print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
@@ -28,7 +28,7 @@ except ImportError:
 DEFAULT_OOV_PROB = -20


-@init_cli.command("model")
+@init_cli.command("vectors")
 @app.command(
     "init-model",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
def init_model_cli(
|
def init_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
lang: str = Arg(..., help="Model language"),
|
lang: str = Arg(..., help="Pipeline language"),
|
||||||
output_dir: Path = Arg(..., help="Model output directory"),
|
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||||
|
@@ -46,19 +46,20 @@ def init_model_cli(
     prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
     truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
-    base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
+    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
+    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
     # fmt: on
 ):
     """
-    Create a new model from raw data. If vectors are provided in Word2Vec format,
-    they can be either a .txt or zipped as a .zip or .tar.gz.
+    Create a new blank pipeline directory with vocab and vectors from raw data.
+    If vectors are provided in Word2Vec format, they can be either a .txt or
+    zipped as a .zip or .tar.gz.
     """
     if ctx.command.name == "init-model":
         msg.warn(
-            "The init-model command is now available via the 'init model' "
-            "subcommand (without the hyphen). You can run python -m spacy init "
-            "--help for an overview of the other available initialization commands."
+            "The init-model command is now called 'init vocab'. You can run "
+            "'python -m spacy init --help' for an overview of the other "
+            "available initialization commands."
         )
     init_model(
         lang,
@@ -115,10 +116,10 @@ def init_model(
         msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
     lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)

-    with msg.loading("Creating model..."):
+    with msg.loading("Creating blank pipeline..."):
         nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

-    msg.good("Successfully created model")
+    msg.good("Successfully created blank pipeline")
     if vectors_loc is not None:
         add_vectors(
             msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
@@ -242,7 +243,8 @@ def add_vectors(
     if vectors_data is not None:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if name is None:
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
+        # TODO: Is this correct? Does this matter?
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
     else:
         nlp.vocab.vectors.name = name
     nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
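Spelling out the renamed fallback: the vectors name is now derived from the pipeline's own meta `name` instead of the literal `model`. A sketch with assumed meta values:

```python
meta = {"lang": "en", "name": "pipeline"}  # assumed meta values

old_fallback = f"{meta['lang']}_model.vectors"           # "en_model.vectors"
new_fallback = f"{meta['lang']}_{meta['name']}.vectors"  # "en_pipeline.vectors"
```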
@@ -14,19 +14,19 @@ from .. import about
 @app.command("package")
 def package_cli(
     # fmt: off
-    input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
+    input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
-    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
+    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
     # fmt: on
 ):
     """
-    Generate an installable Python package for a model. Includes model data,
+    Generate an installable Python package for a pipeline. Includes binary data,
     meta and required installation files. A new directory will be created in the
-    specified output directory, and model data will be copied over. If
+    specified output directory, and the data will be copied over. If
     --create-meta is set and a meta.json already exists in the output directory,
     the existing values will be used as the defaults in the command-line prompt.
     After packaging, "python setup.py sdist" is run in the package directory,
@@ -59,14 +59,14 @@ def package(
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
-        msg.fail("Can't locate model data", input_path, exits=1)
+        msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():
         msg.fail("Output directory not found", output_path, exits=1)
     if meta_path and not meta_path.exists():
-        msg.fail("Can't find model meta.json", meta_path, exits=1)
+        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
     meta_path = meta_path or input_dir / "meta.json"
     if not meta_path.exists() or not meta_path.is_file():
-        msg.fail("Can't load model meta.json", meta_path, exits=1)
+        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     meta = get_meta(input_dir, meta)
     if version is not None:
@@ -77,7 +77,7 @@ def package(
         meta = generate_meta(meta, msg)
     errors = validate(ModelMetaSchema, meta)
     if errors:
-        msg.fail("Invalid model meta.json")
+        msg.fail("Invalid pipeline meta.json")
         print("\n".join(errors))
         sys.exit(1)
     model_name = meta["lang"] + "_" + meta["name"]
@@ -118,7 +118,7 @@ def get_meta(
 ) -> Dict[str, Any]:
     meta = {
         "lang": "en",
-        "name": "model",
+        "name": "pipeline",
         "version": "0.0.0",
         "description": "",
         "author": "",
@@ -143,10 +143,10 @@ def get_meta(
 def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
     meta = existing_meta or {}
     settings = [
-        ("lang", "Model language", meta.get("lang", "en")),
-        ("name", "Model name", meta.get("name", "model")),
-        ("version", "Model version", meta.get("version", "0.0.0")),
-        ("description", "Model description", meta.get("description", None)),
+        ("lang", "Pipeline language", meta.get("lang", "en")),
+        ("name", "Pipeline name", meta.get("name", "pipeline")),
+        ("version", "Package version", meta.get("version", "0.0.0")),
+        ("description", "Package description", meta.get("description", None)),
         ("author", "Author", meta.get("author", None)),
         ("email", "Author email", meta.get("email", None)),
         ("url", "Author website", meta.get("url", None)),
@@ -154,8 +154,8 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
     ]
     msg.divider("Generating meta.json")
     msg.text(
-        "Enter the package settings for your model. The following information "
-        "will be read from your model data: pipeline, vectors."
+        "Enter the package settings for your pipeline. The following information "
+        "will be read from your pipeline data: pipeline, vectors."
     )
     for setting, desc, default in settings:
         response = get_raw_input(desc, default)
@@ -31,7 +31,7 @@ def pretrain_cli(
     # fmt: off
     ctx: typer.Context, # This is only used to read additional arguments
     texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
-    output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
+    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
@@ -376,10 +376,9 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
     if output_dir.exists() and [p for p in output_dir.iterdir()]:
         if resume_path:
             msg.warn(
-                "Output directory is not empty. ",
-                "If you're resuming a run from a previous model in this directory, "
-                "the old models for the consecutive epochs will be overwritten "
-                "with the new ones.",
+                "Output directory is not empty.",
+                "If you're resuming a run in this directory, the old weights "
+                "for the consecutive epochs will be overwritten with the new ones.",
             )
         else:
             msg.warn(
@@ -19,7 +19,7 @@ from ..util import load_model
 def profile_cli(
     # fmt: off
     ctx: typer.Context, # This is only used to read current calling context
-    model: str = Arg(..., help="Model to load"),
+    model: str = Arg(..., help="Trained pipeline to load"),
     inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
     n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
     # fmt: on
@@ -60,9 +60,9 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
         inputs, _ = zip(*imdb_train)
         msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
         inputs = inputs[:n_inputs]
-    with msg.loading(f"Loading model '{model}'..."):
+    with msg.loading(f"Loading pipeline '{model}'..."):
         nlp = load_model(model)
-    msg.good(f"Loaded model '{model}'")
+    msg.good(f"Loaded pipeline '{model}'")
     texts = list(itertools.islice(inputs, n_texts))
     cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
     s = pstats.Stats("Profile.prof")
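The profiling pattern above is plain stdlib `cProfile`/`pstats`; a self-contained sketch of the same approach on a stand-in workload:

```python
import cProfile
import pstats

def parse_texts(texts):
    # Stand-in for the real workload, e.g. list(nlp.pipe(texts)).
    return [t.upper() for t in texts]

texts = ["example text"] * 1000
cProfile.runctx("parse_texts(texts)", globals(), locals(), "Profile.prof")
stats = pstats.Stats("Profile.prof")
stats.strip_dirs().sort_stats("time").print_stats(10)  # top 10 by own time
```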
@@ -26,7 +26,7 @@ def train_cli(
     # fmt: off
     ctx: typer.Context, # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
-    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
+    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -34,7 +34,7 @@ def train_cli(
     # fmt: on
 ):
     """
-    Train or update a spaCy model. Requires data in spaCy's binary format. To
+    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
     convert data from other formats, use the `spacy convert` command. The
     config file includes all settings and hyperparameters used during training.
     To override settings in the config, e.g. settings that point to local
@@ -113,12 +113,12 @@ def train(
         # Load morph rules
         nlp.vocab.morphology.load_morph_exceptions(morph_rules)

-    # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
         tok2vec_path = config["pretraining"].get("tok2vec_model", None)
         if tok2vec_path is None:
             msg.fail(
-                f"To use a pretrained tok2vec model, the config needs to specify which "
+                f"To use pretrained tok2vec weights, the config needs to specify which "
                 f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
                 exits=1,
             )
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
else:
|
else:
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
msg.good(f"Saved model to output directory {final_model_path}")
|
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
def create_train_batches(iterator, batcher, max_epochs: int):
|
||||||
|
|
|
@@ -13,9 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
 @app.command("validate")
 def validate_cli():
     """
-    Validate the currently installed models and spaCy version. Checks if the
-    installed models are compatible and shows upgrade instructions if available.
-    Should be run after `pip install -U spacy`.
+    Validate the currently installed pipeline packages and spaCy version. Checks
+    if the installed packages are compatible and shows upgrade instructions if
+    available. Should be run after `pip install -U spacy`.
     """
     validate()
@@ -25,13 +25,13 @@ def validate() -> None:
     spacy_version = get_base_version(about.__version__)
     current_compat = compat.get(spacy_version, {})
     if not current_compat:
-        msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
+        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
     incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
     na_models = [m for m in incompat_models if m not in current_compat]
     update_models = [m for m in incompat_models if m in current_compat]
     spacy_dir = Path(__file__).parent.parent

-    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
     msg.info(f"spaCy installation: {spacy_dir}")

     if model_pkgs:
@@ -47,15 +47,15 @@ def validate() -> None:
             rows.append((data["name"], data["spacy"], version, comp))
         msg.table(rows, header=header)
     else:
-        msg.text("No models found in your current environment.", exits=0)
+        msg.text("No pipeline packages found in your current environment.", exits=0)
     if update_models:
         msg.divider("Install updates")
-        msg.text("Use the following commands to update the model packages:")
+        msg.text("Use the following commands to update the packages:")
         cmd = "python -m spacy download {}"
         print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
     if na_models:
         msg.info(
-            f"The following models are custom spaCy models or not "
+            f"The following packages are custom spaCy pipelines or not "
             f"available for spaCy v{about.__version__}:",
             ", ".join(na_models),
         )
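The surrounding logic partitions incompatible packages into two buckets: those with a compatible release to update to, and custom or unavailable ones. A self-contained sketch with toy data:

```python
# Toy stand-ins for the compatibility table and the incompatible packages.
current_compat = {"en_core_web_sm": ["3.0.0"]}
incompat_models = {"en_core_web_sm", "my_custom_pipeline"}

# Packages that have a compatible release available for this spaCy version:
update_models = [m for m in incompat_models if m in current_compat]
# Custom packages, or ones with no release for this version:
na_models = [m for m in incompat_models if m not in current_compat]
```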
@@ -192,7 +192,7 @@ class Language:
             self._meta.setdefault("lang", self.vocab.lang)
         else:
             self._meta.setdefault("lang", self.lang)
-        self._meta.setdefault("name", "model")
+        self._meta.setdefault("name", "pipeline")
         self._meta.setdefault("version", "0.0.0")
         self._meta.setdefault("spacy_version", spacy_version)
         self._meta.setdefault("description", "")
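`setdefault` only fills in keys that are missing, so existing package metas keep their names; a quick illustration of why the renamed default is backwards-compatible:

```python
meta = {"name": "core_web_sm"}       # meta loaded from an existing package
meta.setdefault("name", "pipeline")  # no-op: the key is already present
meta.setdefault("version", "0.0.0")  # fills the missing default
print(meta)  # {'name': 'core_web_sm', 'version': '0.0.0'}
```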
@@ -1,6 +1,6 @@
 ---
 title: Command Line Interface
-teaser: Download, train and package models, and debug spaCy
+teaser: Download, train and package pipelines, and debug spaCy
 source: spacy/cli
 menu:
   - ['download', 'download']
@@ -17,45 +17,47 @@ menu:
 ---

 spaCy's CLI provides a range of helpful commands for downloading and training
-models, converting data and debugging your config, data and installation. For a
-list of available commands, you can type `python -m spacy --help`. You can also
-add the `--help` flag to any command or subcommand to see the description,
+pipelines, converting data and debugging your config, data and installation. For
+a list of available commands, you can type `python -m spacy --help`. You can
+also add the `--help` flag to any command or subcommand to see the description,
 available arguments and usage.

 ## download {#download tag="command"}

-Download [models](/usage/models) for spaCy. The downloader finds the
-best-matching compatible version and uses `pip install` to download the model as
-a package. Direct downloads don't perform any compatibility checks and require
-the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
+Download [trained pipelines](/usage/models) for spaCy. The downloader finds the
+best-matching compatible version and uses `pip install` to download the Python
+package. Direct downloads don't perform any compatibility checks and require the
+pipeline name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).

 > #### Downloading best practices
 >
 > The `download` command is mostly intended as a convenient, interactive wrapper
 > – it performs compatibility checks and prints detailed messages in case things
 > go wrong. It's **not recommended** to use this command as part of an automated
-> process. If you know which model your project needs, you should consider a
-> [direct download via pip](/usage/models#download-pip), or uploading the model
-> to a local PyPi installation and fetching it straight from there. This will
-> also allow you to add it as a versioned package dependency to your project.
+> process. If you know which package your project needs, you should consider a
+> [direct download via pip](/usage/models#download-pip), or uploading the
+> package to a local PyPi installation and fetching it straight from there. This
+> will also allow you to add it as a versioned package dependency to your
+> project.

 ```cli
 $ python -m spacy download [model] [--direct] [pip_args]
 ```

 | Name | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
-| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ |
+| ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | Pipeline package name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
+| `--direct`, `-d` | Force direct download of exact package version. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ |
-| **CREATES** | The installed model package in your `site-packages` directory. |
+| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the pipeline package. For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ |
+| **CREATES** | The installed pipeline package in your `site-packages` directory. |

 ## info {#info tag="command"}

-Print information about your spaCy installation, models and local setup, and
-generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to
-copy-paste into [GitHub issues](https://github.com/explosion/spaCy/issues).
+Print information about your spaCy installation, trained pipelines and local
+setup, and generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted
+markup to copy-paste into
+[GitHub issues](https://github.com/explosion/spaCy/issues).

 ```cli
 $ python -m spacy info [--markdown] [--silent]
@@ -65,41 +67,41 @@ $ python -m spacy info [--markdown] [--silent]
 $ python -m spacy info [model] [--markdown] [--silent]
 ```

 | Name | Description |
-| ------------------------------------------------ | ------------------------------------------------------------------------------ |
-| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
+| ------------------------------------------------ | ----------------------------------------------------------------------------------------- |
+| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
 | `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
 | `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **PRINTS** | Information about your spaCy installation. |

 ## validate {#validate new="2" tag="command"}

-Find all models installed in the current environment and check whether they are
-compatible with the currently installed version of spaCy. Should be run after
-upgrading spaCy via `pip install -U spacy` to ensure that all installed models
-can be used with the new version. It will show a list of models and their
-installed versions. If any model is out of date, the latest compatible versions
-and command for updating are shown.
+Find all trained pipeline packages installed in the current environment and
+check whether they are compatible with the currently installed version of spaCy.
+Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
+all installed packages can be used with the new version. It will show a list
+of packages and their installed versions. If any package is out of date, the
+latest compatible versions and command for updating are shown.

 > #### Automated validation
 >
 > You can also use the `validate` command as part of your build process or test
-> suite, to ensure all models are up to date before proceeding. If incompatible
-> models are found, it will return `1`.
+> suite, to ensure all packages are up to date before proceeding. If
+> incompatible packages are found, it will return `1`.

 ```cli
 $ python -m spacy validate
 ```

 | Name | Description |
-| ---------- | --------------------------------------------------------- |
-| **PRINTS** | Details about the compatibility of your installed models. |
+| ---------- | -------------------------------------------------------------------- |
+| **PRINTS** | Details about the compatibility of your installed pipeline packages. |

 ## init {#init new="3"}

 The `spacy init` CLI includes helpful commands for initializing training config
-files and model directories.
+files and pipeline directories.

 ### init config {#init-config new="3" tag="command"}
@@ -125,7 +127,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
 | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
 | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
-| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
+| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
 | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
 | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
@ -165,36 +167,38 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | Complete and auto-filled config file for training. |
|
| **CREATES** | Complete and auto-filled config file for training. |
|
||||||
|
|
||||||
### init model {#init-model new="2" tag="command"}
|
### init vocab {#init-vocab new="3" tag="command"}
|
||||||
|
|
||||||
Create a new model directory from raw data, like word frequencies, Brown
|
Create a blank pipeline directory from raw data, like word frequencies, Brown
|
||||||
clusters and word vectors. Note that in order to populate the model's vocab, you
|
clusters and word vectors. Note that in order to populate the vocabulary, you
|
||||||
need to pass in a JSONL-formatted
|
need to pass in a JSONL-formatted
|
||||||
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
|
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
|
||||||
`id` values that correspond to the vectors table. Just loading in vectors will
|
`id` values that correspond to the vectors table. Just loading in vectors will
|
||||||
not automatically populate the vocab.
|
not automatically populate the vocab.
|
||||||
|
|
||||||
<Infobox title="New in v3.0" variant="warning">
|
<Infobox title="New in v3.0" variant="warning" id="init-model">
|
||||||
|
|
||||||
The `init-model` command is now available as a subcommand of `spacy init`.
|
This command was previously called `init-model`.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors]
|
$ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||||
| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||||
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
||||||
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
|
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
|
||||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||||
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||||
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ |
|
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||||
|
| `--meta-name`, `-mn` | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~ |
|
||||||
|
| `--base`, `-b` | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | A spaCy model containing the vocab and vectors. |
|
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||||
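The JSONL file passed via `--jsonl-loc` can be written with a few lines of Python. A minimal sketch with made-up probabilities; the exact attributes are described in the [vocabulary file](/api/data-formats#vocab-jsonl) docs:

```python
# Minimal sketch: write a JSONL vocabulary file for `spacy init vocab`.
# The first line holds the language and vocab settings, each following
# line describes one lexeme. All values here are made up.
import json

entries = [
    {"lang": "en", "settings": {"oov_prob": -20.5}},
    {"orth": "the", "id": 1, "prob": -3.5},
    {"orth": "of", "id": 2, "prob": -4.2},
]
with open("vocab_data.jsonl", "w", encoding="utf8") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```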
|
|
||||||
## convert {#convert tag="command"}
|
## convert {#convert tag="command"}
|
||||||
|
|
||||||
|
@ -205,7 +209,7 @@ management functions. The converter can be specified on the command line, or
|
||||||
chosen based on the file extension of the input file.
|
chosen based on the file extension of the input file.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
|
$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--base] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -216,7 +220,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
|
||||||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
|
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
|
||||||
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
|
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
|
||||||
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
|
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
|
||||||
| `--model`, `-b` <Tag variant="new">2.2</Tag> | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str] \(option)~~ |
|
| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str] \(option)~~ |
|
||||||
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
|
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
|
||||||
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path] \(option)~~ |
|
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path] \(option)~~ |
|
||||||
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
|
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
|
||||||
|
@ -594,11 +598,11 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | ---------------------------------------------------------------------------------- |
|
| ----------------- | ---------------------------------------------------------------------------------- |
|
||||||
| `model` | A loadable spaCy model. ~~str (positional)~~ |
|
| `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
|
||||||
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
||||||
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
|
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **PRINTS** | Profiling information for the model. |
|
| **PRINTS** | Profiling information for the pipeline. |
|
||||||
|
|
||||||
### debug model {#debug-model new="3" tag="command"}
|
### debug model {#debug-model new="3" tag="command"}
|
||||||
|
|
||||||
|
@ -724,10 +728,10 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
||||||
|
|
||||||
## train {#train tag="command"}
|
## train {#train tag="command"}
|
||||||
|
|
||||||
Train a model. Expects data in spaCy's
|
Train a pipeline. Expects data in spaCy's
|
||||||
[binary format](/api/data-formats#training) and a
|
[binary format](/api/data-formats#training) and a
|
||||||
[config file](/api/data-formats#config) with all settings and hyperparameters.
|
[config file](/api/data-formats#config) with all settings and hyperparameters.
|
||||||
Will save out the best model from all epochs, as well as the final model. The
|
Will save out the best model from all epochs, as well as the final pipeline. The
|
||||||
`--code` argument can be used to provide a Python file that's imported before
|
`--code` argument can be used to provide a Python file that's imported before
|
||||||
the training process starts. This lets you register
|
the training process starts. This lets you register
|
||||||
[custom functions](/usage/training#custom-functions) and architectures and refer
|
[custom functions](/usage/training#custom-functions) and architectures and refer
|
||||||
|
@ -753,12 +757,12 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||||
| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||||
| **CREATES** | The final model and the best model. |
|
| **CREATES** | The final trained pipeline and the best trained pipeline. |
|
||||||
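The Python file passed via `--code` usually just registers functions so the config can refer to them by name. A minimal sketch, using a hypothetical callback name that a config section such as `[nlp.after_creation]` could reference:

```python
# functions.py - minimal sketch of a file passed via --code.
# "customize_tokenizer.v1" is a hypothetical name for illustration.
import spacy
from spacy.language import Language
from spacy.symbols import ORTH

@spacy.registry.callbacks("customize_tokenizer.v1")
def make_customize_tokenizer():
    def customize_tokenizer(nlp: Language) -> Language:
        # modify the nlp object in place, e.g. add a special-case rule
        nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])
        return nlp
    return customize_tokenizer
```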
|
|
||||||
## pretrain {#pretrain new="2.1" tag="command,experimental"}
|
## pretrain {#pretrain new="2.1" tag="command,experimental"}
|
||||||
|
|
||||||
|
@ -769,7 +773,7 @@ a component like a CNN, BiLSTM, etc to predict vectors which match the
|
||||||
pretrained ones. The weights are saved to a directory after each epoch. You can
|
pretrained ones. The weights are saved to a directory after each epoch. You can
|
||||||
then include a **path to one of these pretrained weights files** in your
|
then include a **path to one of these pretrained weights files** in your
|
||||||
[training config](/usage/training#config) as the `init_tok2vec` setting when you
|
[training config](/usage/training#config) as the `init_tok2vec` setting when you
|
||||||
train your model. This technique may be especially helpful if you have little
|
train your pipeline. This technique may be especially helpful if you have little
|
||||||
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
|
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
|
||||||
for more info.
|
for more info.
|
||||||
|
|
||||||
|
@ -792,7 +796,7 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
|
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
|
||||||
| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ |
|
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||||
|
@ -803,7 +807,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re
|
||||||
|
|
||||||
## evaluate {#evaluate new="2" tag="command"}
|
## evaluate {#evaluate new="2" tag="command"}
|
||||||
|
|
||||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
|
||||||
|
path) and evaluation data in the
|
||||||
[binary `.spacy` format](/api/data-formats#binary-training). The
|
[binary `.spacy` format](/api/data-formats#binary-training). The
|
||||||
`--gold-preproc` option sets up the evaluation examples with gold-standard
|
`--gold-preproc` option sets up the evaluation examples with gold-standard
|
||||||
sentences and tokens for the predictions. Gold preprocessing helps the
|
sentences and tokens for the predictions. Gold preprocessing helps the
|
||||||
|
@ -819,7 +824,7 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ |
|
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||||
|
@ -831,13 +836,12 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
||||||
|
|
||||||
## package {#package tag="command"}
|
## package {#package tag="command"}
|
||||||
|
|
||||||
Generate an installable
|
Generate an installable [Python package](/usage/training#models-generating) from
|
||||||
[model Python package](/usage/training#models-generating) from an existing model
|
an existing pipeline data directory. All data files are copied over. If the path
|
||||||
data directory. All data files are copied over. If the path to a
|
to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is
|
||||||
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
|
found in the input directory, this file is used. Otherwise, the data can be
|
||||||
the input directory, this file is used. Otherwise, the data can be entered
|
entered directly from the command line. spaCy will then create a `.tar.gz`
|
||||||
directly from the command line. spaCy will then create a `.tar.gz` archive file
|
archive file that you can distribute and install with `pip install`.
|
||||||
that you can distribute and install with `pip install`.
|
|
||||||
|
|
||||||
<Infobox title="New in v3.0" variant="warning">
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
|
@ -855,13 +859,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
||||||
>
|
>
|
||||||
> ```cli
|
> ```cli
|
||||||
> $ python -m spacy package /input /output
|
> $ python -m spacy package /input /output
|
||||||
> $ cd /output/en_model-0.0.0
|
> $ cd /output/en_pipeline-0.0.0
|
||||||
> $ pip install dist/en_model-0.0.0.tar.gz
|
> $ pip install dist/en_pipeline-0.0.0.tar.gz
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ |
|
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
|
||||||
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
||||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
||||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
||||||
|
@ -869,13 +873,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
||||||
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
|
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
|
||||||
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
|
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | A Python package containing the spaCy model. |
|
| **CREATES** | A Python package containing the spaCy pipeline. |
|
||||||
|
|
||||||
## project {#project new="3"}
|
## project {#project new="3"}
|
||||||
|
|
||||||
The `spacy project` CLI includes subcommands for working with
|
The `spacy project` CLI includes subcommands for working with
|
||||||
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
||||||
deploying custom spaCy models.
|
deploying custom spaCy pipelines.
|
||||||
|
|
||||||
### project clone {#project-clone tag="command"}
|
### project clone {#project-clone tag="command"}
|
||||||
|
|
||||||
|
@ -1015,9 +1019,9 @@ Download all files or directories listed as `outputs` for commands, unless they
|
||||||
are already present locally. When searching for files in the remote, `pull`
|
are already present locally. When searching for files in the remote, `pull`
|
||||||
won't just look at the output path, but will also consider the **command
|
won't just look at the output path, but will also consider the **command
|
||||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||||
previously pushed a model checkpoint to the remote, but now you've changed some
|
previously pushed a checkpoint to the remote, but now you've changed some
|
||||||
hyper-parameters. Because you've changed the inputs to the command, if you run
|
hyper-parameters. Because you've changed the inputs to the command, if you run
|
||||||
`pull`, you won't retrieve the stale result. If you train your model and push
|
`pull`, you won't retrieve the stale result. If you train your pipeline and push
|
||||||
the outputs to the remote, the outputs will be saved alongside the prior
|
the outputs to the remote, the outputs will be saved alongside the prior
|
||||||
outputs, so if you change the config back, you'll be able to fetch back the
|
outputs, so if you change the config back, you'll be able to fetch back the
|
||||||
result.
|
result.
|
||||||
|
|
|
@ -6,18 +6,18 @@ menu:
|
||||||
- ['Training Data', 'training']
|
- ['Training Data', 'training']
|
||||||
- ['Pretraining Data', 'pretraining']
|
- ['Pretraining Data', 'pretraining']
|
||||||
- ['Vocabulary', 'vocab-jsonl']
|
- ['Vocabulary', 'vocab-jsonl']
|
||||||
- ['Model Meta', 'meta']
|
- ['Pipeline Meta', 'meta']
|
||||||
---
|
---
|
||||||
|
|
||||||
This section documents input and output formats of data used by spaCy, including
|
This section documents input and output formats of data used by spaCy, including
|
||||||
the [training config](/usage/training#config), training data and lexical
|
the [training config](/usage/training#config), training data and lexical
|
||||||
vocabulary data. For an overview of label schemes used by the models, see the
|
vocabulary data. For an overview of label schemes used by the models, see the
|
||||||
[models directory](/models). Each model documents the label schemes used in its
|
[models directory](/models). Each trained pipeline documents the label schemes
|
||||||
components, depending on the data it was trained on.
|
used in its components, depending on the data it was trained on.
|
||||||
|
|
||||||
## Training config {#config new="3"}
|
## Training config {#config new="3"}
|
||||||
|
|
||||||
Config files define the training process and model pipeline and can be passed to
|
Config files define the training process and pipeline and can be passed to
|
||||||
[`spacy train`](/api/cli#train). They use
|
[`spacy train`](/api/cli#train). They use
|
||||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||||
hood. For details on how to use training configs, see the
|
hood. For details on how to use training configs, see the
|
||||||
|
@ -74,16 +74,16 @@ your config and check that it's valid, you can run the
|
||||||
Defines the `nlp` object, its tokenizer and
|
Defines the `nlp` object, its tokenizer and
|
||||||
[processing pipeline](/usage/processing-pipelines) component names.
|
[processing pipeline](/usage/processing-pipelines) component names.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
|
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
|
||||||
|
|
||||||
### components {#config-components tag="section"}
|
### components {#config-components tag="section"}
|
||||||
|
|
||||||
|
@ -105,8 +105,8 @@ This section includes definitions of the
|
||||||
[pipeline components](/usage/processing-pipelines) and their models, if
|
[pipeline components](/usage/processing-pipelines) and their models, if
|
||||||
available. Components in this section can be referenced in the `pipeline` of the
|
available. Components in this section can be referenced in the `pipeline` of the
|
||||||
`[nlp]` block. Component blocks need to specify either a `factory` (named
|
`[nlp]` block. Component blocks need to specify either a `factory` (named
|
||||||
function to use to create component) or a `source` (name or path of pretrained
|
function to use to create component) or a `source` (name or path of trained
|
||||||
model to copy components from). See the docs on
|
pipeline to copy components from). See the docs on
|
||||||
[defining pipeline components](/usage/training#config-components) for details.
|
[defining pipeline components](/usage/training#config-components) for details.
|
||||||
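The same factory-versus-source distinction exists on the Python side. A minimal sketch, using the `en_core_web_sm` package as the source pipeline:

```python
# Minimal sketch: create a component from its registered factory vs.
# sourcing a trained component from another pipeline.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")                  # factory: fresh, untrained component
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)  # source: copy a trained component
print(nlp.pipe_names)                   # ['tagger', 'ner']
```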
|
|
||||||
### paths, system {#config-variables tag="variables"}
|
### paths, system {#config-variables tag="variables"}
|
||||||
|
@ -145,7 +145,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
||||||
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
|
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
|
||||||
|
@ -184,7 +184,7 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
||||||
|
|
||||||
The main data format used in spaCy v3.0 is a **binary format** created by
|
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||||
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
||||||
objects. This means that you can train spaCy models using the same format it
|
objects. This means that you can train spaCy pipelines using the same format it
|
||||||
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
||||||
storage**, especially when packing multiple documents together.
|
storage**, especially when packing multiple documents together.
|
||||||
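On the Python side, creating data in this format comes down to adding annotated `Doc` objects to a `DocBin` and saving it to disk. A minimal sketch with a made-up entity annotation:

```python
# Minimal sketch: serialize annotated Doc objects to the binary
# .spacy training format. The entity annotation here is made up.
import spacy
from spacy.tokens import DocBin, Span

nlp = spacy.blank("en")
doc = nlp("She works at ACME Corp.")
doc.ents = [Span(doc, 3, 5, label="ORG")]  # the tokens "ACME Corp"
doc_bin = DocBin(docs=[doc])
doc_bin.to_disk("./train.spacy")
```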
|
|
||||||
|
@ -286,8 +286,8 @@ a dictionary of gold-standard annotations.
|
||||||
[internal training API](/usage/training#api) and they're expected when you call
|
[internal training API](/usage/training#api) and they're expected when you call
|
||||||
[`nlp.update`](/api/language#update). However, for most use cases, you
|
[`nlp.update`](/api/language#update). However, for most use cases, you
|
||||||
**shouldn't** have to write your own training scripts. It's recommended to train
|
**shouldn't** have to write your own training scripts. It's recommended to train
|
||||||
your models via the [`spacy train`](/api/cli#train) command with a config file
|
your pipelines via the [`spacy train`](/api/cli#train) command with a config
|
||||||
to keep track of your settings and hyperparameters and your own
|
file to keep track of your settings and hyperparameters and your own
|
||||||
[registered functions](/usage/training/#custom-code) to customize the setup.
|
[registered functions](/usage/training/#custom-code) to customize the setup.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
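For reference, constructing an `Example` by hand looks roughly like this. A minimal sketch, assuming the v3 `spacy.training` module path and made-up annotation offsets:

```python
# Minimal sketch: build an Example from a predicted Doc and a dict of
# gold-standard annotations, then pass it to nlp.update.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("I like London.")
example = Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})
nlp.update([example])
```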
|
@ -406,15 +406,15 @@ in line-by-line, while still making it easy to represent newlines in the data.
|
||||||
|
|
||||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||||
|
|
||||||
To populate a model's vocabulary, you can use the
|
To populate a pipeline's vocabulary, you can use the
|
||||||
[`spacy init model`](/api/cli#init-model) command and load in a
|
[`spacy init vocab`](/api/cli#init-vocab) command and load in a
|
||||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
||||||
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
||||||
language and vocabulary settings. All other lines are expected to be JSON
|
language and vocabulary settings. All other lines are expected to be JSON
|
||||||
objects describing an individual lexeme. The lexical attributes will then be set
|
objects describing an individual lexeme. The lexical attributes will then be set
|
||||||
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
||||||
command outputs a ready-to-use spaCy model with a `Vocab` containing the lexical
|
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
|
||||||
data.
|
lexical data.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### First line
|
### First line
|
||||||
|
@ -459,11 +459,11 @@ Here's an example of the 20 most frequent lexemes in the English training data:
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
|
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
|
||||||
```
|
```
|
||||||
|
|
||||||
## Model meta {#meta}
|
## Pipeline meta {#meta}
|
||||||
|
|
||||||
The model meta is available as the file `meta.json` and exported automatically
|
The pipeline meta is available as the file `meta.json` and exported
|
||||||
when you save an `nlp` object to disk. Its contents are available as
|
automatically when you save an `nlp` object to disk. Its contents are available
|
||||||
[`nlp.meta`](/api/language#meta).
|
as [`nlp.meta`](/api/language#meta).
|
||||||
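For example, a quick way to inspect the meta of an installed package (`en_core_web_sm` here as a stand-in):

```python
# Minimal sketch: inspect the pipeline meta at runtime.
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
nlp.to_disk("./my_pipeline")  # also writes the meta out as meta.json
```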
|
|
||||||
<Infobox variant="warning" title="Changed in v3.0">
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
@ -473,8 +473,8 @@ creating a Python package with [`spacy package`](/api/cli#package). How to set
|
||||||
up the `nlp` object is now defined in the
|
up the `nlp` object is now defined in the
|
||||||
[`config.cfg`](/api/data-formats#config), which includes detailed information
|
[`config.cfg`](/api/data-formats#config), which includes detailed information
|
||||||
about the pipeline components and their model architectures, and all other
|
about the pipeline components and their model architectures, and all other
|
||||||
settings and hyperparameters used to train the model. It's the **single source
|
settings and hyperparameters used to train the pipeline. It's the **single
|
||||||
of truth** used for loading a model.
|
source of truth** used for loading a pipeline.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -482,12 +482,12 @@ of truth** used for loading a model.
|
||||||
>
|
>
|
||||||
> ```json
|
> ```json
|
||||||
> {
|
> {
|
||||||
> "name": "example_model",
|
> "name": "example_pipeline",
|
||||||
> "lang": "en",
|
> "lang": "en",
|
||||||
> "version": "1.0.0",
|
> "version": "1.0.0",
|
||||||
> "spacy_version": ">=3.0.0,<3.1.0",
|
> "spacy_version": ">=3.0.0,<3.1.0",
|
||||||
> "parent_package": "spacy",
|
> "parent_package": "spacy",
|
||||||
> "description": "Example model for spaCy",
|
> "description": "Example pipeline for spaCy",
|
||||||
> "author": "You",
|
> "author": "You",
|
||||||
> "email": "you@example.com",
|
> "email": "you@example.com",
|
||||||
> "url": "https://example.com",
|
> "url": "https://example.com",
|
||||||
|
@ -510,23 +510,23 @@ of truth** used for loading a model.
|
||||||
> }
|
> }
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
||||||
| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ |
|
| `name` | Pipeline name, e.g. `"core_web_sm"`. The final package name will be `{lang}_{name}`. Defaults to `"pipeline"`. ~~str~~ |
|
||||||
| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||||
| `spacy_version` | spaCy version range the model is compatible with. Defaults to the spaCy version used to create the model, up to next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||||
| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `url` | Pipeline author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `license` | Pipeline license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
| `sources` | Data sources used to train the pipeline. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
||||||
| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
| `vectors` | Information about the word vectors included with the pipeline. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
||||||
| `pipeline` | Names of the pipeline components in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
| `pipeline` | Names of the pipeline components, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
||||||
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||||
| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
| `speed` | Inference speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
||||||
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create model. ~~str~~ |
|
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create pipeline. ~~str~~ |
|
||||||
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
||||||
|
|
|
@ -9,8 +9,8 @@ The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
|
||||||
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
|
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
|
||||||
using the
|
using the
|
||||||
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
|
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
|
||||||
It requires a pretrained [`DependencyParser`](/api/parser) or other component
|
It requires a trained [`DependencyParser`](/api/parser) or other component that
|
||||||
that sets the `Token.dep` attribute.
|
sets the `Token.dep` attribute.
|
||||||
|
|
||||||
## Pattern format {#patterns}
|
## Pattern format {#patterns}
|
||||||
|
|
||||||
|
|
|
@ -13,8 +13,8 @@ An `EntityLinker` component disambiguates textual mentions (tagged as named
|
||||||
entities) to unique identifiers, grounding the named entities into the "real
|
entities) to unique identifiers, grounding the named entities into the "real
|
||||||
world". It requires a `KnowledgeBase`, as well as a function to generate
|
world". It requires a `KnowledgeBase`, as well as a function to generate
|
||||||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||||
and a ML model to pick the right candidate, given the local context of the
|
and a machine learning model to pick the right candidate, given the local
|
||||||
mention.
|
context of the mention.
|
||||||
|
|
||||||
## Config and implementation {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
|
|
@ -7,9 +7,9 @@ source: spacy/language.py
|
||||||
|
|
||||||
Usually you'll load this once per process as `nlp` and pass the instance around
|
Usually you'll load this once per process as `nlp` and pass the instance around
|
||||||
your application. The `Language` class is created when you call
|
your application. The `Language` class is created when you call
|
||||||
[`spacy.load()`](/api/top-level#spacy.load) and contains the shared vocabulary
|
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
|
||||||
and [language data](/usage/adding-languages), optional model data loaded from a
|
[language data](/usage/adding-languages), optional binary weights, e.g. provided
|
||||||
[model package](/models) or a path, and a
|
by a [trained pipeline](/models), and the
|
||||||
[processing pipeline](/usage/processing-pipelines) containing components like
|
[processing pipeline](/usage/processing-pipelines) containing components like
|
||||||
the tagger or parser that are called on a document in order. You can also add
|
the tagger or parser that are called on a document in order. You can also add
|
||||||
your own processing pipeline components that take a `Doc` object, modify it and
|
your own processing pipeline components that take a `Doc` object, modify it and
|
||||||
|
@ -37,7 +37,7 @@ Initialize a `Language` object.
|
||||||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
||||||
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
|
| `meta` | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
|
||||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
||||||
|
|
||||||
## Language.from_config {#from_config tag="classmethod" new="3"}
|
## Language.from_config {#from_config tag="classmethod" new="3"}
|
||||||
|
@ -232,7 +232,7 @@ tuples of `Doc` and `GoldParse` objects.
|
||||||
|
|
||||||
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Continue training a pretrained model. Create and return an optimizer, and
|
Continue training a trained pipeline. Create and return an optimizer, and
|
||||||
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
|
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
|
||||||
Rehearsal is used to prevent models from "forgetting" their initialized
|
Rehearsal is used to prevent models from "forgetting" their initialized
|
||||||
"knowledge". To perform rehearsal, collect samples of text you want the models
|
"knowledge". To perform rehearsal, collect samples of text you want the models
|
||||||
|
@ -314,7 +314,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
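A minimal sketch of the rehearsal loop, with made-up raw texts and assuming a trained v3 pipeline package is installed:

```python
# Minimal sketch: resume training with rehearsal on raw texts.
import random
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
raw_texts = ["Some raw text here.", "Another unlabelled example."]
optimizer = nlp.resume_training()
for _ in range(5):
    random.shuffle(raw_texts)
    for text in raw_texts:
        example = Example.from_dict(nlp.make_doc(text), {})
        nlp.rehearse([example], sgd=optimizer)
```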
|
|
||||||
## Language.evaluate {#evaluate tag="method"}
|
## Language.evaluate {#evaluate tag="method"}
|
||||||
|
|
||||||
Evaluate a model's pipeline components.
|
Evaluate a pipeline's components.
|
||||||

 <Infobox variant="warning" title="Changed in v3.0">

@@ -386,24 +386,24 @@ component, adds it to the pipeline and returns it.

 > nlp.add_pipe("component", before="ner")
 > component = nlp.add_pipe("component", name="custom_name", last=True)
 >
-> # Add component from source model
+> # Add component from source pipeline
 > source_nlp = spacy.load("en_core_web_sm")
 > nlp.add_pipe("ner", source=source_nlp)
 > ```

 | Name | Description |
 | ------------------------------------- | ----------- |
 | `factory_name` | Name of the registered component factory. ~~str~~ |
 | `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
 | _keyword-only_ | |
 | `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ |
 | `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
 | `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
 | `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
-| `source` <Tag variant="new">3</Tag> | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ |
+| `source` <Tag variant="new">3</Tag> | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
 | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |

 ## Language.create_pipe {#create_pipe tag="method" new="2"}

@@ -790,9 +790,10 @@ token.ent_iob, token.ent_type

 ## Language.meta {#meta tag="property"}

-Custom meta data for the Language class. If a model is loaded, contains meta
-data of the model. The `Language.meta` is also what's serialized as the
-[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk.
+Custom meta data for the Language class. If a trained pipeline is loaded, this
+contains meta data of the pipeline. The `Language.meta` is also what's
+serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
+object to disk.

 > #### Example
 >

@@ -827,13 +828,15 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk`

 ## Language.to_disk {#to_disk tag="method" new="2"}

-Save the current state to a directory. If a model is loaded, this will **include
-the model**.
+Save the current state to a directory. Under the hood, this method delegates to
+the `to_disk` methods of the individual pipeline components, if available. This
+means that if a trained pipeline is loaded, all components and their weights
+will be saved to disk.

 > #### Example
 >
 > ```python
-> nlp.to_disk("/path/to/models")
+> nlp.to_disk("/path/to/pipeline")
 > ```

 | Name | Description |

@@ -844,22 +847,28 @@ the model**.

 ## Language.from_disk {#from_disk tag="method" new="2"}

-Loads state from a directory. Modifies the object in place and returns it. If
-the saved `Language` object contains a model, the model will be loaded. Note
-that this method is commonly used via the subclasses like `English` or `German`
-to make language-specific functionality like the
-[lexical attribute getters](/usage/adding-languages#lex-attrs) available to the
-loaded object.
+Loads state from a directory, including all data that was saved with the
+`Language` object. Modifies the object in place and returns it.
+
+<Infobox variant="warning" title="Important note">
+
+Keep in mind that this method **only loads serialized state** and doesn't set up
+the `nlp` object. This means that it requires the correct language class to be
+initialized and all pipeline components to be added to the pipeline. If you want
+to load a serialized pipeline from a directory, you should use
+[`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > from spacy.language import Language
-> nlp = Language().from_disk("/path/to/model")
+> nlp = Language().from_disk("/path/to/pipeline")
 >
-> # using language-specific subclass
+> # Using language-specific subclass
 > from spacy.lang.en import English
-> nlp = English().from_disk("/path/to/en_model")
+> nlp = English().from_disk("/path/to/pipeline")
 > ```

 | Name | Description |

@@ -924,7 +933,7 @@ available to the loaded object.

 | `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
 | `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
 | `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
-| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
+| `path` <Tag variant="new">2</Tag> | Path to the pipeline data directory, if a pipeline is loaded from a path or package. Otherwise `None`. ~~Optional[Path]~~ |

 ## Class attributes {#class-attributes}

@@ -1004,7 +1013,7 @@ serialization by passing in the string names via the `exclude` argument.

 >
 > ```python
 > data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
-> nlp.from_disk("./model-data", exclude=["ner"])
+> nlp.from_disk("/pipeline", exclude=["ner"])
 > ```

 | Name | Description |

@@ -286,7 +286,7 @@ context, the original parameters are restored.

 ## Pipe.add_label {#add_label tag="method"}

-Add a new label to the pipe. It's possible to extend pretrained models with new
+Add a new label to the pipe. It's possible to extend trained models with new
 labels, but care should be taken to avoid the "catastrophic forgetting" problem.

 > #### Example

@@ -12,14 +12,14 @@ menu:

 ## spaCy {#spacy hidden="true"}

-### spacy.load {#spacy.load tag="function" model="any"}
+### spacy.load {#spacy.load tag="function"}

-Load a model using the name of an installed
-[model package](/usage/training#models-generating), a string path or a
-`Path`-like object. spaCy will try resolving the load argument in this order. If
-a model is loaded from a model name, spaCy will assume it's a Python package and
-import it and call the model's own `load()` method. If a model is loaded from a
-path, spaCy will assume it's a data directory, load its
+Load a pipeline using the name of an installed
+[package](/usage/saving-loading#models), a string path or a `Path`-like object.
+spaCy will try resolving the load argument in this order. If a pipeline is
+loaded from a string name, spaCy will assume it's a Python package and import it
+and call the package's own `load()` method. If a pipeline is loaded from a path,
+spaCy will assume it's a data directory, load its
 [`config.cfg`](/api/data-formats#config) and use the language and pipeline
 information to construct the `Language` class. The data will be loaded in via
 [`Language.from_disk`](/api/language#from_disk).

@@ -36,38 +36,38 @@ specified separately using the new `exclude` keyword argument.

 >
 > ```python
 > nlp = spacy.load("en_core_web_sm") # package
-> nlp = spacy.load("/path/to/en") # string path
-> nlp = spacy.load(Path("/path/to/en")) # pathlib Path
+> nlp = spacy.load("/path/to/pipeline") # string path
+> nlp = spacy.load(Path("/path/to/pipeline")) # pathlib Path
 >
 > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
 > ```

 | Name | Description |
 | ------------------------------------ | ----------- |
-| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
+| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
 | _keyword-only_ | |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
-| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
+| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |

-Essentially, `spacy.load()` is a convenience wrapper that reads the model's
+Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
 [`config.cfg`](/api/data-formats#config), uses the language and pipeline
 information to construct a `Language` object, loads in the model data and
-returns it.
+weights, and returns it.

 ```python
 ### Abstract example
-cls = util.get_lang_class(lang)   # get language for ID, e.g. "en"
-nlp = cls()                       # initialize the language
+cls = spacy.util.get_lang_class(lang)   # 1. Get Language class, e.g. English
+nlp = cls()                             # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)            # add component to pipeline
-nlp.from_disk(model_data_path)    # load in model data
+    nlp.add_pipe(name)                  # 3. Add the component to the pipeline
+nlp.from_disk(data_path)                # 4. Load in the binary data
 ```

 ### spacy.blank {#spacy.blank tag="function" new="2"}

-Create a blank model of a given language class. This function is the twin of
+Create a blank pipeline of a given language class. This function is the twin of
 `spacy.load()`.

 > #### Example

@@ -85,9 +85,7 @@ Create a blank model of a given language class. This function is the twin of
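
As a quick illustrative sketch of `spacy.blank`, assuming nothing beyond spaCy itself is installed:

```python
import spacy

nlp_en = spacy.blank("en")  # blank English pipeline with only a tokenizer
doc = nlp_en("This is a sentence.")
```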

 ### spacy.info {#spacy.info tag="function"}

 The same as the [`info` command](/api/cli#info). Pretty-print information about
-your installation, models and local setup from within spaCy. To get the model
-meta data as a dictionary instead, you can use the `meta` attribute on your
-`nlp` object with a loaded model, e.g. `nlp.meta`.
+your installation, installed pipelines and local setup from within spaCy.

 > #### Example
 >

@@ -97,12 +95,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your

 > markdown = spacy.info(markdown=True, silent=True)
 > ```

 | Name | Description |
 | -------------- | ----------- |
-| `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ |
+| `model` | Optional pipeline, i.e. a package name or path (optional). ~~Optional[str]~~ |
 | _keyword-only_ | |
 | `markdown` | Print information as Markdown. ~~bool~~ |
 | `silent` | Don't print anything, just return. ~~bool~~ |

 ### spacy.explain {#spacy.explain tag="function"}

@@ -133,7 +131,7 @@ list of available terms, see

 Allocate data and perform operations on [GPU](/usage/#gpu), if available. If
 data has already been allocated on CPU, it will not be moved. Ideally, this
 function should be called right after importing spaCy and _before_ loading any
-models.
+pipelines.

 > #### Example
 >

@@ -152,7 +150,7 @@ models.
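
A minimal sketch of the recommended call order for `spacy.prefer_gpu`, assuming `en_core_web_sm` is installed:

```python
import spacy

activated = spacy.prefer_gpu()  # True if the GPU was activated, else False
nlp = spacy.load("en_core_web_sm")
```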

 Allocate data and perform operations on [GPU](/usage/#gpu). Will raise an error
 if no GPU is available. If data has already been allocated on CPU, it will not
 be moved. Ideally, this function should be called right after importing spaCy
-and _before_ loading any models.
+and _before_ loading any pipelines.

 > #### Example
 >

@@ -271,9 +269,9 @@ If a setting is not present in the options, the default value will be used.

 | `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |

 By default, displaCy comes with colors for all entity types used by
-[spaCy models](/models). If you're using custom entity types, you can use the
-`colors` setting to add your own colors for them. Your application or model
-package can also expose a
+[spaCy's trained pipelines](/models). If you're using custom entity types, you
+can use the `colors` setting to add your own colors for them. Your application
+or pipeline package can also expose a
 [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
 to add custom labels and their colors automatically.
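
As an illustrative sketch of the `colors` setting (the hex value here is an arbitrary assumption; the same mapping works for custom labels):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

# Override the default color for ORG entities with a custom hex value
options = {"colors": {"ORG": "#aa9cfc"}}
html = displacy.render(doc, style="ent", options=options)
```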

@@ -666,8 +664,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.

 ### util.load_model {#util.load_model tag="function" new="2"}

-Load a model from a package or data path. If called with a package name, spaCy
-will assume the model is a Python package and import and call its `load()`
+Load a pipeline from a package or data path. If called with a string name, spaCy
+will assume the pipeline is a Python package and import and call its `load()`
 method. If called with a path, spaCy will assume it's a data directory, read the
 language and pipeline settings from the [`config.cfg`](/api/data-formats#config)
 and create a `Language` object. The model data will then be loaded in via

@@ -683,16 +681,16 @@ and create a `Language` object. The model data will then be loaded in via

 | Name | Description |
 | ------------------------------------ | ----------- |
-| `name` | Package name or model path. ~~str~~ |
+| `name` | Package name or path. ~~str~~ |
 | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
-| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
+| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
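
For illustration, a small sketch of loading by package name with a component excluded, assuming `en_core_web_sm` is installed:

```python
from spacy import util

# Behaves like spacy.load, resolving the name to a package or path
nlp = util.load_model("en_core_web_sm", exclude=["parser"])
```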

 ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}

-A helper function to use in the `load()` method of a model package's
+A helper function to use in the `load()` method of a pipeline package's
 [`__init__.py`](https://github.com/explosion/spacy-models/tree/master/template/model/xx_model_name/__init__.py).

 > #### Example

@@ -706,70 +704,72 @@ A helper function to use in the `load()` method of a model package's

 | Name | Description |
 | ------------------------------------ | ----------- |
-| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
+| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
 | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
-| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
+| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |

 ### util.load_config {#util.load_config tag="function" new="3"}

-Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The
-config typically includes details about the model pipeline and how its
-components are created, as well as all training settings and hyperparameters.
+Load a pipeline's [`config.cfg`](/api/data-formats#config) from a file path. The
+config typically includes details about the components and how they're created,
+as well as all training settings and hyperparameters.

 > #### Example
 >
 > ```python
-> config = util.load_config("/path/to/model/config.cfg")
+> config = util.load_config("/path/to/config.cfg")
 > print(config.to_str())
 > ```

 | Name | Description |
 | ------------- | ----------- |
-| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
+| `path` | Path to the pipeline's `config.cfg`. ~~Union[str, Path]~~ |
 | `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
 | `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
-| **RETURNS** | The model's config. ~~Config~~ |
+| **RETURNS** | The pipeline's config. ~~Config~~ |

 ### util.load_meta {#util.load_meta tag="function" new="3"}

-Get a model's [`meta.json`](/api/data-formats#meta) from a file path and
-validate its contents.
+Get a pipeline's [`meta.json`](/api/data-formats#meta) from a file path and
+validate its contents. The meta typically includes details about author,
+licensing, data sources and version.

 > #### Example
 >
 > ```python
-> meta = util.load_meta("/path/to/model/meta.json")
+> meta = util.load_meta("/path/to/meta.json")
 > ```

 | Name | Description |
 | ----------- | ----------- |
-| `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ |
-| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
+| `path` | Path to the pipeline's `meta.json`. ~~Union[str, Path]~~ |
+| **RETURNS** | The pipeline's meta data. ~~Dict[str, Any]~~ |

 ### util.get_installed_models {#util.get_installed_models tag="function" new="3"}

-List all model packages installed in the current environment. This will include
-any spaCy model that was packaged with [`spacy package`](/api/cli#package).
-Under the hood, model packages expose a Python entry point that spaCy can check,
-without having to load the model.
+List all pipeline packages installed in the current environment. This will
+include any spaCy pipeline that was packaged with
+[`spacy package`](/api/cli#package). Under the hood, pipeline packages expose a
+Python entry point that spaCy can check, without having to load the `nlp`
+object.

 > #### Example
 >
 > ```python
-> model_names = util.get_installed_models()
+> names = util.get_installed_models()
 > ```

 | Name | Description |
 | ----------- | ----------- |
-| **RETURNS** | The string names of the models installed in the current environment. ~~List[str]~~ |
+| **RETURNS** | The string names of the pipelines installed in the current environment. ~~List[str]~~ |

 ### util.is_package {#util.is_package tag="function"}

 Check if string maps to a package installed via pip. Mainly used to validate
-[model packages](/usage/models).
+[pipeline packages](/usage/models).

 > #### Example
 >

@@ -786,7 +786,8 @@ Check if string maps to a package installed via pip. Mainly used to validate
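
A minimal sketch of the `is_package` check; the package names here are just examples:

```python
from spacy import util

util.is_package("en_core_web_sm")  # True if installed via pip
util.is_package("xyz")             # False
```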

 ### util.get_package_path {#util.get_package_path tag="function" new="2"}

 Get path to an installed package. Mainly used to resolve the location of
-[model packages](/usage/models). Currently imports the package to find its path.
+[pipeline packages](/usage/models). Currently imports the package to find its
+path.

 > #### Example
 >

@@ -795,10 +796,10 @@ Get path to an installed package. Mainly used to resolve the location of

 > # /usr/lib/python3.6/site-packages/en_core_web_sm
 > ```

 | Name | Description |
 | -------------- | ----------- |
 | `package_name` | Name of installed package. ~~str~~ |
-| **RETURNS** | Path to model package directory. ~~Path~~ |
+| **RETURNS** | Path to pipeline package directory. ~~Path~~ |

 ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}

@@ -1,6 +1,6 @@
 ---
-title: Models
-teaser: Downloadable pretrained models for spaCy
+title: Trained Models & Pipelines
+teaser: Downloadable trained pipelines and weights for spaCy
 menu:
   - ['Quickstart', 'quickstart']
   - ['Conventions', 'conventions']

@@ -8,15 +8,15 @@ menu:

 <!-- Update page, refer to new /api/architectures and training docs -->

-The models directory includes two types of pretrained models:
+This directory includes two types of packages:

-1. **Core models:** General-purpose pretrained models to predict named entities,
-   part-of-speech tags and syntactic dependencies. Can be used out-of-the-box
-   and fine-tuned on more specific data.
-2. **Starter models:** Transfer learning starter packs with pretrained weights
-   you can initialize your models with to achieve better accuracy. They can
+1. **Trained pipelines:** General-purpose spaCy pipelines to predict named
+   entities, part-of-speech tags and syntactic dependencies. Can be used
+   out-of-the-box and fine-tuned on more specific data.
+2. **Starters:** Transfer learning starter packs with pretrained weights you can
+   initialize your pipeline models with to achieve better accuracy. They can
    include word vectors (which will be used as features during training) or
-   other pretrained representations like BERT. These models don't include
+   other pretrained representations like BERT. These packages don't include
    components for specific tasks like NER or text classification and are
    intended to be used as base models when training your own models.

@@ -28,43 +28,42 @@ import QuickstartModels from 'widgets/quickstart-models.js'

 <Infobox title="Installation and usage" emoji="📖">

-For more details on how to use models with spaCy, see the
-[usage guide on models](/usage/models).
+For more details on how to use trained pipelines with spaCy, see the
+[usage guide](/usage/models).

 </Infobox>

-## Model naming conventions {#conventions}
+## Package naming conventions {#conventions}

-In general, spaCy expects all model packages to follow the naming convention of
-`[lang]_[name]`. For spaCy's models, we also chose to divide the name into
-three components:
+In general, spaCy expects all pipeline packages to follow the naming convention
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name
+into three components:

-1. **Type:** Model capabilities (e.g. `core` for general-purpose model with
+1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
    vocabulary, syntax, entities and word vectors, or `depent` for only vocab,
    syntax and entities).
-2. **Genre:** Type of text the model is trained on, e.g. `web` or `news`.
-3. **Size:** Model size indicator, `sm`, `md` or `lg`.
+2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`.
+3. **Size:** Package size indicator, `sm`, `md` or `lg`.

 For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English
-model trained on written web text (blogs, news, comments), that includes
+pipeline trained on written web text (blogs, news, comments), that includes
 vocabulary, vectors, syntax and entities.

-### Model versioning {#model-versioning}
+### Package versioning {#model-versioning}

-Additionally, the model versioning reflects both the compatibility with spaCy,
-as well as the major and minor model version. A model version `a.b.c` translates
-to:
+Additionally, the pipeline package versioning reflects both the compatibility
+with spaCy, as well as the major and minor version. A package version `a.b.c`
+translates to:

 - `a`: **spaCy major version**. For example, `2` for spaCy v2.x.
-- `b`: **Model major version**. Models with a different major version can't be
-  loaded by the same code. For example, changing the width of the model, adding
-  hidden layers or changing the activation changes the model major version.
-- `c`: **Model minor version**. Same model structure, but different parameter
-  values, e.g. from being trained on different data, for different numbers of
-  iterations, etc.
+- `b`: **Package major version**. Pipelines with a different major version can't
+  be loaded by the same code. For example, changing the width of the model,
+  adding hidden layers or changing the activation changes the major version.
+- `c`: **Package minor version**. Same pipeline structure, but different
+  parameter values, e.g. from being trained on different data, for different
+  numbers of iterations, etc.

 For a detailed compatibility overview, see the
-[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json)
-in the models repository. This is also the source of spaCy's internal
-compatibility check, performed when you run the [`download`](/api/cli#download)
-command.
+[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
+This is also the source of spaCy's internal compatibility check, performed when
+you run the [`download`](/api/cli#download) command.

@@ -1,9 +1,9 @@
 When you call `nlp` on a text, spaCy first tokenizes the text to produce a `Doc`
 object. The `Doc` is then processed in several different steps – this is also
 referred to as the **processing pipeline**. The pipeline used by the
-[default models](/models) typically include a tagger, a lemmatizer, a parser and
-an entity recognizer. Each pipeline component returns the processed `Doc`, which
-is then passed on to the next component.
+[trained pipelines](/models) typically include a tagger, a lemmatizer, a parser
+and an entity recognizer. Each pipeline component returns the processed `Doc`,
+which is then passed on to the next component.

 ![The processing pipeline](../../images/pipeline.svg)

@@ -23,14 +23,15 @@ is then passed on to the next component.

 | **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
 | **custom** | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |

-The processing pipeline always **depends on the statistical model** and its
-capabilities. For example, a pipeline can only include an entity recognizer
-component if the model includes data to make predictions of entity labels. This
-is why each model will specify the pipeline to use in its meta data and
-[config](/usage/training#config), as a simple list containing the component
-names:
+The capabilities of a processing pipeline always depend on the components, their
+models and how they were trained. For example, a pipeline for named entity
+recognition needs to include a trained named entity recognizer component with a
+statistical model and weights that enable it to **make predictions** of entity
+labels. This is why each pipeline specifies its components and their settings in
+the [config](/usage/training#config):

 ```ini
+[nlp]
 pipeline = ["tagger", "parser", "ner"]
 ```
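
To see which components a loaded pipeline actually ended up with, a quick sketch (assuming `en_core_web_sm` is installed; the printed names will vary by package and version):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)  # e.g. ['tok2vec', 'tagger', 'parser', 'ner', ...]
```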

@@ -1,9 +1,9 @@
 After tokenization, spaCy can **parse** and **tag** a given `Doc`. This is where
-the statistical model comes in, which enables spaCy to **make a prediction** of
-which tag or label most likely applies in this context. A model consists of
-binary data and is produced by showing a system enough examples for it to make
-predictions that generalize across the language – for example, a word following
-"the" in English is most likely a noun.
+the trained pipeline and its statistical models come in, which enable spaCy to
+**make predictions** of which tag or label most likely applies in this context.
+A trained component includes binary data that is produced by showing a system
+enough examples for it to make predictions that generalize across the language –
+for example, a word following "the" in English is most likely a noun.

 Linguistic annotations are available as
 [`Token` attributes](/api/token#attributes). Like many NLP libraries, spaCy

@@ -25,7 +25,8 @@ for token in doc:

 > - **Text:** The original word text.
 > - **Lemma:** The base form of the word.
-> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) part-of-speech tag.
+> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
+>   part-of-speech tag.
 > - **Tag:** The detailed part-of-speech tag.
 > - **Dep:** Syntactic dependency, i.e. the relation between tokens.
 > - **Shape:** The word shape – capitalization, punctuation, digits.

@@ -1,9 +1,9 @@
 If you've been modifying the pipeline, vocabulary, vectors and entities, or made
-updates to the model, you'll eventually want to **save your progress** – for
-example, everything that's in your `nlp` object. This means you'll have to
-translate its contents and structure into a format that can be saved, like a
-file or a byte string. This process is called serialization. spaCy comes with
-**built-in serialization methods** and supports the
+updates to the component models, you'll eventually want to **save your
+progress** – for example, everything that's in your `nlp` object. This means
+you'll have to translate its contents and structure into a format that can be
+saved, like a file or a byte string. This process is called serialization. spaCy
+comes with **built-in serialization methods** and supports the
 [Pickle protocol](https://www.diveinto.org/python3/serializing.html#dump).

 > #### What's pickle?
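
A minimal round-trip sketch of this idea; the directory name and the package used are arbitrary assumptions:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.to_disk("./my_pipeline")                # serialize everything in nlp
nlp_restored = spacy.load("./my_pipeline")  # load it back, fully set up
```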

@@ -1,25 +1,25 @@
 spaCy's tagger, parser, text categorizer and many other components are powered
 by **statistical models**. Every "decision" these components make – for example,
 which part-of-speech tag to assign, or whether a word is a named entity – is a
-**prediction** based on the model's current **weight values**. The weight
-values are estimated based on examples the model has seen
-during **training**. To train a model, you first need training data – examples
-of text, and the labels you want the model to predict. This could be a
-part-of-speech tag, a named entity or any other information.
+**prediction** based on the model's current **weight values**. The weight values
+are estimated based on examples the model has seen during **training**. To train
+a model, you first need training data – examples of text, and the labels you
+want the model to predict. This could be a part-of-speech tag, a named entity or
+any other information.

 Training is an iterative process in which the model's predictions are compared
 against the reference annotations in order to estimate the **gradient of the
 loss**. The gradient of the loss is then used to calculate the gradient of the
 weights through [backpropagation](https://thinc.ai/backprop101). The gradients
-indicate how the weight values should be changed so that the model's
-predictions become more similar to the reference labels over time.
+indicate how the weight values should be changed so that the model's predictions
+become more similar to the reference labels over time.

 > - **Training data:** Examples and their annotations.
 > - **Text:** The input text the model should predict a label for.
 > - **Label:** The label the model should predict.
 > - **Gradient:** The direction and rate of change for a numeric value.
->   Minimising the gradient of the weights should result in predictions that
->   are closer to the reference labels on the training data.
+>   Minimising the gradient of the weights should result in predictions that are
+>   closer to the reference labels on the training data.

 ![The training process](../../images/training.svg)
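
A schematic sketch of such a training loop, with a hypothetical text classifier and made-up training data; real spaCy v3 projects usually drive this through the training config and `spacy train` instead:

```python
import random
import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Hypothetical training data: texts plus the labels we want predicted
train_data = [
    ("I loved it", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("Utterly terrible", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
train_examples = [Example.from_dict(nlp.make_doc(t), ann) for t, ann in train_data]

optimizer = nlp.initialize(lambda: train_examples)
for epoch in range(10):
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=2):
        # Predictions are compared against the reference annotations; the
        # gradient of the loss drives the weight updates via backpropagation
        nlp.update(batch, sgd=optimizer, losses=losses)
```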
|
|
|
@ -24,12 +24,12 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
To make them compact and fast, spaCy's small [models](/models) (all packages
|
To make them compact and fast, spaCy's small [pipeline packages](/models) (all
|
||||||
that end in `sm`) **don't ship with word vectors**, and only include
|
packages that end in `sm`) **don't ship with word vectors**, and only include
|
||||||
context-sensitive **tensors**. This means you can still use the `similarity()`
|
context-sensitive **tensors**. This means you can still use the `similarity()`
|
||||||
methods to compare documents, spans and tokens – but the result won't be as
|
methods to compare documents, spans and tokens – but the result won't be as
|
||||||
good, and individual tokens won't have any vectors assigned. So in order to use
|
good, and individual tokens won't have any vectors assigned. So in order to use
|
||||||
_real_ word vectors, you need to download a larger model:
|
_real_ word vectors, you need to download a larger pipeline package:
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- python -m spacy download en_core_web_sm
|
- python -m spacy download en_core_web_sm
|
||||||
|
@ -38,11 +38,11 @@ _real_ word vectors, you need to download a larger model:
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
Models that come with built-in word vectors make them available as the
|
Pipeline packages that come with built-in word vectors make them available as
|
||||||
[`Token.vector`](/api/token#vector) attribute. [`Doc.vector`](/api/doc#vector)
|
the [`Token.vector`](/api/token#vector) attribute.
|
||||||
and [`Span.vector`](/api/span#vector) will default to an average of their token
|
[`Doc.vector`](/api/doc#vector) and [`Span.vector`](/api/span#vector) will
|
||||||
vectors. You can also check if a token has a vector assigned, and get the L2
|
default to an average of their token vectors. You can also check if a token has
|
||||||
norm, which can be used to normalize vectors.
|
a vector assigned, and get the L2 norm, which can be used to normalize vectors.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
@ -62,12 +62,12 @@ for token in tokens:
|
||||||

 > - **OOV**: Out-of-vocabulary

 The words "dog", "cat" and "banana" are all pretty common in English, so they're
-part of the model's vocabulary, and come with a vector. The word "afskfsd" on
+part of the pipeline's vocabulary, and come with a vector. The word "afskfsd" on
 the other hand is a lot less common and out-of-vocabulary – so its vector
 representation consists of 300 dimensions of `0`, which means it's practically
 nonexistent. If your application will benefit from a **large vocabulary** with
-more vectors, you should consider using one of the larger models or loading in a
-full vector package, for example,
+more vectors, you should consider using one of the larger pipeline packages or
+loading in a full vector package, for example,
 [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes
 over **1 million unique vectors**.

@@ -82,7 +82,7 @@ Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and

 method that lets you compare it with another object, and determine the
 similarity. Of course similarity is always subjective – whether two words, spans
 or documents are similar really depends on how you're looking at it. spaCy's
-similarity model usually assumes a pretty general-purpose definition of
+similarity implementation usually assumes a pretty general-purpose definition of
 similarity.

 > #### 📝 Things to try

@@ -99,7 +99,7 @@ similarity.

 ### {executable="true"}
 import spacy

-nlp = spacy.load("en_core_web_md")  # make sure to use larger model!
+nlp = spacy.load("en_core_web_md")  # make sure to use larger package!
 doc1 = nlp("I like salty fries and hamburgers.")
 doc2 = nlp("Fast food tastes very good.")

@@ -143,10 +143,9 @@ us that builds on top of spaCy and lets you train and query more interesting and
detailed word vectors. It combines noun phrases like "fast food" or "fair game"
|
detailed word vectors. It combines noun phrases like "fast food" or "fair game"
|
||||||
and includes the part-of-speech tags and entity labels. The library also
|
and includes the part-of-speech tags and entity labels. The library also
|
||||||
includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy)
|
includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy)
|
||||||
that let you evaluate vector models and create terminology lists. For more
|
that let you evaluate vectors and create terminology lists. For more details,
|
||||||
details, check out
|
check out [our blog post](https://explosion.ai/blog/sense2vec-reloaded). To
|
||||||
[our blog post](https://explosion.ai/blog/sense2vec-reloaded). To explore the
|
explore the semantic similarities across all Reddit comments of 2015 and 2019,
|
||||||
semantic similarities across all Reddit comments of 2015 and 2019, see the
|
see the [interactive demo](https://explosion.ai/demos/sense2vec).
|
||||||
[interactive demo](https://explosion.ai/demos/sense2vec).
|
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||

@@ -35,10 +35,10 @@ Using pip, spaCy releases are available as source packages and binary wheels.
 $ pip install -U spacy
 ```

-> #### Download models
+> #### Download pipelines
 >
-> After installation you need to download a language model. For more info and
-> available models, see the [docs on models](/models).
+> After installation you typically want to download a trained pipeline. For more
+> info and available packages, see the [models directory](/models).
 >
 > ```cli
 > $ python -m spacy download en_core_web_sm
@@ -54,7 +54,7 @@ To install additional data tables for lemmatization you can run
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
 separately. The lookups package is needed to provide normalization and
 lemmatization data for new models and to lemmatize in languages that don't yet
-come with pretrained models and aren't powered by third-party libraries.
+come with trained pipelines and aren't powered by third-party libraries.

 </Infobox>

@@ -88,23 +88,21 @@ and pull requests to the recipe and setup are always appreciated.
 > spaCy v2.x to v3.x may still require some changes to your code base. For
 > details see the sections on [backwards incompatibilities](/usage/v3#incompat)
 > and [migrating](/usage/v3#migrating). Also remember to download the new
-> models, and retrain your own models.
+> trained pipelines, and retrain your own pipelines.

 When updating to a newer version of spaCy, it's generally recommended to start
 with a clean virtual environment. If you're upgrading to a new major version,
-make sure you have the latest **compatible models** installed, and that there
-are no old and incompatible model packages left over in your environment, as
-this can often lead to unexpected results and errors. If you've trained your own
-models, keep in mind that your train and runtime inputs must match. This means
-you'll have to **retrain your models** with the new version.
+make sure you have the latest **compatible trained pipelines** installed, and
+that there are no old and incompatible packages left over in your environment,
+as this can often lead to unexpected results and errors. If you've trained your
+own models, keep in mind that your train and runtime inputs must match. This
+means you'll have to **retrain your pipelines** with the new version.

 spaCy also provides a [`validate`](/api/cli#validate) command, which lets you
-verify that all installed models are compatible with your spaCy version. If
-incompatible models are found, tips and installation instructions are printed.
-The command is also useful to detect out-of-sync model links resulting from
-links created in different virtual environments. It's recommended to run the
-command with `python -m` to make sure you're executing the correct version of
-spaCy.
+verify that all installed pipeline packages are compatible with your spaCy
+version. If incompatible packages are found, tips and installation instructions
+are printed. It's recommended to run the command with `python -m` to make sure
+you're executing the correct version of spaCy.

 ```cli
 $ pip install -U spacy
@@ -132,8 +130,8 @@ $ pip install -U spacy[cuda92]
 Once you have a GPU-enabled installation, the best way to activate it is to call
 [`spacy.prefer_gpu`](/api/top-level#spacy.prefer_gpu) or
 [`spacy.require_gpu()`](/api/top-level#spacy.require_gpu) somewhere in your
-script before any models have been loaded. `require_gpu` will raise an error if
-no GPU is available.
+script before any pipelines have been loaded. `require_gpu` will raise an error
+if no GPU is available.

 ```python
 import spacy
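# A typical continuation (a sketch with assumed names, not part of the change
# above): activate the GPU before loading any pipeline.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
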
@@ -238,16 +236,16 @@ installing, loading and using spaCy, as well as their solutions.
 <Accordion title="No compatible model found" id="compatible-model">

 ```
-No compatible model found for [lang] (spaCy vX.X.X).
+No compatible package found for [lang] (spaCy vX.X.X).
 ```

-This usually means that the model you're trying to download does not exist, or
-isn't available for your version of spaCy. Check the
+This usually means that the trained pipeline you're trying to download does not
+exist, or isn't available for your version of spaCy. Check the
 [compatibility table](https://github.com/explosion/spacy-models/tree/master/compatibility.json)
-to see which models are available for your spaCy version. If you're using an old
-version, consider upgrading to the latest release. Note that while spaCy
+to see which packages are available for your spaCy version. If you're using an
+old version, consider upgrading to the latest release. Note that while spaCy
 supports tokenization for [a variety of languages](/usage/models#languages), not
-all of them come with statistical models. To only use the tokenizer, import the
+all of them come with trained pipelines. To only use the tokenizer, import the
 language's `Language` class instead, for example
 `from spacy.lang.fr import French`.

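Tokenizer-only usage then looks like this, in a minimal sketch using the
`French` class mentioned above:

```python
from spacy.lang.fr import French

# The bare language class provides the tokenizer, but no trained components
nlp = French()
doc = nlp("C'est une phrase.")
print([token.text for token in doc])
```
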
@@ -259,7 +257,7 @@ language's `Language` class instead, for example
 no such option: --no-cache-dir
 ```

-The `download` command uses pip to install the models and sets the
+The `download` command uses pip to install the pipeline packages and sets the
 `--no-cache-dir` flag to prevent it from requiring too much memory.
 [This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
 requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
@@ -323,19 +321,19 @@ also run `which python` to find out where your Python executable is located.

 </Accordion>

-<Accordion title="Import error: No module named [model]" id="import-error-models">
+<Accordion title="Import error: No module named [name]" id="import-error-models">

 ```
 ImportError: No module named 'en_core_web_sm'
 ```

-As of spaCy v1.7, all models can be installed as Python packages. This means
-that they'll become importable modules of your application. If this fails, it's
-usually a sign that the package is not installed in the current environment. Run
-`pip list` or `pip freeze` to check which model packages you have installed, and
-install the [correct models](/models) if necessary. If you're importing a model
-manually at the top of a file, make sure to use the name of the package, not the
-shortcut link you've created.
+As of spaCy v1.7, all trained pipelines can be installed as Python packages.
+This means that they'll become importable modules of your application. If this
+fails, it's usually a sign that the package is not installed in the current
+environment. Run `pip list` or `pip freeze` to check which pipeline packages you
+have installed, and install the [correct package](/models) if necessary. If
+you're importing a package manually at the top of a file, make sure to use the
+full name of the package.

 </Accordion>


@@ -132,7 +132,7 @@ language can extend the `Lemmatizer` as part of its
 ### {executable="true"}
 import spacy

-# English models include a rule-based lemmatizer
+# English pipelines include a rule-based lemmatizer
 nlp = spacy.load("en_core_web_sm")
 lemmatizer = nlp.get_pipe("lemmatizer")
 print(lemmatizer.mode)  # 'rule'
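# A typical continuation (sketch): lemma_ holds the string lemma assigned by
# the rule-based lemmatizer.
doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])
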
@@ -156,14 +156,14 @@ component.

 The data for spaCy's lemmatizers is distributed in the package
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
-provided models already include all the required tables, but if you are creating
-new models, you'll probably want to install `spacy-lookups-data` to provide the
-data when the lemmatizer is initialized.
+provided trained pipelines already include all the required tables, but if you
+are creating new pipelines, you'll probably want to install `spacy-lookups-data`
+to provide the data when the lemmatizer is initialized.

 ### Lookup lemmatizer {#lemmatizer-lookup}

-For models without a tagger or morphologizer, a lookup lemmatizer can be added
-to the pipeline as long as a lookup table is provided, typically through
+For pipelines without a tagger or morphologizer, a lookup lemmatizer can be
+added to the pipeline as long as a lookup table is provided, typically through
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 lookup lemmatizer looks up the token surface form in the lookup table without
 reference to the token's part-of-speech or context.
@@ -178,9 +178,9 @@ nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

 ### Rule-based lemmatizer {#lemmatizer-rule}

-When training models that include a component that assigns POS (a morphologizer
-or a tagger with a [POS mapping](#mappings-exceptions)), a rule-based lemmatizer
-can be added using rule tables from
+When training pipelines that include a component that assigns part-of-speech
+tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
+rule-based lemmatizer can be added using rule tables from
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data):

 ```python
@@ -366,10 +366,10 @@ sequence of tokens. You can walk up the tree with the

 > #### Projective vs. non-projective
 >
-> For the [default English model](/models/en), the parse tree is **projective**,
-> which means that there are no crossing brackets. The tokens returned by
-> `.subtree` are therefore guaranteed to be contiguous. This is not true for the
-> German model, which has many
+> For the [default English pipelines](/models/en), the parse tree is
+> **projective**, which means that there are no crossing brackets. The tokens
+> returned by `.subtree` are therefore guaranteed to be contiguous. This is not
+> true for the German pipelines, which have many
 > [non-projective dependencies](https://explosion.ai/blog/german-model#word-order).
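
For instance, the subtree of the sentence root covers the whole sentence. A
minimal sketch, assuming `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# The root token is its own head; its subtree spans the full sentence
root = [token for token in doc if token.head == token][0]
print([t.text for t in root.subtree])
```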

 ```python
@@ -497,26 +497,27 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).

 ### Disabling the parser {#disabling}

-In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelines). If you don't
-need any of the syntactic information, you should disable the parser. Disabling
-the parser will make spaCy load and run much faster. If you want to load the
-parser, but need to disable it for specific documents, you can also control its
-use on the `nlp` object.
+In the [trained pipelines](/models) provided by spaCy, the parser is loaded and
+enabled by default as part of the
+[standard processing pipeline](/usage/processing-pipelines). If you don't need
+any of the syntactic information, you should disable the parser. Disabling the
+parser will make spaCy load and run much faster. If you want to load the parser,
+but need to disable it for specific documents, you can also control its use on
+the `nlp` object. For more details, see the usage guide on
+[disabling pipeline components](/usage/processing-pipelines/#disabling).

 ```python
 nlp = spacy.load("en_core_web_sm", disable=["parser"])
-nlp = English().from_disk("/model", disable=["parser"])
-doc = nlp("I don't want parsed", disable=["parser"])
 ```

 ## Named Entity Recognition {#named-entities}

 spaCy features an extremely fast statistical entity recognition system that
-assigns labels to contiguous spans of tokens. The default model identifies a
-variety of named and numeric entities, including companies, locations,
-organizations and products. You can add arbitrary classes to the entity
-recognition system, and update the model with new examples.
+assigns labels to contiguous spans of tokens. The default
+[trained pipelines](/models) can identify a variety of named and numeric
+entities, including companies, locations, organizations and products. You can
+add arbitrary classes to the entity recognition system, and update the model
+with new examples.
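
The predicted entities are exposed as `doc.ents`. A minimal sketch, assuming
`en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)
```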

 ### Named Entity Recognition 101 {#named-entities-101}

@@ -669,7 +670,7 @@ responsibility for ensuring that the data is left in a consistent state.

 <Infobox title="Annotation scheme">

-For details on the entity types available in spaCy's pretrained models, see the
+For details on the entity types available in spaCy's trained pipelines, see the
 "label scheme" sections of the individual models in the
 [models directory](/models).

@@ -710,9 +711,8 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
 To ground the named entities into the "real world", spaCy provides functionality
 to perform entity linking, which resolves a textual entity to a unique
 identifier from a knowledge base (KB). You can create your own
-[`KnowledgeBase`](/api/kb) and
-[train a new Entity Linking model](/usage/training#entity-linker) using that
-custom-made KB.
+[`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
+[`EntityLinker`](/api/entitylinker) using that custom knowledge base.

 ### Accessing entity identifiers {#entity-linking-accessing model="entity linking"}

@@ -724,7 +724,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
 ```python
 import spacy

-nlp = spacy.load("my_custom_el_model")
+nlp = spacy.load("my_custom_el_pipeline")
 doc = nlp("Ada Lovelace was born in London")

 # Document level
@@ -1042,13 +1042,15 @@ function that behaves the same way.

 <Infobox title="Important note" variant="warning">

-If you're using a statistical model, writing to the
+If you've loaded a trained pipeline, writing to the
 [`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
-work, since the regular expressions are read from the model and will be compiled
-when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
-you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
-tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
-directly.
+work, since the regular expressions are read from the pipeline data and will be
+compiled when you load it. If you modify `nlp.Defaults`, you'll only see the
+effect if you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to
+modify the tokenizer loaded from a trained pipeline, you should modify
+`nlp.tokenizer` directly. If you're training your own pipeline, you can register
+[callbacks](/usage/training/#custom-code-nlp-callbacks) to modify the `nlp`
+object before training.

 </Infobox>
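
Modifying the tokenizer directly might look like this: a sketch that adds a
made-up special case, assuming `en_core_web_sm` is installed:

```python
import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")
# Add a special-case rule to the loaded tokenizer instead of nlp.Defaults
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])  # ['gim', 'me', 'that']
```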

@@ -1218,11 +1220,11 @@ print(doc.text, [token.text for token in doc])

 <Infobox title="Important note on tokenization and models" variant="warning">

-Keep in mind that your model's result may be less accurate if the tokenization
+Keep in mind that your models' results may be less accurate if the tokenization
 during training differs from the tokenization at runtime. So if you modify a
-pretrained model's tokenization afterwards, it may produce very different
-predictions. You should therefore train your model with the **same tokenizer**
-it will be using at runtime. See the docs on
+trained pipeline's tokenization afterwards, it may produce very different
+predictions. You should therefore train your pipeline with the **same
+tokenizer** it will be using at runtime. See the docs on
 [training with custom tokenization](#custom-tokenizer-training) for details.

 </Infobox>
@@ -1231,7 +1233,7 @@ it will be using at runtime. See the docs on

 spaCy's [training config](/usage/training#config) describes the settings,
 hyperparameters, pipeline and tokenizer used for constructing and training the
-model. The `[nlp.tokenizer]` block refers to a **registered function** that
+pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a
 function called `whitespace_tokenizer` in the
 [`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
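
Such a registered function might look like the following sketch, which reuses
the `whitespace_tokenizer` name from the text above:

```python
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # Split on single spaces and build a Doc from the pieces
        words = text.split(" ")
        return Doc(self.vocab, words=words)

@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
    def create_tokenizer(nlp):
        return WhitespaceTokenizer(nlp.vocab)
    return create_tokenizer
```
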
@@ -1626,11 +1628,11 @@ spaCy provides four alternatives for sentence segmentation:

 Unlike other libraries, spaCy uses the dependency parse to determine sentence
 boundaries. This is usually the most accurate approach, but it requires a
-**statistical model** that provides accurate predictions. If your texts are
+**trained pipeline** that provides accurate predictions. If your texts are
 closer to general-purpose news or web text, this should work well out-of-the-box
-with spaCy's provided models. For social media or conversational text that
-doesn't follow the same rules, your application may benefit from a custom model
-or rule-based component.
+with spaCy's provided trained pipelines. For social media or conversational text
+that doesn't follow the same rules, your application may benefit from a custom
+trained or rule-based component.

 ```python
 ### {executable="true"}
@@ -1652,8 +1654,8 @@ parses consistent with the sentence boundaries.
 The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
 component that only provides sentence boundaries. Along with being faster and
 smaller than the parser, its primary advantage is that it's easier to train
-custom models because it only requires annotated sentence boundaries rather than
-full dependency parses.
+because it only requires annotated sentence boundaries rather than full
+dependency parses.

 <!-- TODO: update/confirm usage once we have final models trained -->

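A plausible usage sketch, pending the confirmed usage flagged in the TODO above
(the `senter` component name and the `exclude`/`enable_pipe` pattern are
assumptions here):

```python
import spacy

# Load the pipeline without the parser and enable the sentence recognizer
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another sentence.")
print([sent.text for sent in doc.sents])
```
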
@@ -1685,7 +1687,7 @@ need sentence boundaries without dependency parses.
 import spacy
 from spacy.lang.en import English

-nlp = English()  # just the language with no model
+nlp = English()  # just the language with no pipeline
 nlp.add_pipe("sentencizer")
 doc = nlp("This is a sentence. This is another sentence.")
 for sent in doc.sents:
@@ -1827,11 +1829,11 @@ or Tomas Mikolov's original
 [Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
 word vector libraries output an easy-to-read text-based format, where each line
 consists of the word followed by its vector. For everyday use, we want to
-convert the vectors model into a binary format that loads faster and takes up
-less space on disk. The easiest way to do this is the
-[`init model`](/api/cli#init-model) command-line utility. This will output a
-spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to
-some nice Latin vectors. You can then pass the directory path to
+convert the vectors into a binary format that loads faster and takes up less
+space on disk. The easiest way to do this is the
+[`init vocab`](/api/cli#init-vocab) command-line utility. This will output a
+blank spaCy pipeline in the directory `/tmp/la_vectors_wiki_lg`, giving you
+access to some nice Latin vectors. You can then pass the directory path to
 [`spacy.load`](/api/top-level#spacy.load).

 > #### Usage example
@@ -1845,7 +1847,7 @@ some nice Latin vectors. You can then pass the directory path to

 ```cli
 $ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
-$ python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
+$ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
 ```

 <Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
@@ -1853,13 +1855,13 @@ $ python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.
 To help you strike a good balance between coverage and memory usage, spaCy's
 [`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
 row** of the table. If you're using the
-[`spacy init model`](/api/cli#init-model) command to create a vocabulary,
+[`spacy init vocab`](/api/cli#init-vocab) command to create a vocabulary,
 pruning the vectors will be taken care of automatically if you set the
 `--prune-vectors` flag. You can also do it manually in the following steps:

-1. Start with a **word vectors model** that covers a huge vocabulary. For
+1. Start with a **word vectors package** that covers a huge vocabulary. For
    instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
-   model provides 300-dimensional GloVe vectors for over 1 million terms of
+   starter provides 300-dimensional GloVe vectors for over 1 million terms of
    English.
 2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
    lexemes will be sorted by descending probability to determine which vectors
@@ -1900,17 +1902,17 @@ the two words.
 In the example above, the vector for "Shore" was removed and remapped to the
 vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
 the vector of "leaving", which is identical. If you're using the
-[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors`
+[`init vocab`](/api/cli#init-vocab) command, you can set the `--prune-vectors`
 option to easily reduce the size of the vectors as you add them to a spaCy
-model:
+pipeline:

 ```cli
-$ python -m spacy init model en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
+$ python -m spacy init vocab en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
 ```

-This will create a spaCy model with vectors for the first 10,000 words in the
-vectors model. All other words in the vectors model are mapped to the closest
-vector among those retained.
+This will create a blank spaCy pipeline with vectors for the first 10,000 words
+in the vectors. All other words in the vectors are mapped to the closest vector
+among those retained.

 </Accordion>

@@ -1925,8 +1927,8 @@ possible. You can modify the vectors via the [`Vocab`](/api/vocab) or
 if you have vectors in an arbitrary format, as you can read in the vectors with
 your own logic, and just set them with a simple loop. This method is likely to
 be slower than approaches that work with the whole vectors table at once, but
-it's a great approach for once-off conversions before you save out your model to
-disk.
+it's a great approach for once-off conversions before you save out your `nlp`
+object to disk.
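
Such a loop might look like this sketch, with placeholder words and random
300-dimensional vectors standing in for data read with your own logic:

```python
import numpy
from spacy.vocab import Vocab

# Placeholder vector data; in practice, read this from your own format
vector_data = {
    "dog": numpy.random.uniform(-1, 1, (300,)),
    "cat": numpy.random.uniform(-1, 1, (300,)),
    "orange": numpy.random.uniform(-1, 1, (300,)),
}
vocab = Vocab()
for word, vector in vector_data.items():
    vocab.set_vector(word, vector)
```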

 ```python
 ### Adding vectors
@@ -1978,14 +1980,14 @@ print(nlp2.lang, [token.is_stop for token in nlp2("custom stop")])
 The [`@spacy.registry.languages`](/api/top-level#registry) decorator lets you
 register a custom language class and assign it a string name. This means that
 you can call [`spacy.blank`](/api/top-level#spacy.blank) with your custom
-language name, and even train models with it and refer to it in your
+language name, and even train pipelines with it and refer to it in your
 [training config](/usage/training#config).
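
Registration might look like this sketch, with `custom_en` as a made-up
language name:

```python
import spacy
from spacy.lang.en import English

@spacy.registry.languages("custom_en")
class CustomEnglish(English):
    lang = "custom_en"

# The registered name now works like a built-in language code
nlp = spacy.blank("custom_en")
```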

 > #### Config usage
 >
 > After registering your custom language class using the `languages` registry,
 > you can refer to it in your [training config](/usage/training#config). This
-> means spaCy will train your model using the custom subclass.
+> means spaCy will train your pipeline using the custom subclass.
 >
 > ```ini
 > [nlp]

@@ -8,25 +8,24 @@ menu:
 - ['Production Use', 'production']
 ---

-spaCy's models can be installed as **Python packages**. This means that they're
-a component of your application, just like any other module. They're versioned
-and can be defined as a dependency in your `requirements.txt`. Models can be
-installed from a download URL or a local directory, manually or via
-[pip](https://pypi.python.org/pypi/pip). Their data can be located anywhere on
-your file system.
+spaCy's trained pipelines can be installed as **Python packages**. This means
+that they're a component of your application, just like any other module.
+They're versioned and can be defined as a dependency in your `requirements.txt`.
+Trained pipelines can be installed from a download URL or a local directory,
+manually or via [pip](https://pypi.python.org/pypi/pip). Their data can be
+located anywhere on your file system.

 > #### Important note
 >
-> If you're upgrading to spaCy v3.x, you need to **download the new models**. If
-> you've trained statistical models that use spaCy's annotations, you should
-> **retrain your models** after updating spaCy. If you don't retrain, you may
-> suffer train/test skew, which might decrease your accuracy.
+> If you're upgrading to spaCy v3.x, you need to **download the new pipeline
+> packages**. If you've trained your own pipelines, you need to **retrain** them
+> after updating spaCy.

 ## Quickstart {hidden="true"}

 import QuickstartModels from 'widgets/quickstart-models.js'

-<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below." />
+<QuickstartModels title="Quickstart" id="quickstart" description="Install a default trained pipeline package, get the code to load it from within spaCy and an example to test it. For more options, see the section on available packages below." />

 ## Language support {#languages}

@@ -34,14 +33,14 @@ spaCy currently provides support for the following languages. You can help by
 [improving the existing language data](/usage/adding-languages#language-data)
 and extending the tokenization patterns.
 [See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
-contribute to model development.
+contribute to development.

 > #### Usage note
 >
-> If a model is available for a language, you can download it using the
-> [`spacy download`](/api/cli#download) command. In order to use languages that
-> don't yet come with a model, you have to import them directly, or use
-> [`spacy.blank`](/api/top-level#spacy.blank):
+> If a trained pipeline is available for a language, you can download it using
+> the [`spacy download`](/api/cli#download) command. In order to use languages
+> that don't yet come with a trained pipeline, you have to import them directly,
+> or use [`spacy.blank`](/api/top-level#spacy.blank):
 >
 > ```python
 > from spacy.lang.fi import Finnish
@@ -73,13 +72,13 @@ import Languages from 'widgets/languages.js'
 > nlp = spacy.blank("xx")
 > ```

-spaCy also supports models trained on more than one language. This is especially
-useful for named entity recognition. The language ID used for multi-language or
-language-neutral models is `xx`. The language class, a generic subclass
-containing only the base language data, can be found in
+spaCy also supports pipelines trained on more than one language. This is
+especially useful for named entity recognition. The language ID used for
+multi-language or language-neutral pipelines is `xx`. The language class, a
+generic subclass containing only the base language data, can be found in
 [`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx).

-To train a model using the neutral multi-language class, you can set
+To train a pipeline using the neutral multi-language class, you can set
 `lang = "xx"` in your [training config](/usage/training#config). You can also
 import the `MultiLanguage` class directly, or call
 [`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
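
Both options look like this minimal sketch:

```python
import spacy
from spacy.lang.xx import MultiLanguage

nlp = MultiLanguage()    # import the class directly
nlp = spacy.blank("xx")  # or lazy-load it via the language ID
```
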
@@ -111,7 +110,7 @@ The Chinese language class supports three word segmentation options:
 3. **PKUSeg**: As of spaCy v2.3.0, support for
    [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
    better segmentation for Chinese OntoNotes and the provided
-   [Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
+   [Chinese pipelines](/models/zh). Enable PKUSeg with the tokenizer option
    `{"segmenter": "pkuseg"}`.

 <Infobox variant="warning">
@@ -169,9 +168,9 @@ nlp.tokenizer.pkuseg_update_user_dict([], reset=True)

 </Accordion>

-<Accordion title="Details on pretrained and custom Chinese models" spaced>
+<Accordion title="Details on trained and custom Chinese pipelines" spaced>

-The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg`
+The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
 model trained only on
 [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
 models provided by `pkuseg` include data restricted to research use. For
@@ -208,29 +207,29 @@ nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_mo
 The Japanese language class uses
 [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
 segmentation and part-of-speech tagging. The default Japanese language class and
-the provided Japanese models use SudachiPy split mode `A`. The `meta` argument
-of the `Japanese` language class can be used to configure the split mode to `A`,
-`B` or `C`.
+the provided Japanese pipelines use SudachiPy split mode `A`. The `meta`
+argument of the `Japanese` language class can be used to configure the split
+mode to `A`, `B` or `C`.

 <Infobox variant="warning">

 If you run into errors related to `sudachipy`, which is currently under active
 development, we suggest downgrading to `sudachipy==0.4.5`, which is the version
-used for training the current [Japanese models](/models/ja).
+used for training the current [Japanese pipelines](/models/ja).

 </Infobox>

-## Installing and using models {#download}
+## Installing and using trained pipelines {#download}

-The easiest way to download a model is via spaCy's
+The easiest way to download a trained pipeline is via spaCy's
 [`download`](/api/cli#download) command. It takes care of finding the
-best-matching model compatible with your spaCy installation.
+best-matching package compatible with your spaCy installation.

 > #### Important note for v3.0
 >
-> Note that as of spaCy v3.0, model shortcut links that create (potentially
+> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
 > brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed model, use its full name:
+> and load an installed pipeline package, use its full name:
 >
 > ```diff
 > - python -m spacy download en
@@ -243,14 +242,14 @@ best-matching model compatible with your spaCy installation.
 > ```

 ```cli
-# Download best-matching version of a model for your spaCy installation
+# Download best-matching version of a package for your spaCy installation
 $ python -m spacy download en_core_web_sm

-# Download exact model version
+# Download exact package version
 $ python -m spacy download en_core_web_sm-3.0.0 --direct
 ```

-The download command will [install the model](/usage/models#download-pip) via
+The download command will [install the package](/usage/models#download-pip) via
 pip and place the package in your `site-packages` directory.

 ```cli
@@ -266,11 +265,11 @@ doc = nlp("This is a sentence.")

 ### Installation via pip {#download-pip}

-To download a model directly using [pip](https://pypi.python.org/pypi/pip),
-point `pip install` to the URL or local path of the archive file. To find the
-direct link to a model, head over to the
-[model releases](https://github.com/explosion/spacy-models/releases), right
-click on the archive link and copy it to your clipboard.
+To download a trained pipeline directly using
+[pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local
+path of the archive file. To find the direct link to a package, head over to the
+[releases](https://github.com/explosion/spacy-models/releases), right click on
+the archive link and copy it to your clipboard.

 ```bash
 # With external URL
@@ -280,60 +279,61 @@ $ pip install https://github.com/explosion/spacy-models/releases/download/en_cor
 $ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
 ```

-By default, this will install the model into your `site-packages` directory. You
-can then use `spacy.load()` to load it via its package name or
+By default, this will install the pipeline package into your `site-packages`
+directory. You can then use `spacy.load` to load it via its package name or
 [import it](#usage-import) explicitly as a module. If you need to download
-models as part of an automated process, we recommend using pip with a direct
-link, instead of relying on spaCy's [`download`](/api/cli#download) command.
+pipeline packages as part of an automated process, we recommend using pip with a
+direct link, instead of relying on spaCy's [`download`](/api/cli#download)
+command.

 You can also add the direct download link to your application's
 `requirements.txt`. For more details, see the section on
-[working with models in production](#production).
+[working with pipeline packages in production](#production).

 ### Manual download and installation {#download-manual}

 In some cases, you might prefer downloading the data manually, for example to
-place it into a custom directory. You can download the model via your browser
+place it into a custom directory. You can download the package via your browser
 from the [latest releases](https://github.com/explosion/spacy-models/releases),
 or configure your own download script using the URL of the archive file. The
-archive consists of a model directory that contains another directory with the
-model data.
+archive consists of a package directory that contains another directory with the
+pipeline data.

 ```yaml
 ### Directory structure {highlight="6"}
 └── en_core_web_md-3.0.0.tar.gz       # downloaded archive
     ├── setup.py                      # setup file for pip installation
-    ├── meta.json                     # copy of model meta
-    └── en_core_web_md                # 📦 model package
+    ├── meta.json                     # copy of pipeline meta
+    └── en_core_web_md                # 📦 pipeline package
         ├── __init__.py               # init for pip installation
-        └── en_core_web_md-3.0.0      # model data
-            ├── config.cfg            # model config
-            ├── meta.json             # model meta
+        └── en_core_web_md-3.0.0      # pipeline data
+            ├── config.cfg            # pipeline config
+            ├── meta.json             # pipeline meta
             └── ...                   # directories with component data
 ```

-You can place the **model package directory** anywhere on your local file
+You can place the **pipeline package directory** anywhere on your local file
 system.

-### Using models with spaCy {#usage}
+### Using trained pipelines with spaCy {#usage}

-To load a model, use [`spacy.load`](/api/top-level#spacy.load) with the model's
-package name or a path to the data directory:
+To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with
+the package name or a path to the data directory:

 > #### Important note for v3.0
 >
-> Note that as of spaCy v3.0, model shortcut links that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To load an
-> installed model, use its full name:
+> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
+> brittle) symlinks in your spaCy installation are **deprecated**. To download
+> and load an installed pipeline package, use its full name:
 >
 > ```diff
-> - nlp = spacy.load("en")
-> + nlp = spacy.load("en_core_web_sm")
+> - python -m spacy download en
+> + python -m spacy download en_core_web_sm
 > ```

 ```python
 import spacy
-nlp = spacy.load("en_core_web_sm")           # load model package "en_core_web_sm"
+nlp = spacy.load("en_core_web_sm")           # load package "en_core_web_sm"
 nlp = spacy.load("/path/to/en_core_web_sm")  # load package from a directory

 doc = nlp("This is a sentence.")
@@ -342,17 +342,18 @@ doc = nlp("This is a sentence.")
 <Infobox title="Tip: Preview model info" emoji="💡">

 You can use the [`info`](/api/cli#info) command or
-[`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data
-before loading it. Each `Language` object with a loaded model also exposes the
-model's meta data as the attribute `meta`. For example, `nlp.meta['version']`
-will return the model's version.
+[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline
+package's meta data before loading it. Each `Language` object with a loaded
+pipeline also exposes the pipeline's meta data as the attribute `meta`. For
+example, `nlp.meta['version']` will return the package version.

 </Infobox>
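
In code, this might look like the following minimal sketch, assuming
`en_core_web_sm` is installed:

```python
import spacy

# Print the package meta data without loading the pipeline
spacy.info("en_core_web_sm")

nlp = spacy.load("en_core_web_sm")
print(nlp.meta["version"])  # the package version from the pipeline meta
```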

-### Importing models as modules {#usage-import}
+### Importing pipeline packages as modules {#usage-import}

-If you've installed a model via spaCy's downloader, or directly via pip, you can
-also `import` it and then call its `load()` method with no arguments:
+If you've installed a trained pipeline via [`spacy download`](/api/cli#download)
+or directly via pip, you can also `import` it and then call its `load()` method
+with no arguments:

 ```python
 ### {executable="true"}
@@ -362,51 +363,38 @@ nlp = en_core_web_sm.load()
 doc = nlp("This is a sentence.")
 ```

-How you choose to load your models ultimately depends on personal preference.
-However, **for larger code bases**, we usually recommend native imports, as this
-will make it easier to integrate models with your existing build process,
-continuous integration workflow and testing framework. It'll also prevent you
-from ever trying to load a model that is not installed, as your code will raise
-an `ImportError` immediately, instead of failing somewhere down the line when
-calling `spacy.load()`.
-
-For more details, see the section on
-[working with models in production](#production).
-
-### Using your own models {#own-models}
-
-If you've trained your own model, for example for
-[additional languages](/usage/adding-languages) or
-[custom named entities](/usage/training#ner), you can save its state using the
-[`Language.to_disk()`](/api/language#to_disk) method. To make the model more
-convenient to deploy, we recommend wrapping it as a Python package.
-
-For more information and a detailed guide on how to package your model, see the
-documentation on [saving and loading models](/usage/saving-loading#models).
-
-## Using models in production {#production}
-
-If your application depends on one or more models, you'll usually want to
-integrate them into your continuous integration workflow and build process.
-While spaCy provides a range of useful helpers for downloading, linking and
-loading models, the underlying functionality is entirely based on native Python
-packages. This allows your application to handle a model like any other package
-dependency.
+How you choose to load your trained pipelines ultimately depends on personal
+preference. However, **for larger code bases**, we usually recommend native
+imports, as this will make it easier to integrate pipeline packages with your
+existing build process, continuous integration workflow and testing framework.
+It'll also prevent you from ever trying to load a package that is not installed,
+as your code will raise an `ImportError` immediately, instead of failing
+somewhere down the line when calling `spacy.load()`. For more details, see the
+section on [working with pipeline packages in production](#production).
+
+## Using trained pipelines in production {#production}
+
+If your application depends on one or more trained pipeline packages, you'll
+usually want to integrate them into your continuous integration workflow and
+build process. While spaCy provides a range of useful helpers for downloading
+and loading pipeline packages, the underlying functionality is entirely based on
+native Python packaging. This allows your application to handle a spaCy pipeline
+like any other package dependency.

 <!-- TODO: reference relevant spaCy project -->

-### Downloading and requiring model dependencies {#models-download}
+### Downloading and requiring package dependencies {#models-download}

 spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a
 convenient, interactive wrapper. It performs compatibility checks and prints
-detailed error messages and warnings. However, if you're downloading models as
-part of an automated build process, this only adds an unnecessary layer of
-complexity. If you know which models your application needs, you should be
-specifying them directly.
+detailed error messages and warnings. However, if you're downloading pipeline
+packages as part of an automated build process, this only adds an unnecessary
+layer of complexity. If you know which packages your application needs, you
+should be specifying them directly.

-Because all models are valid Python packages, you can add them to your
+Because pipeline packages are valid Python packages, you can add them to your
 application's `requirements.txt`. If you're running your own internal PyPi
-installation, you can upload the models there. pip's
+installation, you can upload the pipeline packages there. pip's
 [requirements file format](https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format)
 supports both package names to download via a PyPi server, as well as direct
 URLs.
|
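For example, a `requirements.txt` might pin spaCy and install a pipeline
package from a direct URL – the version numbers and release URL below are
illustrative, not a recommendation:

```text
spacy>=3.0.0,<4.0.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
```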
@@ -422,17 +410,17 @@ the download URL. This way, the package won't be re-downloaded and overwritten
if it's already installed - just like when you're downloading a package from
PyPI.

All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can
use the [`spacy package`](/api/cli#package) command to generate the required
metadata and turn it into a loadable package.
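As a rough sketch of that step – the input and output paths, package name and
version here are placeholders:

```cli
$ python -m spacy package ./training/model-best ./packages --name my_pipeline --version 1.0.0
```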
### Loading and testing pipeline packages {#models-loading}

Pipeline packages are regular Python packages, so you can also import them as a
package using Python's native `import` syntax, and then call the `load` method
to load the data and return an `nlp` object:

```python
import en_core_web_sm

nlp = en_core_web_sm.load()
```
In general, this approach is recommended for larger code bases, as it's more
"native", and doesn't rely on spaCy's loader to resolve string names to
packages. If a package can't be imported, Python will raise an `ImportError`
immediately. And if a package is imported but not used, any linter will catch
that.

Similarly, it'll give you more flexibility when writing tests that require
loading pipelines. For example, instead of writing your own `try` and `except`
logic around spaCy's loader, you can use
[pytest](http://pytest.readthedocs.io/en/latest/)'s
[`importorskip()`](https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip)
method to only run a test if a specific pipeline package or version is
installed. Each pipeline package exposes a `__version__` attribute which you can
also use to perform your own version compatibility checks before loading it.
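For instance, a minimal test module along these lines (the minimum version is
an arbitrary example):

```python
import pytest

# Skip all tests in this module unless the package is installed in a
# compatible version – importorskip checks the module's __version__
en_core_web_sm = pytest.importorskip("en_core_web_sm", minversion="3.0.0")

def test_load_pipeline():
    nlp = en_core_web_sm.load()
    doc = nlp("This is a sentence.")
    assert len(doc) > 0
```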
@@ -42,8 +42,8 @@ texts = ["This is a text", "These are lots of texts", "..."]

- Only apply the **pipeline components you need**. Getting predictions from the
  model that you don't actually need adds up and becomes very inefficient at
  scale. To prevent this, use the `disable` keyword argument to disable
  components you don't need – either when loading a pipeline, or during
  processing with `nlp.pipe`. See the section on
  [disabling pipeline components](#disabling) for more details and examples, as
  well as the sketch after this box.

</Infobox>
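To illustrate both variants, a minimal sketch (assuming the small English
package is installed):

```python
import spacy

# Disable components you don't need when loading the pipeline ...
nlp = spacy.load("en_core_web_sm", disable=["parser"])

# ... or only for a specific batch of texts during processing
texts = ["This is a text", "These are lots of texts"]
docs = list(nlp.pipe(texts, disable=["ner"]))
```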
@@ -95,7 +95,7 @@ spaCy makes it very easy to create your own pipelines consisting of reusable
components – this includes spaCy's default tagger, parser and entity recognizer,
but also your own custom processing functions. A pipeline component can be added
to an already existing `nlp` object, specified when initializing a `Language`
class, or defined within a [pipeline package](/usage/saving-loading#models).

> #### config.cfg (excerpt)
>
@@ -115,7 +115,7 @@ class, or defined within a [model package](/usage/saving-loading#models).
> # Settings for the parser component
> ```

When you load a pipeline, spaCy first consults the
[`meta.json`](/usage/saving-loading#models) and
[`config.cfg`](/usage/training#config). The config tells spaCy what language
class to use, which components are in the pipeline, and how those components
@@ -131,8 +131,7 @@ should be created. spaCy will then do the following:
   component with [`add_pipe`](/api/language#add_pipe). The settings are
   passed into the factory.
3. Make the **model data** available to the `Language` class by calling
   [`from_disk`](/api/language#from_disk) with the path to the data directory.

So when you call this...

```python
nlp = spacy.load("en_core_web_sm")
```

... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
pipeline `["tagger", "parser", "ner"]`. spaCy will then initialize
`spacy.lang.en.English`, and create each pipeline component and add it to the
processing pipeline. It'll then load in the model data from the data directory
and return the modified `Language` class for you to use as the `nlp` object.
<Infobox title="Changed in v3.0" variant="warning">
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
spaCy v3.0 introduces a `config.cfg`, which includes more detailed settings for
|
spaCy v3.0 introduces a `config.cfg`, which includes more detailed settings for
|
||||||
the model pipeline, its components and the
|
the pipeline, its components and the [training process](/usage/training#config).
|
||||||
[training process](/usage/training#config). You can export the config of your
|
You can export the config of your current `nlp` object by calling
|
||||||
current `nlp` object by calling [`nlp.config.to_disk`](/api/language#config).
|
[`nlp.config.to_disk`](/api/language#config).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
Fundamentally, a [spaCy pipeline package](/models) consists of three components:
**the weights**, i.e. binary data loaded in from a directory, a **pipeline** of
functions called in order, and **language data** like the tokenization rules and
language-specific settings. For example, a Spanish NER pipeline requires
different weights, language data and components than an English parsing and
tagging pipeline. This is also why the pipeline state is always held by the
`Language` class. [`spacy.load`](/api/top-level#spacy.load) puts this all
together and returns an instance of `Language` with a pipeline set and access to
the binary data:
```python
cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
nlp = cls()                            # 2. Initialize it
for name in pipeline:
    nlp.add_pipe(name)                 # 3. Add the component to the pipeline
nlp.from_disk(data_path)               # 4. Load in the binary data
```
When you call `nlp` on a text, spaCy will **tokenize** it and then **call each
@@ -243,28 +242,29 @@ tagger or the parser, you can **disable or exclude** it. This can sometimes make
a big difference and improve loading and inference speed. There are two
different mechanisms you can use:

1. **Disable:** The component and its data will be loaded with the pipeline, but
   it will be disabled by default and not run as part of the processing
   pipeline. To run it, you can explicitly enable it by calling
   [`nlp.enable_pipe`](/api/language#enable_pipe). When you save out the `nlp`
   object, the disabled component will be included but disabled by default.
2. **Exclude:** Don't load the component and its data with the pipeline. Once
   the pipeline is loaded, there will be no reference to the excluded component.

Disabled and excluded component names can be provided to
[`spacy.load`](/api/top-level#spacy.load) as a list.
<!-- TODO: update with info on our models shipped with optional components -->

> #### 💡 Optional pipeline components
>
> The `disable` mechanism makes it easy to distribute pipeline packages with
> optional components that you can enable or disable at runtime. For instance,
> your pipeline may include a statistical _and_ a rule-based component for
> sentence segmentation, and you can choose which one to run depending on your
> use case.

```python
# Load the pipeline without the entity recognizer
nlp = spacy.load("en_core_web_sm", exclude=["ner"])

# Load the tagger and parser but don't enable them
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
```
@@ -358,25 +358,25 @@ run as part of the pipeline.

| `nlp.component_names` | All component names, including disabled components. |
| `nlp.disabled`        | Names of components that are currently disabled.    |
### Sourcing components from existing pipelines {#sourced-components new="3"}

Pipeline components that are independent can also be reused across pipelines.
Instead of adding a new blank component, you can also copy an existing component
from a trained pipeline by setting the `source` argument on
[`nlp.add_pipe`](/api/language#add_pipe). The first argument will then be
interpreted as the name of the component in the source pipeline – for instance,
`"ner"`. This is especially useful for
[training a pipeline](/usage/training#config-components) because it lets you mix
and match components and create fully custom pipeline packages with updated
trained components and new components trained on your data.
<Infobox variant="warning" title="Important note for trained components">

When reusing components across pipelines, keep in mind that the **vocabulary**,
**vectors** and model settings **must match**. If a trained pipeline includes
[word vectors](/usage/linguistic-features#vectors-similarity) and the component
uses them as features, the pipeline you copy it to needs to have the _same_
vectors available – otherwise, it won't be able to make the same predictions.

</Infobox>
@@ -384,7 +384,7 @@ available – otherwise, it won't be able to make the same predictions.
>
> Instead of providing a `factory`, component blocks in the training
> [config](/usage/training#config) can also define a `source`. The string needs
> to be a loadable spaCy pipeline package or path. The
>
> ```ini
> [components.ner]
@@ -404,11 +404,11 @@ available – otherwise, it won't be able to make the same predictions.

```python
### {executable="true"}
import spacy

# The source pipeline with different components
source_nlp = spacy.load("en_core_web_sm")
print(source_nlp.pipe_names)

# Add only the entity recognizer to the new blank pipeline
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)
print(nlp.pipe_names)
```
@@ -535,8 +535,8 @@ only being able to modify it afterwards.

The [`@Language.component`](/api/language#component) decorator lets you turn a
simple function into a pipeline component. It takes at least one argument, the
**name** of the component factory. You can use this name to add an instance of
your component to the pipeline. It can also be listed in your pipeline config,
so you can save, load and train pipelines using your component.
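For example, a minimal sketch of a stateless component (the component name is
arbitrary):

```python
import spacy
from spacy.language import Language

@Language.component("print_doc_length")
def print_doc_length(doc):
    # A component is just a callable that receives and returns a Doc
    print("Doc length:", len(doc))
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("print_doc_length")
doc = nlp("This is a sentence.")
```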
Custom components can be added to the pipeline using the
[`add_pipe`](/api/language#add_pipe) method. Optionally, you can either specify
|
||||||
[model](#trainable-components) – saving out the component config becomes
|
[model](#trainable-components) – saving out the component config becomes
|
||||||
impossible because there's no way for spaCy to know _how_ that object was
|
impossible because there's no way for spaCy to know _how_ that object was
|
||||||
created, and what to do to create it again. This makes it much harder to save,
|
created, and what to do to create it again. This makes it much harder to save,
|
||||||
load and train custom models with custom components. A simple solution is to
|
load and train custom pipelines with custom components. A simple solution is to
|
||||||
**register a function** that returns your resources. The
|
**register a function** that returns your resources. The
|
||||||
[registry](/api/top-level#registry) lets you **map string names to functions**
|
[registry](/api/top-level#registry) lets you **map string names to functions**
|
||||||
that create objects, so given a name and optional arguments, spaCy will know how
|
that create objects, so given a name and optional arguments, spaCy will know how
|
||||||
|
@@ -876,13 +876,13 @@ the result of the registered function is passed in as the key "dictionary".

Using a registered function also means that you can easily include your custom
components in pipelines that you [train](/usage/training). To make sure spaCy
knows where to find your custom `@assets` function, you can pass in a Python
file via the argument `--code`. If someone else is using your component, all
they have to do to customize the data is to register their own function and swap
out the name. By the way, registered functions can also take **arguments** that
can be defined in the config – you can read more about this in the docs on
[training with custom code](/usage/training#custom-code).
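As a rough sketch of the registration side – this excerpt refers to an
`@assets` registry, while released spaCy versions expose an equivalent
mechanism as `spacy.registry.misc`, which is what's assumed here:

```python
import spacy

@spacy.registry.misc("load_my_dictionary.v1")
def load_my_dictionary():
    # Return the resource; the config can then reference it by its string name
    return {"patterns": ["foo", "bar"]}
```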
### Python type hints and pydantic validation {#type-hints new="3"}
@@ -1121,7 +1121,14 @@ loss is calculated and to add evaluation scores to the training output.
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |

<Infobox title="Custom trainable components and models" emoji="📖">

For more details on how to implement your own trainable components and model
architectures, and plug existing models implemented in PyTorch or TensorFlow
into your spaCy pipeline, see the usage guide on
[layers and model architectures](/usage/layers-architectures#components).

</Infobox>

## Extension attributes {#custom-components-attributes new="2"}
@@ -1322,9 +1329,9 @@ While it's generally recommended to use the `Doc._`, `Span._` and `Token._`
proxies to add your own custom attributes, spaCy offers a few exceptions to
allow **customizing the built-in methods** like
[`Doc.similarity`](/api/doc#similarity) or [`Doc.vector`](/api/doc#vector) with
your own hooks, which can rely on components you train yourself. For instance,
you can provide your own on-the-fly sentence segmentation algorithm or document
similarity method.
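A minimal sketch of the mechanism, using `doc.user_hooks` to override
`Doc.similarity` – the toy overlap score is purely illustrative:

```python
from spacy.language import Language

@Language.component("custom_similarity")
def custom_similarity(doc):
    def similarity(doc1, doc2):
        # Toy score: lexical overlap between the two docs
        s1 = {token.lower_ for token in doc1}
        s2 = {token.lower_ for token in doc2}
        return len(s1 & s2) / max(len(s1 | s2), 1)
    doc.user_hooks["similarity"] = similarity
    return doc
```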
Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token`
objects by adding a component to the pipeline. For instance, to customize the
@@ -1456,13 +1463,13 @@ function that takes a `Doc`, modifies it and returns it.
method. However, a third-party extension should **never silently overwrite
built-ins**, or attributes set by other extensions.

- If you're looking to publish a pipeline package that depends on a custom
  pipeline component, you can either **require it** in the package's
  dependencies, or – if the component is specific and lightweight – choose to
  **ship it with your pipeline package**. Just make sure the
  [`@Language.component`](/api/language#component) or
  [`@Language.factory`](/api/language#factory) decorator that registers the
  custom component runs in your package's `__init__.py` or is exposed via an
  [entry point](/usage/saving-loading#entry-points).

- Once you're ready to share your extension with others, make sure to **add docs
```python
def custom_ner_wrapper(doc):
    ...
    return doc
```

The `custom_ner_wrapper` can then be added to a blank pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). You can also replace the existing
entity recognizer of a trained pipeline with
[`nlp.replace_pipe`](/api/language#replace_pipe).

Here's another example of a custom model, `your_custom_model`, that takes a list
@@ -20,10 +20,10 @@ menu:

spaCy projects let you manage and share **end-to-end spaCy workflows** for
different **use cases and domains**, and orchestrate training, packaging and
serving your custom pipelines. You can start off by cloning a pre-defined
project template, adjust it to fit your needs, load in your data, train a
pipeline, export it as a Python package, upload your outputs to a remote storage
and share your results with your team. spaCy projects can be used via the new
[`spacy project`](/api/cli#project) command and we provide templates in our
[`projects`](https://github.com/explosion/projects) repo.
@@ -51,7 +51,7 @@ production.

<Grid narrow cols={3}>
<Integration title="DVC" logo="dvc" url="#dvc">Manage and version your data</Integration>
<Integration title="Prodigy" logo="prodigy" url="#prodigy">Create labelled training data</Integration>
<Integration title="Streamlit" logo="streamlit" url="#streamlit">Visualize and demo your pipelines</Integration>
<Integration title="FastAPI" logo="fastapi" url="#fastapi">Serve your models and host APIs</Integration>
<Integration title="Ray" logo="ray" url="#ray">Distributed and parallel training</Integration>
<Integration title="Weights & Biases" logo="wandb" url="#wandb">Track your experiments and results</Integration>
@@ -66,8 +66,8 @@ production.

The [`spacy project clone`](/api/cli#project-clone) command clones an existing
project template and copies the files to a local directory. You can then run the
project, e.g. to train a pipeline and edit the commands and scripts to build
fully custom workflows.

```cli
python -m spacy project clone some_example_project
```
@@ -162,12 +162,12 @@ script).
> ```

Workflows are series of commands that are run in order and often depend on each
other. For instance, to generate a pipeline package, you might start by
converting your data, then run [`spacy train`](/api/cli#train) to train your
pipeline on the converted data and if that's successful, run
[`spacy package`](/api/cli#package) to turn the best trained artifact into an
installable Python package. The following command runs the workflow named `all`
defined in the `project.yml`, and executes the commands it specifies, in order:

```cli
$ python -m spacy project run all
```
@@ -191,11 +191,11 @@ project as a DVC repo.
> local: '/mnt/scratch/cache'
> ```

After training a pipeline, you can optionally use the
[`spacy project push`](/api/cli#project-push) command to upload your outputs to
a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
you **export** your pipeline packages, **share** work with your team, or **cache
results** to avoid repeating work.

```cli
$ python -m spacy project push
```
@@ -214,8 +214,8 @@ docs on [remote storage](#remote).

The `project.yml` defines the assets a project depends on, like datasets and
pretrained weights, as well as a series of commands that can be run separately
or as a workflow – for instance, to preprocess the data, convert it to spaCy's
format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.

<!-- TODO: update with better (final) example -->
|
||||||
|
|
||||||
Each command defined in the `project.yml` can optionally define a list of
|
Each command defined in the `project.yml` can optionally define a list of
|
||||||
dependencies and outputs. These are the files the command requires and creates.
|
dependencies and outputs. These are the files the command requires and creates.
|
||||||
For example, a command for training a model may depend on a
|
For example, a command for training a pipeline may depend on a
|
||||||
[`config.cfg`](/usage/training#config) and the training and evaluation data, and
|
[`config.cfg`](/usage/training#config) and the training and evaluation data, and
|
||||||
it will export a directory `model-best`, containing the best model, which you
|
it will export a directory `model-best`, which you can then re-use in other
|
||||||
can then re-use in other commands.
|
commands.
|
||||||
|
|
||||||
<!-- prettier-ignore -->
|
<!-- prettier-ignore -->
|
||||||
```yaml
|
```yaml
|
||||||
### project.yml
|
### project.yml
|
||||||
commands:
|
commands:
|
||||||
- name: train
|
- name: train
|
||||||
help: 'Train a spaCy model using the specified corpus and config'
|
help: 'Train a spaCy pipeline using the specified corpus and config'
|
||||||
script:
|
script:
|
||||||
- 'python -m spacy train ./configs/config.cfg -o training/ --paths.train ./corpus/training.spacy --paths.dev ./corpus/evaluation.spacy'
|
- 'python -m spacy train ./configs/config.cfg -o training/ --paths.train ./corpus/training.spacy --paths.dev ./corpus/evaluation.spacy'
|
||||||
deps:
|
deps:
|
||||||
|
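With a command like this defined, a usage sketch – you run it by name:

```cli
$ python -m spacy project run train
```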
@@ -392,14 +392,14 @@ directory:

```yaml
├── project.yml          # the project settings
├── project.lock         # lockfile that tracks inputs/outputs
├── assets/              # downloaded data assets
├── configs/             # pipeline config.cfg files used for training
├── corpus/              # output directory for training corpus
├── metas/               # pipeline meta.json templates used for packaging
├── metrics/             # output directory for evaluation metrics
├── notebooks/           # directory for Jupyter notebooks
├── packages/            # output directory for pipeline Python packages
├── scripts/             # directory for scripts, e.g. referenced in commands
├── training/            # output directory for trained pipelines
└── ...                  # any other files, like a requirements.txt etc.
```
@@ -426,7 +426,7 @@ report:

```yaml
### project.yml
commands:
  - name: test
    help: 'Test the trained pipeline'
    script:
      - 'pip install pytest pytest-html'
      - 'python -m pytest ./scripts/tests --html=metrics/test-report.html'
```
@@ -440,8 +440,8 @@ commands:

Adding `training/model-best` to the command's `deps` lets you ensure that the
file is available. If not, spaCy will show an error and the command won't run.
Setting `no_skip: true` means that the command will always run, even if the
dependencies (the trained pipeline) haven't changed. This makes sense here,
because you typically don't want to skip your tests.
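Putting those two settings together, a sketch of how the `test` command might
declare them (the surrounding keys mirror the truncated example above):

```yaml
  - name: test
    help: 'Test the trained pipeline'
    script:
      - 'python -m pytest ./scripts/tests'
    deps:
      - 'training/model-best'
    no_skip: true
```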
### Writing custom scripts {#custom-scripts}

@@ -554,7 +554,7 @@ notebooks with usage examples.

<Infobox title="Important note about assets" variant="warning">

It's typically not a good idea to check large data assets, trained pipelines or
other artifacts into a Git repo and you should exclude them from your project
template by adding a `.gitignore`. If you want to version your data and models,
check out [Data Version Control](#dvc) (DVC), which integrates with spaCy
@@ -566,7 +566,7 @@ projects.

You can persist your project outputs to a remote storage using the
[`project push`](/api/cli#project-push) command. This can help you **export**
your pipeline packages, **share** work with your team, or **cache results** to
avoid repeating work. The [`project pull`](/api/cli#project-pull) command will
download any outputs that are in the remote storage and aren't available
locally.
|
||||||
```yaml
|
```yaml
|
||||||
### project.yml
|
### project.yml
|
||||||
- name: train
|
- name: train
|
||||||
help: 'Train a spaCy model using the specified corpus and config'
|
help: 'Train a spaCy pipeline using the specified corpus and config'
|
||||||
script:
|
script:
|
||||||
- 'spacy train ./config.cfg --output training/'
|
- 'spacy train ./config.cfg --output training/'
|
||||||
deps:
|
deps:
|
||||||
|
@ -814,8 +814,8 @@ mattis pretium.
|
||||||
[Streamlit](https://streamlit.io) is a Python framework for building interactive
|
[Streamlit](https://streamlit.io) is a Python framework for building interactive
|
||||||
data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)
|
data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)
|
||||||
package helps you integrate spaCy visualizations into your Streamlit apps and
|
package helps you integrate spaCy visualizations into your Streamlit apps and
|
||||||
quickly spin up demos to explore your models interactively. It includes a full
|
quickly spin up demos to explore your pipelines interactively. It includes a
|
||||||
embedded visualizer, as well as individual components.
|
full embedded visualizer, as well as individual components.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ pip install spacy_streamlit
|
$ pip install spacy_streamlit
|
||||||
|
@ -829,11 +829,11 @@ $ pip install spacy_streamlit
|
||||||
|
|
||||||
Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your
|
Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your
|
||||||
projects can easily define their own scripts that spin up an interactive
|
projects can easily define their own scripts that spin up an interactive
|
||||||
visualizer, using the latest model you trained, or a selection of models so you
|
visualizer, using the latest pipeline you trained, or a selection of pipelines
|
||||||
can compare their results. The following script starts an
|
so you can compare their results. The following script starts an
|
||||||
[NER visualizer](/usage/visualizers#ent) and takes two positional command-line
|
[NER visualizer](/usage/visualizers#ent) and takes two positional command-line
|
||||||
argument you can pass in from your `config.yml`: a comma-separated list of model
|
argument you can pass in from your `config.yml`: a comma-separated list of paths
|
||||||
paths and an example text to use as the default text.
|
to load the pipelines from and an example text to use as the default text.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### scripts/visualize.py
|
### scripts/visualize.py
|
||||||
|
@ -841,8 +841,8 @@ import spacy_streamlit
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else ""
|
DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else ""
|
||||||
MODELS = [name.strip() for name in sys.argv[1].split(",")]
|
PIPELINES = [name.strip() for name in sys.argv[1].split(",")]
|
||||||
spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
|
spacy_streamlit.visualize(PIPELINES, DEFAULT_TEXT, visualizers=["ner"])
|
||||||
```
|
```
|
||||||
> #### Example usage

@@ -856,7 +856,7 @@ spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])

```yaml
### project.yml
commands:
  - name: visualize
    help: "Visualize the pipeline's output interactively using Streamlit"
    script:
      - 'streamlit run ./scripts/visualize.py ./training/model-best "I like Adidas shoes."'
    deps:
```
@@ -879,8 +879,8 @@ mattis pretium.

for building REST APIs with Python, based on Python
[type hints](https://fastapi.tiangolo.com/python-types/). It's become a popular
library for serving machine learning models and you can use it in your spaCy
projects to quickly serve up a trained pipeline and make it available behind a
REST API.

```python
# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.)
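# Until that lands, here's a minimal sketch – the endpoint shape and the
# pipeline path are illustrative assumptions, not spaCy's official example:
from fastapi import FastAPI
import spacy

app = FastAPI()
nlp = spacy.load("./training/model-best")  # load the pipeline once at startup

@app.get("/ents")
def get_ents(text: str):
    # Process the text and return entities as JSON-serializable data
    doc = nlp(text)
    return {"ents": [{"text": ent.text, "label": ent.label_} for ent in doc.ents]}
```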
@@ -897,7 +897,7 @@ API.

```yaml
### project.yml
commands:
  - name: serve
    help: "Serve the trained pipeline with FastAPI"
    script:
      - 'python ./scripts/serve.py ./training/model-best'
    deps:
```
@@ -759,7 +759,7 @@ whitespace, making them easy to match as well.

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()  # We only want the tokenizer, so no need to load a pipeline
matcher = Matcher(nlp.vocab)

pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
```
|
||||||
<Infobox title="Important note on creating patterns" variant="warning">
|
<Infobox title="Important note on creating patterns" variant="warning">
|
||||||
|
|
||||||
To create the patterns, each phrase has to be processed with the `nlp` object.
|
To create the patterns, each phrase has to be processed with the `nlp` object.
|
||||||
If you have a model loaded, doing this in a loop or list comprehension can
|
If you have a trained pipeline loaded, doing this in a loop or list
|
||||||
easily become inefficient and slow. If you **only need the tokenization and
|
comprehension can easily become inefficient and slow. If you **only need the
|
||||||
lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc)
|
tokenization and lexical attributes**, you can run
|
||||||
instead, which will only run the tokenizer. For an additional speed boost, you
|
[`nlp.make_doc`](/api/language#make_doc) instead, which will only run the
|
||||||
can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will
|
tokenizer. For an additional speed boost, you can also use the
|
||||||
process the texts as a stream.
|
[`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process the texts
|
||||||
|
as a stream.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- patterns = [nlp(term) for term in LOTS_OF_TERMS]
|
- patterns = [nlp(term) for term in LOTS_OF_TERMS]
|
||||||
|
@@ -977,7 +978,7 @@ of an advantage over writing one or two token patterns.

The [`EntityRuler`](/api/entityruler) is an exciting new component that lets you
add named entities based on pattern dictionaries, and makes it easy to combine
rule-based and statistical named entity recognition for even more powerful
pipelines.

### Entity Patterns {#entityruler-patterns}
```python
doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```

The entity ruler is designed to integrate with spaCy's existing pipeline
components and enhance the named entity recognizer. If it's added **before the
`"ner"` component**, the entity recognizer will respect the existing entity
spans and adjust its predictions around it. This can significantly improve
accuracy in some cases. If it's added **after the `"ner"` component**, the
@@ -1111,20 +1112,20 @@ versa.

</Infobox>

When you save out an `nlp` object that has an `EntityRuler` added to its
pipeline, its patterns are automatically exported to the pipeline directory:

```python
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.to_disk("/path/to/pipeline")
```
The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
all pipeline components will be restored and deserialized – including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!

### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||||
|
|
||||||
Running the full language pipeline across every pattern in a large list scales
|
Running the full language pipeline across every pattern in a large list scales
|
||||||
linearly and can therefore take a long time on large amounts of phrase patterns.
|
linearly and can therefore take a long time on large amounts of phrase patterns.
|
||||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
|
As of spaCy v2.2.4 the `add_patterns` function has been refactored to use
|
||||||
nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with
|
nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with
|
||||||
5,000-100,000 phrase patterns respectively. Even with this speedup (but
|
5,000-100,000 phrase patterns respectively. Even with this speedup (but
|
||||||
especially if you're using an older version) the `add_patterns` function can
|
especially if you're using an older version) the `add_patterns` function can
|
||||||
|
@@ -1168,7 +1169,7 @@ order to implement more abstract logic.

### Example: Expanding named entities {#models-rules-ner}

When using a trained
[named entity recognition](/usage/linguistic-features/#named-entities) model to
extract information from your texts, you may find that the predicted span only
includes parts of the entity you're looking for. Sometimes, this happens if
@@ -1178,15 +1179,15 @@ what you need for your application.

> #### Where corpora come from
>
> Corpora used to train pipelines from scratch are often produced in academia.
> They contain text from various sources with linguistic features labeled
> manually by human annotators (following a set of specific guidelines). The
> corpora are then distributed with evaluation data, so other researchers can
> benchmark their algorithms and everyone can report numbers on the same data.
> However, most applications need to learn information that isn't contained in
> any available corpus.
For example, the corpus spaCy's [English pipelines](/models/en) were trained on
defines a `PERSON` entity as just the **person name**, without titles like "Mr."
or "Dr.". This makes sense, because it makes it easier to resolve the entity
type back to a knowledge base. But what if your application needs the full
@@ -4,7 +4,7 @@ menu:
  - ['Basics', 'basics']
  - ['Serialization Methods', 'serialization-methods']
  - ['Entry Points', 'entry-points']
  - ['Trained Pipelines', 'models']
---

## Basics {#basics hidden="true"}
@@ -25,10 +25,10 @@ can load in the data.
 > #### Saving the meta and config
 >
 > The [`nlp.meta`](/api/language#meta) attribute is a JSON-serializable
-> dictionary and contains all model meta information like the author and license
-> information. The [`nlp.config`](/api/language#config) attribute is a
+> dictionary and contains all pipeline meta information like the author and
+> license information. The [`nlp.config`](/api/language#config) attribute is a
 > dictionary containing the training configuration, pipeline component factories
-> and other settings. It is saved out with a model as the `config.cfg`.
+> and other settings. It is saved out with a pipeline as the `config.cfg`.

 ```python
 ### Serialize
@@ -45,12 +45,11 @@ for pipe_name in pipeline:
 nlp.from_bytes(bytes_data)
 ```

-This is also how spaCy does it under the hood when loading a model: it loads the
-model's `config.cfg` containing the language and pipeline information,
-initializes the language class, creates and adds the pipeline components based
-on the defined
-[factories](/usage/processing-pipeline#custom-components-factories) and _then_
-loads in the binary data. You can read more about this process
+This is also how spaCy does it under the hood when loading a pipeline: it loads
+the `config.cfg` containing the language and pipeline information, initializes
+the language class, creates and adds the pipeline components based on the
+defined [factories](/usage/processing-pipeline#custom-components-factories) and
+_then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).

 ### Serializing Doc objects efficiently {#docs new="2.2"}
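The bytes round-trip described in this passage boils down to a few lines. A minimal sketch, assuming a blank English pipeline (any `nlp` object works the same way):

```python
import spacy

nlp = spacy.blank("en")
bytes_data = nlp.to_bytes()   # serialize the whole pipeline to a bytestring

nlp2 = spacy.blank("en")      # same language class and components
nlp2.from_bytes(bytes_data)   # restore the binary data into it
```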
@@ -168,10 +167,10 @@ data = pickle.dumps(span_doc)
 ## Implementing serialization methods {#serialization-methods}

 When you call [`nlp.to_disk`](/api/language#to_disk),
-[`nlp.from_disk`](/api/language#from_disk) or load a model package, spaCy will
-iterate over the components in the pipeline, check if they expose a `to_disk` or
-`from_disk` method and if so, call it with the path to the model directory plus
-the string name of the component. For example, if you're calling
+[`nlp.from_disk`](/api/language#from_disk) or load a pipeline package, spaCy
+will iterate over the components in the pipeline, check if they expose a
+`to_disk` or `from_disk` method and if so, call it with the path to the pipeline
+directory plus the string name of the component. For example, if you're calling
 `nlp.to_disk("/path")`, the data for the named entity recognizer will be saved
 in `/path/ner`.
@@ -191,8 +190,8 @@ add to that data and saves and loads the data to and from a JSON file.
 > [source](https://github.com/explosion/spaCy/tree/master/spacy/pipeline/entityruler.py).
 > Patterns added to the component will be saved to a `.jsonl` file if the
 > pipeline is serialized to disk, and to a bytestring if the pipeline is
-> serialized to bytes. This allows saving out a model with a rule-based entity
-> recognizer and including all rules _with_ the model data.
+> serialized to bytes. This allows saving out a pipeline with a rule-based
+> entity recognizer and including all rules _with_ the component data.

 ```python
 ### {highlight="14-18,20-25"}
@@ -232,7 +231,7 @@ component's `to_disk` method.
 nlp = spacy.load("en_core_web_sm")
 my_component = nlp.add_pipe("my_component")
 my_component.add({"hello": "world"})
-nlp.to_disk("/path/to/model")
+nlp.to_disk("/path/to/pipeline")
 ```

 The contents of the directory would then look like this.
@@ -241,15 +240,15 @@ file `data.json` in its subdirectory:

 ```yaml
 ### Directory structure {highlight="2-3"}
-└── /path/to/model
+└── /path/to/pipeline
     ├── my_component     # data serialized by "my_component"
     │   └── data.json
     ├── ner              # data for "ner" component
     ├── parser           # data for "parser" component
     ├── tagger           # data for "tagger" component
-    ├── vocab            # model vocabulary
-    ├── meta.json        # model meta.json
-    ├── config.cfg       # model config
+    ├── vocab            # pipeline vocabulary
+    ├── meta.json        # pipeline meta.json
+    ├── config.cfg       # pipeline config
     └── tokenizer        # tokenization rules
 ```
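As a side note on the `EntityRuler` behavior called out a few hunks up, the pattern round-trip is easy to try. A minimal sketch (the path is illustrative):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])
# Saving the pipeline also writes the patterns out as .jsonl in the
# component's subdirectory, so they travel with the pipeline data.
nlp.to_disk("/path/to/pipeline")
```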
@@ -258,18 +257,19 @@ When you load the data back in, spaCy will call the custom component's
 contents of `data.json`, convert them to a Python object and restore the
 component state. The same works for other types of data, of course – for
 instance, you could add a
-[wrapper for a model](/usage/processing-pipelines#wrapping-models-libraries)
-trained with a different library like TensorFlow or PyTorch and make spaCy load
-its weights automatically when you load the model package.
+[wrapper for a model](/usage/layers-architectures#frameworks) trained with a
+different library like TensorFlow or PyTorch and make spaCy load its weights
+automatically when you load the pipeline package.

 <Infobox title="Important note on loading custom components" variant="warning">

-When you load back a model with custom components, make sure that the components
-are **available** and that the [`@Language.component`](/api/language#component)
-or [`@Language.factory`](/api/language#factory) decorators are executed _before_
-your model is loaded back. Otherwise, spaCy won't know how to resolve the string
-name of a component factory like `"my_component"` back to a function. For more
-details, see the documentation on
+When you load back a pipeline with custom components, make sure that the
+components are **available** and that the
+[`@Language.component`](/api/language#component) or
+[`@Language.factory`](/api/language#factory) decorators are executed _before_
+your pipeline is loaded back. Otherwise, spaCy won't know how to resolve the
+string name of a component factory like `"my_component"` back to a function. For
+more details, see the documentation on
 [adding factories](/usage/processing-pipelines#custom-components-factories) or
 use [entry points](#entry-points) to make your extension package expose your
 custom components to spaCy automatically.
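The warning above is easiest to see in code. A minimal sketch of the safe loading order (the component name and path are made up):

```python
import spacy
from spacy.language import Language

@Language.component("my_component")  # executed before the pipeline is loaded
def my_component(doc):
    return doc

# Now spaCy can resolve the string "my_component" from the pipeline's config
nlp = spacy.load("/path/to/pipeline")
```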
@@ -297,18 +297,19 @@ installed in the same environment – that's it.
 ### Custom components via entry points {#entry-points-components}

-When you load a model, spaCy will generally use the model's `config.cfg` to set
-up the language class and construct the pipeline. The pipeline is specified as a
+When you load a pipeline, spaCy will generally use its `config.cfg` to set up
+the language class and construct the pipeline. The pipeline is specified as a
 list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of those
 strings, spaCy will call `nlp.add_pipe` and look up the name in all factories
 defined by the decorators [`@Language.component`](/api/language#component) and
 [`@Language.factory`](/api/language#factory). This means that you have to import
-your custom components _before_ loading the model.
+your custom components _before_ loading the pipeline.

-Using entry points, model packages and extension packages can define their own
-`"spacy_factories"`, which will be loaded automatically in the background when
-the `Language` class is initialized. So if a user has your package installed,
-they'll be able to use your components – even if they **don't import them**!
+Using entry points, pipeline packages and extension packages can define their
+own `"spacy_factories"`, which will be loaded automatically in the background
+when the `Language` class is initialized. So if a user has your package
+installed, they'll be able to use your components – even if they **don't import
+them**!

 To stick with the theme of
 [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
@@ -343,10 +344,10 @@ def snek_component(doc):

 Since it's a very complex and sophisticated module, you want to split it off
 into its own package so you can version it and upload it to PyPi. You also want
-your custom model to be able to define `pipeline = ["snek"]` in its
+your custom package to be able to define `pipeline = ["snek"]` in its
 `config.cfg`. For that, you need to be able to tell spaCy where to find the
 component `"snek"`. If you don't do this, spaCy will raise an error when you try
-to load the model because there's no built-in `"snek"` component. To add an
+to load the pipeline because there's no built-in `"snek"` component. To add an
 entry to the factories, you can now expose it in your `setup.py` via the
 `entry_points` dictionary:
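The `entry_points` block that the text refers to could look roughly like this – a sketch for the hypothetical `snek` package; `"spacy_factories"` is the group named in the section above:

```python
from setuptools import setup

setup(
    name="snek",
    entry_points={
        # "name registered with spaCy = module:object"
        "spacy_factories": ["snek = snek:snek_component"]
    },
)
```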
@@ -380,7 +381,7 @@ $ python setup.py develop
 spaCy is now able to create the pipeline component `"snek"` – even though you
 never imported `snek_component`. When you save the
 [`nlp.config`](/api/language#config) to disk, it includes an entry for your
-`"snek"` component and any model you train with this config will include the
+`"snek"` component and any pipeline you train with this config will include the
 component and know how to load it – if your `snek` package is installed.

 > #### config.cfg (excerpt)
@@ -449,9 +450,9 @@ entry_points={

 The factory can also implement other pipeline component methods like `to_disk` and
 `from_disk` for serialization, or even `update` to make the component trainable.
-If a component exposes a `from_disk` method and is included in a model's
-pipeline, spaCy will call it on load. This lets you ship custom data with your
-model. When you save out a model using `nlp.to_disk` and the component exposes a
+If a component exposes a `from_disk` method and is included in a pipeline, spaCy
+will call it on load. This lets you ship custom data with your pipeline package.
+When you save out a pipeline using `nlp.to_disk` and the component exposes a
 `to_disk` method, it will be called with the disk path.

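The truncated excerpt below shows the `from_disk` side. For symmetry, a hedged sketch of what the matching `to_disk` method could look like on such a component (`self.snek` is an illustrative data attribute):

```python
def to_disk(self, path, exclude=tuple()):
    # called by nlp.to_disk with the component's data path
    snek_path = path / "snek.txt"
    with snek_path.open("w", encoding="utf8") as snek_file:
        snek_file.write(self.snek)
```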
 ```python
@@ -467,8 +468,8 @@ def from_disk(self, path, exclude=tuple()):
     return self
 ```

-The above example will serialize the current snake in a `snek.txt` in the model
-data directory. When a model using the `snek` component is loaded, it will open
+The above example will serialize the current snake in a `snek.txt` in the data
+directory. When a pipeline using the `snek` component is loaded, it will open
 the `snek.txt` and make it available to the component.

 ### Custom language classes via entry points {#entry-points-languages}
@@ -476,7 +477,7 @@ the `snek.txt` and make it available to the component.
 To stay with the theme of the previous example and
 [this blog post on entry points](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
 let's imagine you wanted to implement your own `SnekLanguage` class for your
-custom model – but you don't necessarily want to modify spaCy's code to add a
+custom pipeline – but you don't necessarily want to modify spaCy's code to add a
 language. In your package, you could then implement the following
 [custom language subclass](/usage/linguistic-features#language-subclass):
@@ -510,10 +511,10 @@ setup(
 ```

 In spaCy, you can then load the custom `snk` language and it will be resolved to
-`SnekLanguage` via the custom entry point. This is especially relevant for model
-packages you train, which could then specify `lang = snk` in their `config.cfg`
-without spaCy raising an error because the language is not available in the core
-library.
+`SnekLanguage` via the custom entry point. This is especially relevant for
+pipeline packages you [train](/usage/training), which could then specify
+`lang = snk` in their `config.cfg` without spaCy raising an error because the
+language is not available in the core library.

 ### Custom displaCy colors via entry points {#entry-points-displacy new="2.2"}
@@ -526,7 +527,7 @@ values.

 > #### Domain-specific NER labels
 >
-> Good examples of models with domain-specific label schemes are
+> Good examples of pipelines with domain-specific label schemes are
 > [scispaCy](/universe/project/scispacy) and
 > [Blackstone](/universe/project/blackstone).
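For context on the section edited here: custom colors are exposed the same way as factories, via an entry point group. A hedged sketch, with the `"spacy_displacy_colors"` group matching the section heading and the package, label and color purely illustrative:

```python
# snek.py – a dict mapping entity labels to color values
displacy_colors = {"SNEK": "#3dff74"}

# setup.py (excerpt) – expose the dict so displaCy can pick it up automatically:
# entry_points={
#     "spacy_displacy_colors": ["colors = snek:displacy_colors"]
# }
```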
@@ -559,24 +560,23 @@ import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html'

 <Iframe title="displaCy visualization of entities" html={DisplaCyEntSnekHtml} height={100} />

-## Saving, loading and distributing models {#models}
+## Saving, loading and distributing trained pipelines {#models}

-After training your model, you'll usually want to save its state, and load it
+After training your pipeline, you'll usually want to save its state, and load it
 back later. You can do this with the [`Language.to_disk`](/api/language#to_disk)
 method:

 ```python
-nlp.to_disk("./en_example_model")
+nlp.to_disk("./en_example_pipeline")
 ```

 The directory will be created if it doesn't exist, and the whole pipeline data,
-model meta and model configuration will be written out. To make the model more
-convenient to deploy, we recommend wrapping it as a
-[Python package](/api/cli#package).
+meta and configuration will be written out. To make the pipeline more convenient
+to deploy, we recommend wrapping it as a [Python package](/api/cli#package).

 <Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">

-When you save a model in spaCy v3.0+, two files will be exported: a
+When you save a pipeline in spaCy v3.0+, two files will be exported: a
 [`config.cfg`](/api/data-formats#config) based on
 [`nlp.config`](/api/language#config) and a [`meta.json`](/api/data-formats#meta)
 based on [`nlp.meta`](/api/language#meta).
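A quick way to see the difference in practice (assuming `en_core_web_sm` is installed); both attributes are plain dictionaries:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.meta["author"], nlp.meta["license"])  # packaging / documentation info
print(nlp.config["nlp"]["pipeline"])            # functional training config
```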
@@ -587,42 +587,42 @@ based on [`nlp.meta`](/api/language#meta).
 [pipeline components](/usage/processing-pipelines#custom-components) or
 [model architectures](/api/architectures). Given a config, spaCy is able to
 reconstruct the whole tree of objects and the `nlp` object. An exported config
-can also be used to [train a model](/usage/training#conig) with the same
+can also be used to [train a pipeline](/usage/training#config) with the same
 settings.
-- **meta**: Meta information about the model and the Python package, such as the
-  author information, license, version, data sources and label scheme. This is
-  mostly used for documentation purposes and for packaging models. It has no
-  impact on the functionality of the `nlp` object.
+- **meta**: Meta information about the pipeline and the Python package, such as
+  the author information, license, version, data sources and label scheme. This
+  is mostly used for documentation purposes and for packaging pipelines. It has
+  no impact on the functionality of the `nlp` object.

 </Accordion>

-### Generating a model package {#models-generating}
+### Generating a pipeline package {#models-generating}

 <Infobox title="Important note" variant="warning">

-The model packages are **not suitable** for the public
+Pipeline packages are typically **not suitable** for the public
 [pypi.python.org](https://pypi.python.org) directory, which is not designed for
 binary data and files over 50 MB. However, if your company is running an
-**internal installation** of PyPi, publishing your models on there can be a
-convenient way to share them with your team.
+**internal installation** of PyPi, publishing your pipeline packages on there
+can be a convenient way to share them with your team.

 </Infobox>

 spaCy comes with a handy CLI command that will create all required files, and
-walk you through generating the meta data. You can also create the meta.json
-manually and place it in the model data directory, or supply a path to it using
-the `--meta` flag. For more info on this, see the [`package`](/api/cli#package)
+walk you through generating the meta data. You can also create the `meta.json`
+manually and place it in the data directory, or supply a path to it using the
+`--meta` flag. For more info on this, see the [`package`](/api/cli#package)
 docs.

 > #### meta.json (example)
 >
 > ```json
 > {
->   "name": "example_model",
+>   "name": "example_pipeline",
 >   "lang": "en",
 >   "version": "1.0.0",
 >   "spacy_version": ">=2.0.0,<3.0.0",
->   "description": "Example model for spaCy",
+>   "description": "Example pipeline for spaCy",
 >   "author": "You",
 >   "email": "you@example.com",
 >   "license": "CC BY-SA 3.0"
@@ -630,27 +630,27 @@ docs.
 > ```

 ```cli
-$ python -m spacy package ./en_example_model ./my_models
+$ python -m spacy package ./en_example_pipeline ./my_pipelines
 ```

-This command will create a model package directory and will run
+This command will create a pipeline package directory and will run
 `python setup.py sdist` in that directory to create a `.tar.gz` archive of your
-model package that can be installed using `pip install`.
+package that can be installed using `pip install`.

 ```yaml
 ### Directory structure
 └── /
     ├── MANIFEST.in                           # to include meta.json
-    ├── meta.json                             # model meta data
+    ├── meta.json                             # pipeline meta data
     ├── setup.py                              # setup file for pip installation
-    ├── en_example_model                      # model directory
+    ├── en_example_pipeline                   # pipeline directory
     │   ├── __init__.py                       # init for pip installation
-    │   └── en_example_model-1.0.0            # model data
-    │       ├── config.cfg                    # model config
-    │       ├── meta.json                     # model meta
+    │   └── en_example_pipeline-1.0.0         # pipeline data
+    │       ├── config.cfg                    # pipeline config
+    │       ├── meta.json                     # pipeline meta
     │       └── ...                           # directories with component data
     └── dist
-        └── en_example_model-1.0.0.tar.gz     # installable package
+        └── en_example_pipeline-1.0.0.tar.gz  # installable package
 ```

 You can also find templates for all files in the
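Once built, the archive behaves like any other Python package. A short sketch of the downstream workflow – the names follow the example above and are illustrative:

```python
# after: pip install dist/en_example_pipeline-1.0.0.tar.gz
import spacy

nlp = spacy.load("en_example_pipeline")  # load by installed package name
doc = nlp("This is a sentence.")
```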
@@ -659,16 +659,15 @@ If you're creating the package manually, keep in mind that the directories need
 to be named according to the naming conventions of `lang_name` and
 `lang_name-version`.

-### Customizing the model setup {#models-custom}
+### Customizing the package setup {#models-custom}

-The `load()` method that comes with our model package templates will take care
-of putting all this together and returning a `Language` object with the loaded
-pipeline and data. If your model requires custom
-[pipeline components](/usage/processing-pipelines) or a custom language class,
-you can also **ship the code with your model** and include it in the
-`__init__.py` – for example, to register custom
-[pipeline components](/usage/processing-pipelines#custom-components) before the
-`nlp` object is created.
+The `load()` method that comes with our pipeline package templates will take
+care of putting all this together and returning a `Language` object with the
+loaded pipeline and data. If your pipeline requires
+[custom components](/usage/processing-pipelines#custom-components) or a custom
+language class, you can also **ship the code with your package** and include it
+in the `__init__.py` – for example, to register custom components before the
+`nlp` object is created.

 <Infobox variant="warning" title="Important note on making manual edits">
@@ -682,16 +681,16 @@ spaCy to export the current state of its `nlp` objects via

 </Infobox>

-### Loading a custom model package {#loading}
+### Loading a custom pipeline package {#loading}

-To load a model from a data directory, you can use
+To load a pipeline from a data directory, you can use
 [`spacy.load()`](/api/top-level#spacy.load) with the local path. This will look
 for a `config.cfg` in the directory and use the `lang` and `pipeline` settings
 to initialize a `Language` class with a processing pipeline and load in the
 model data.

 ```python
-nlp = spacy.load("/path/to/model")
+nlp = spacy.load("/path/to/pipeline")
 ```

 If you want to **load only the binary data**, you'll have to create a `Language`
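The truncated sentence above continues in the full document; the gist of the binary-only case is to construct the `Language` class yourself first and then read the data into it. A hedged sketch, assuming the directory was saved from a pipeline of the same language and components:

```python
import spacy

nlp = spacy.blank("en")             # bare Language class, no trained data
nlp.from_disk("/path/to/pipeline")  # then load the binary data into it
```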

@@ -130,14 +130,15 @@ related to more general machine learning functionality.
 ### Statistical models {#statistical-models}

 While some of spaCy's features work independently, others require
-[ statistical models](/models) to be loaded, which enable spaCy to **predict**
-linguistic annotations – for example, whether a word is a verb or a noun. spaCy
-currently offers statistical models for a variety of languages, which can be
-installed as individual Python modules. Models can differ in size, speed, memory
-usage, accuracy and the data they include. The model you choose always depends
-on your use case and the texts you're working with. For a general-purpose use
-case, the small, default models are always a good start. They typically include
-the following components:
+[trained pipelines](/models) to be loaded, which enable spaCy to **predict**
+linguistic annotations – for example, whether a word is a verb or a noun. A
+trained pipeline can consist of multiple components that use a statistical model
+trained on labeled data. spaCy currently offers trained pipelines for a variety
+of languages, which can be installed as individual Python modules. Pipeline
+packages can differ in size, speed, memory usage, accuracy and the data they
+include. The package you choose always depends on your use case and the texts
+you're working with. For a general-purpose use case, the small, default packages
+are always a good start. They typically include the following components:

 - **Binary weights** for the part-of-speech tagger, dependency parser and named
   entity recognizer to predict those annotations in context.
@@ -146,8 +147,9 @@ the following components:
 - **Data files** like lemmatization rules and lookup tables.
 - **Word vectors**, i.e. multi-dimensional meaning representations of words that
   let you determine how similar they are to each other.
-- **Configuration** options, like the language and processing pipeline settings,
-  to put spaCy in the correct state when you load in the model.
+- **Configuration** options, like the language and processing pipeline settings
+  and model implementations to use, to put spaCy in the correct state when you
+  load the pipeline.

 ## Linguistic annotations {#annotations}
@@ -158,7 +160,7 @@ analyzing text, it makes a huge difference whether a noun is the subject of a
 sentence, or the object – or whether "google" is used as a verb, or refers to
 the website or company in a specific context.

-> #### Loading models
+> #### Loading pipelines
 >
 > ```cli
 > $ python -m spacy download en_core_web_sm
@@ -167,11 +169,11 @@ the website or company in a specific context.
 > >>> nlp = spacy.load("en_core_web_sm")
 > ```

-Once you've [downloaded and installed](/usage/models) a model, you can load it
-via [`spacy.load()`](/api/top-level#spacy.load). This will return a `Language`
-object containing all components and data needed to process text. We usually
-call it `nlp`. Calling the `nlp` object on a string of text will return a
-processed `Doc`:
+Once you've [downloaded and installed](/usage/models) a trained pipeline, you
+can load it via [`spacy.load`](/api/top-level#spacy.load). This will return a
+`Language` object containing all components and data needed to process text. We
+usually call it `nlp`. Calling the `nlp` object on a string of text will return
+a processed `Doc`:

 ```python
 ### {executable="true"}
@@ -233,7 +235,7 @@ To learn more about entity recognition in spaCy, how to **add your own
 entities** to a document and how to **train and update** the entity predictions
 of a model, see the usage guides on
 [named entity recognition](/usage/linguistic-features#named-entities) and
-[training the named entity recognizer](/usage/training#ner).
+[training pipelines](/usage/training).

 </Infobox>
@@ -346,7 +348,7 @@ The mapping of words to hashes doesn't depend on any state. To make sure each
 value is unique, spaCy uses a
 [hash function](https://en.wikipedia.org/wiki/Hash_function) to calculate the
 hash **based on the word string**. This also means that the hash for "coffee"
-will always be the same, no matter which model you're using or how you've
+will always be the same, no matter which pipeline you're using or how you've
 configured spaCy.

 However, hashes **cannot be reversed** and there's no way to resolve
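A tiny demonstration of the claim – a sketch that works even with a blank pipeline, because the hash is computed from the string itself:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I love coffee")                 # processing the text adds "coffee"
coffee_hash = nlp.vocab.strings["coffee"]  # string -> hash
assert nlp.vocab.strings[coffee_hash] == "coffee"  # hash -> string
```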
@@ -391,7 +393,7 @@ import Serialization101 from 'usage/101/\_serialization.md'

 <Infobox title="Saving and loading" emoji="📖">

-To learn more about how to **save and load your own models**, see the usage
+To learn more about how to **save and load your own pipelines**, see the usage
 guide on [saving and loading](/usage/saving-loading#models).

 </Infobox>
@@ -402,9 +404,9 @@ import Training101 from 'usage/101/\_training.md'

 <Training101 />

-<Infobox title="Training statistical models" emoji="📖">
+<Infobox title="Training pipelines and models" emoji="📖">

-To learn more about **training and updating** models, how to create training
+To learn more about **training and updating** pipelines, how to create training
 data and how to improve spaCy's named entity recognition models, see the usage
 guides on [training](/usage/training).
@@ -454,8 +456,8 @@ via the following platforms:
   practices**.
 - [GitHub issue tracker](https://github.com/explosion/spaCy/issues): **Bug
   reports** and **improvement suggestions**, i.e. everything that's likely
-  spaCy's fault. This also includes problems with the models beyond statistical
-  imprecisions, like patterns that point to a bug.
+  spaCy's fault. This also includes problems with the trained pipelines beyond
+  statistical imprecisions, like patterns that point to a bug.

 <Infobox title="Important note" variant="warning">
@@ -484,10 +486,10 @@ Another way of getting involved is to help us improve the
 happen to speak one of the languages currently in
 [alpha support](/usage/models#languages). Even adding simple tokenizer
 exceptions, stop words or lemmatizer data can make a big difference. It will
-also make it easier for us to provide a statistical model for the language in
-the future. Submitting a test that documents a bug or performance issue, or
-covers functionality that's especially important for your application is also
-very helpful. This way, you'll also make sure we never accidentally introduce
+also make it easier for us to provide a trained pipeline for the language in the
+future. Submitting a test that documents a bug or performance issue, or covers
+functionality that's especially important for your application is also very
+helpful. This way, you'll also make sure we never accidentally introduce
 regressions to the parts of the library that you care about the most.

 **For more details on the types of contributions we're looking for, the code

@@ -1,5 +1,6 @@
 ---
-title: Training Models
+title: Training Pipelines & Models
+teaser: Train and update components on your own data and integrate custom models
 next: /usage/layers-architectures
 menu:
   - ['Introduction', 'basics']
@@ -10,7 +11,7 @@ menu:
   - ['Internal API', 'api']
 ---

-## Introduction to training models {#basics hidden="true"}
+## Introduction to training {#basics hidden="true"}

 import Training101 from 'usage/101/\_training.md'
@@ -25,13 +26,13 @@ new, active learning-powered annotation tool we've developed. Prodigy is fast
 and extensible, and comes with a modern **web application** that helps you
 collect training data faster. It integrates seamlessly with spaCy, pre-selects
 the **most relevant examples** for annotation, and lets you train and evaluate
-ready-to-use spaCy models.
+ready-to-use spaCy pipelines.

 </Infobox>

 ## Quickstart {#quickstart tag="new"}

-The recommended way to train your spaCy models is via the
+The recommended way to train your spaCy pipelines is via the
 [`spacy train`](/api/cli#train) command on the command line. It only needs a
 single [`config.cfg`](#config) **configuration file** that includes all settings
 and hyperparameters. You can optionally [overwrite](#config-overrides) settings
@@ -94,9 +95,9 @@ $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy
 ## Training config {#config}

 Training config files include all **settings and hyperparameters** for training
-your model. Instead of providing lots of arguments on the command line, you only
-need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under
-the hood, the training config uses the
+your pipeline. Instead of providing lots of arguments on the command line, you
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+Under the hood, the training config uses the
 [configuration system](https://thinc.ai/docs/usage-config) provided by our
 machine learning library [Thinc](https://thinc.ai). This also makes it easy to
 integrate custom models and architectures, written in your framework of choice.
@@ -178,27 +179,26 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy --paths.de
 ```

 Only existing sections and values in the config can be overwritten. At the end
-of the training, the final filled `config.cfg` is exported with your model, so
-you'll always have a record of the settings that were used, including your
+of the training, the final filled `config.cfg` is exported with your pipeline,
+so you'll always have a record of the settings that were used, including your
 overrides. Overrides are added before [variables](#config-interpolation) are
 resolved, by the way – so if you need to use a value in multiple places,
 reference it across your config and override it on the CLI once.

 ### Defining pipeline components {#config-components}

-When you train a model, you typically train a
-[pipeline](/usage/processing-pipelines) of **one or more components**. The
-`[components]` block in the config defines the available pipeline components and
-how they should be created – either by a built-in or custom
-[factory](/usage/processing-pipelines#built-in), or
+You typically train a [pipeline](/usage/processing-pipelines) of **one or more
+components**. The `[components]` block in the config defines the available
+pipeline components and how they should be created – either by a built-in or
+custom [factory](/usage/processing-pipelines#built-in), or
 [sourced](/usage/processing-pipelines#sourced-components) from an existing
-pretrained model. For example, `[components.parser]` defines the component named
+trained pipeline. For example, `[components.parser]` defines the component named
 `"parser"` in the pipeline. There are different ways you might want to treat
 your components during training, and the most common scenarios are:

 1. Train a **new component** from scratch on your data.
-2. Update an existing **pretrained component** with more examples.
-3. Include an existing pretrained component without updating it.
+2. Update an existing **trained component** with more examples.
+3. Include an existing trained component without updating it.
 4. Include a non-trainable component, like a rule-based
    [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a
    fully [custom component](/usage/processing-pipelines#custom-components).
@@ -209,16 +209,16 @@ If a component block defines a `factory`, spaCy will look it up in the
 new component from scratch. All settings defined in the config block will be
 passed to the component factory as arguments. This lets you configure the model
 settings and hyperparameters. If a component block defines a `source`, the
-component will be copied over from an existing pretrained model, with its
+component will be copied over from an existing trained pipeline, with its
 existing weights. This lets you include an already trained component in your
-model pipeline, or update a pretrained component with more data specific to your
-use case.
+pipeline, or update a trained component with more data specific to your use
+case.

 ```ini
 ### config.cfg (excerpt)
 [components]

-# "parser" and "ner" are sourced from a pretrained model
+# "parser" and "ner" are sourced from a trained pipeline
 [components.parser]
 source = "en_core_web_sm"
@@ -243,7 +243,7 @@ weights and [resume training](/api/language#resume_training).

 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
-**not updated** during training and are included in the final trained model
+**not updated** during training and are included in the final trained pipeline
 as-is.

 > #### Note on frozen components
@@ -252,8 +252,8 @@ as-is.
 > still **run** during training and evaluation. This is very important, because
 > they may still impact your model's performance – for instance, a sentence
 > boundary detector can impact what the parser or entity recognizer considers a
-> valid parse. So the evaluation results should always reflect what your model
-> will produce at runtime.
+> valid parse. So the evaluation results should always reflect what your
+> pipeline will produce at runtime.

 ```ini
 [nlp]
@@ -398,11 +398,11 @@ different tasks. For example:

 ### Metrics, training output and weighted scores {#metrics}

-When you train a model using the [`spacy train`](/api/cli#train) command, you'll
-see a table showing the metrics after each pass over the data. The available
-metrics **depend on the pipeline components**. Pipeline components also define
-which scores are shown and how they should be **weighted in the final score**
-that decides about the best model.
+When you train a pipeline using the [`spacy train`](/api/cli#train) command,
+you'll see a table showing the metrics after each pass over the data. The
+available metrics **depend on the pipeline components**. Pipeline components
+also define which scores are shown and how they should be **weighted in the
+final score** that decides about the best model.

 The `training.score_weights` setting in your `config.cfg` lets you customize the
 scores shown in the table and how they should be weighted. In this example, the
@@ -415,8 +415,8 @@ score.
 >
 > At the end of your training process, you typically want to select the **best
 > model** – but what "best" means depends on the available components and your
-> specific use case. For instance, you may prefer a model with higher NER and
-> lower POS tagging accuracy over a model with lower NER and higher POS
+> specific use case. For instance, you may prefer a pipeline with higher NER and
+> lower POS tagging accuracy over a pipeline with lower NER and higher POS
 > accuracy. You can express this preference in the score weights, e.g. by
 > assigning `ents_f` (NER F-score) a higher weight.
@@ -488,8 +488,8 @@ The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
 `--code` that points to a Python file. The file is imported before training and
 allows you to add custom functions and architectures to the function registry
 that can then be referenced from your `config.cfg`. This lets you train spaCy
-models with custom components, without having to re-implement the whole training
-workflow.
+pipelines with custom components, without having to re-implement the whole
+training workflow.

 #### Example: Modifying the nlp object {#custom-code-nlp-callbacks}
|
@ -837,11 +837,11 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
<Infobox variant="warning">
|
<Infobox variant="warning">
|
||||||
|
|
||||||
spaCy gives you full control over the training loop. However, for most use
|
spaCy gives you full control over the training loop. However, for most use
|
||||||
cases, it's recommended to train your models via the
|
cases, it's recommended to train your pipelines via the
|
||||||
[`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep
|
[`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep
|
||||||
track of your settings and hyperparameters, instead of writing your own training
|
track of your settings and hyperparameters, instead of writing your own training
|
||||||
scripts from scratch. [Custom registered functions](#custom-code) should
|
scripts from scratch. [Custom registered functions](#custom-code) should
|
||||||
typically give you everything you need to train fully custom models with
|
typically give you everything you need to train fully custom pipelines with
|
||||||
[`spacy train`](/api/cli#train).
|
[`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@@ -874,8 +874,8 @@ their assigned part-of-speech tags.
 > #### About the tag map
 >
 > The tag map is part of the vocabulary and defines the annotation scheme. If
-> you're training a new language model, this will let you map the tags present
-> in the treebank you train on to spaCy's tag scheme:
+> you're training a new pipeline, this will let you map the tags present in the
+> treebank you train on to spaCy's tag scheme:
 >
 > ```python
 > tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
@@ -924,15 +924,16 @@ it harder for the model to memorize the training data. For example, a `0.25`
 dropout means that each feature or internal representation has a 1/4 likelihood
 of being dropped.

-> - [`nlp`](/api/language): The `nlp` object with the model.
+> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
+>   their models.
 > - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the model's weights.
+>   return an optimizer to update the component model weights.
 > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
 >   state between updates.
-> - [`nlp.update`](/api/language#update): Update model with examples.
+> - [`nlp.update`](/api/language#update): Update component models with examples.
 > - [`Example`](/api/example): object holding predictions and gold-standard
 >   annotations.
-> - [`nlp.to_disk`](/api/language#to_disk): Save the updated model to a
+> - [`nlp.to_disk`](/api/language#to_disk): Save the updated pipeline to a
 >   directory.

 ```python
@@ -944,7 +945,7 @@ for itn in range(100):
         doc = nlp.make_doc(raw_text)
         example = Example.from_dict(doc, {"entities": entity_offsets})
         nlp.update([example], sgd=optimizer)
-nlp.to_disk("/model")
+nlp.to_disk("/output")
 ```

 The [`nlp.update`](/api/language#update) method takes the following arguments:
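The hunk above only shows the tail of the docs' training-loop example. For orientation, a self-contained sketch of the surrounding setup might look like this – `train_data` and the component setup are illustrative, and `nlp.begin_training` is the v3.0 API named in the list above:

```python
import random
import spacy
from spacy.training import Example

train_data = [
    ("Uber blew through $1 million a week", [(0, 4, "ORG")]),
]

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")

optimizer = nlp.begin_training()
for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        nlp.update([example], sgd=optimizer)
nlp.to_disk("/output")
```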

@@ -42,7 +42,7 @@ menu:

 <Infobox title="Details & Documentation" emoji="📖" list>

-- **Usage:** [Training models](/usage/training)
+- **Usage:** [Training pipelines and models](/usage/training)
 - **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config),
   [`Config`](https://thinc.ai/docs/api-config#config)
 - **CLI:** [`train`](/api/cli#train), [`pretrain`](/api/cli#pretrain),
@@ -59,14 +59,14 @@ menu:
 <Infobox title="Details & Documentation" emoji="📖" list>

 - **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers),
-  [Training models](/usage/training)
+  [Training pipelines and models](/usage/training)
 - **API:** [`Transformer`](/api/transformer),
   [`TransformerData`](/api/transformer#transformerdata),
   [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
 - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
   [TransformerListener](/api/architectures#TransformerListener),
   [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
-- **Models:** [`en_core_trf_lg_sm`](/models/en)
+- **Trained Pipelines:** [`en_core_trf_lg_sm`](/models/en)
 - **Implementation:**
   [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
@ -76,8 +76,7 @@ menu:
|
||||||
|
|
||||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||||
|
|
||||||
<!-- TODO: link to new custom models page -->
|
- **Usage: ** [Layers and architectures](/usage/layers-architectures)
|
||||||
|
|
||||||
- **Thinc: **
|
- **Thinc: **
|
||||||
[Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks)
|
[Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks)
|
||||||
- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe)
|
- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe)
|
||||||
|
@ -102,10 +101,10 @@ menu:
|
||||||
|
|
||||||
spaCy projects let you manage and share **end-to-end spaCy workflows** for
|
spaCy projects let you manage and share **end-to-end spaCy workflows** for
|
||||||
different **use cases and domains**, and orchestrate training, packaging and
|
different **use cases and domains**, and orchestrate training, packaging and
|
||||||
serving your custom models. You can start off by cloning a pre-defined project
|
serving your custom pipelines. You can start off by cloning a pre-defined
|
||||||
template, adjust it to fit your needs, load in your data, train a model, export
|
project template, adjust it to fit your needs, load in your data, train a
|
||||||
it as a Python package, upload your outputs to a remote storage and share your
|
pipeline, export it as a Python package, upload your outputs to a remote storage
|
||||||
results with your team.
|
and share your results with your team.
|
||||||
|
|
||||||
![Illustration of project workflow and commands](../images/projects.svg)
|
![Illustration of project workflow and commands](../images/projects.svg)
|
||||||
|
|
||||||
|
@ -121,14 +120,14 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
|
||||||
|
|
||||||
The easiest way to get started with an end-to-end training process is to clone a
|
The easiest way to get started with an end-to-end training process is to clone a
|
||||||
[project](/usage/projects) template. Projects let you manage multi-step
|
[project](/usage/projects) template. Projects let you manage multi-step
|
||||||
workflows, from data preprocessing to training and packaging your model.
|
workflows, from data preprocessing to training and packaging your pipeline.
|
||||||
|
|
||||||
</Project>-->
|
</Project>-->
|
||||||
|
|
||||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||||
|
|
||||||
- **Usage:** [spaCy projects](/usage/projects),
|
- **Usage:** [spaCy projects](/usage/projects),
|
||||||
[Training models](/usage/training)
|
[Training pipelines and models](/usage/training)
|
||||||
- **CLI:** [`project`](/api/cli#project), [`train`](/api/cli#train)
|
- **CLI:** [`project`](/api/cli#project), [`train`](/api/cli#train)
|
||||||
- **Templates:** [`projects`](https://github.com/explosion/projects)
|
- **Templates:** [`projects`](https://github.com/explosion/projects)
|
||||||
|
|
||||||
|
@ -183,7 +182,7 @@ now easier and more convenient. The `@Language.component` and
|
||||||
`@Language.factory` decorators let you register your component, define its
|
`@Language.factory` decorators let you register your component, define its
|
||||||
default configuration and meta data, like the attribute values it assigns and
|
default configuration and meta data, like the attribute values it assigns and
|
||||||
requires. Any custom component can be included during training, and sourcing
|
requires. Any custom component can be included during training, and sourcing
|
||||||
components from existing pretrained models lets you **mix and match custom
|
components from existing trained pipelines lets you **mix and match custom
|
||||||
pipelines**. The `nlp.analyze_pipes` method outputs structured information about
|
pipelines**. The `nlp.analyze_pipes` method outputs structured information about
|
||||||
the current pipeline and its components, including the attributes they assign,
|
the current pipeline and its components, including the attributes they assign,
|
||||||
the scores they compute during training and whether any required attributes
|
the scores they compute during training and whether any required attributes
|
||||||
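Note: the `nlp.analyze_pipes` call referenced above can be tried directly (a minimal sketch; the component selection is arbitrary):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
# Prints a human-readable overview of the components, the attributes they
# assign and require, and the scores they produce; also returns a dict.
analysis = nlp.analyze_pipes(pretty=True)
```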
@@ -257,7 +256,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
 | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
-| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
+| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
 | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
 | [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class.s |
 | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |

@@ -266,8 +265,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
 | [`Pipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. |
 | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
-| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
-| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all models installed in the environment. |
+| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
+| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
 | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
 

@@ -280,9 +279,9 @@ The following methods, attributes and commands are new in spaCy v3.0.
 To help you get started with spaCy v3.0 and the new features, we've added
 several new or rewritten documentation pages, including a new usage guide on
 [embeddings, transformers and transfer learning](/usage/embeddings-transformers),
-a guide on [training models](/usage/training) rewritten from scratch, a page
-explaining the new [spaCy projects](/usage/projects) and updated usage
-documentation on
+a guide on [training pipelines and models](/usage/training) rewritten from
+scratch, a page explaining the new [spaCy projects](/usage/projects) and updated
+usage documentation on
 [custom pipeline components](/usage/processing-pipelines#custom-components).
 We've also added a bunch of new illustrations and new API reference pages
 documenting spaCy's machine learning [model architectures](/api/architectures)

@@ -335,15 +334,15 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 
 ### API changes {#incompat-api}
 
-- Model symlinks, the `link` command and shortcut names are now deprecated.
-  There can be many [different models](/models) and not just one "English
-  model", so you should always use the full model name like
+- Pipeline package symlinks, the `link` command and shortcut names are now
+  deprecated. There can be many [different trained pipelines](/models) and not
+  just one "English model", so you should always use the full package name like
   [`en_core_web_sm`](/models/en) explicitly.
-- A model's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the model name, author, license and labels. It's **not**
-  used to construct the processing pipeline anymore. This is all defined in the
-  [`config.cfg`](/api/data-formats#config), which also includes all settings
-  used to train the model.
+- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
+  meta information like the package name, author, license and labels. It's
+  **not** used to construct the processing pipeline anymore. This is all defined
+  in the [`config.cfg`](/api/data-formats#config), which also includes all
+  settings used to train the pipeline.
 - The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
   only take a `config.cfg` file containing the full
   [training config](/usage/training#config).
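Note: the updated `util` helpers in the table above can be used roughly as follows (a sketch; the paths are hypothetical and must point at an exported pipeline directory):

```python
from spacy.util import load_config, load_meta

config = load_config("/path/to/pipeline/config.cfg")  # full training/runtime config
meta = load_meta("/path/to/pipeline/meta.json")       # name, author, license etc.
print(config["nlp"]["pipeline"])
print(meta["name"])
```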
@@ -390,10 +389,10 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
 | `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
-| `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
+| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
 | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
-| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
+| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated |
 
 The following deprecated methods, attributes and arguments were removed in v3.0.
 Most of them have been **deprecated for a while** and many would previously

@@ -414,12 +413,13 @@ on them.
 
 ## Migrating from v2.x {#migrating}
 
-### Downloading and loading models {#migrating-downloading-models}
+### Downloading and loading trained pipelines {#migrating-downloading-models}
 
-Model symlinks and shortcuts like `en` are now officially deprecated. There are
-[many different models](/models) with different capabilities and not just one
-"English model". In order to download and load a model, you should always use
-its full name – for instance, [`en_core_web_sm`](/models/en#en_core_web_sm).
+Symlinks and shortcuts like `en` are now officially deprecated. There are
+[many different trained pipelines](/models) with different capabilities and not
+just one "English model". In order to download and load a package, you should
+always use its full name – for instance,
+[`en_core_web_sm`](/models/en#en_core_web_sm).
 
 ```diff
 - python -m spacy download en
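Note: in code, the non-deprecated loading styles look like this (a sketch; the directory path is a placeholder and the package must be installed first):

```python
import spacy

# Load an installed package by its full name ...
nlp = spacy.load("en_core_web_sm")
# ... or load a pipeline directly from a directory path.
nlp = spacy.load("/path/to/en_core_web_sm")
```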
@@ -522,12 +522,12 @@ and you typically shouldn't have to use it in your code.
 + parser = nlp.add_pipe("parser")
 ```
 
-If you need to add a component from an existing pretrained model, you can now
+If you need to add a component from an existing trained pipeline, you can now
 use the `source` argument on [`nlp.add_pipe`](/api/language#add_pipe). This will
 check that the component is compatible, and take care of porting over all
-config. During training, you can also reference existing pretrained components
-in your [config](/usage/training#config-components) and decide whether or not
-they should be updated with more data.
+config. During training, you can also reference existing trained components in
+your [config](/usage/training#config-components) and decide whether or not they
+should be updated with more data.
 
 > #### config.cfg (excerpt)
 >
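Note: sourcing a component as described above might look like this (a minimal sketch, assuming `en_core_web_sm` is installed):

```python
import spacy

source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
# Copy the trained named entity recognizer, including its weights.
nlp.add_pipe("ner", source=source_nlp)
print(nlp.pipe_names)  # ['ner']
```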
@@ -599,13 +599,13 @@ nlp = spacy.blank("en")
 + ruler.load_from_tag_map(YOUR_TAG_MAP)
 ```
 
-### Training models {#migrating-training}
+### Training pipelines and models {#migrating-training}
 
-To train your models, you should now pretty much always use the
+To train your pipelines, you should now pretty much always use the
 [`spacy train`](/api/cli#train) CLI. You shouldn't have to put together your own
 training scripts anymore, unless you _really_ want to. The training commands now
 use a [flexible config file](/usage/training#config) that describes all training
-settings and hyperparameters, as well as your pipeline, model components and
+settings and hyperparameters, as well as your pipeline, components and
 architectures to use. The `--code` argument lets you pass in code containing
 [custom registered functions](/usage/training#custom-code) that you can
 reference in your config. To get started, check out the

@@ -616,7 +616,7 @@ reference in your config. To get started, check out the
 spaCy v3.0 uses a new
 [binary training data format](/api/data-formats#binary-training) created by
 serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
-objects. This means that you can train spaCy models using the same format it
+objects. This means that you can train spaCy pipelines using the same format it
 outputs: annotated `Doc` objects. The binary format is extremely **efficient in
 storage**, especially when packing multiple documents together. You can convert
 your existing JSON-formatted data using the [`spacy convert`](/api/cli#convert)
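Note: producing such a binary `.spacy` file from annotated `Doc` objects might look like this (a sketch with a single toy annotation):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin()

doc = nlp.make_doc("Google rebrands its business apps")
# Mark "Google" (characters 0-6) as an ORG entity.
doc.ents = [doc.char_span(0, 6, label="ORG")]
doc_bin.add(doc)

doc_bin.to_disk("./train.spacy")  # ready to be used by `spacy train`
```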
@@ -655,7 +655,7 @@ values. You can then use the auto-generated `config.cfg` for training:
 
 The easiest way to get started with an end-to-end training process is to clone a
 [project](/usage/projects) template. Projects let you manage multi-step
-workflows, from data preprocessing to training and packaging your model.
+workflows, from data preprocessing to training and packaging your pipeline.
 
 </Project>
 

@@ -728,7 +728,7 @@ setting up the label scheme.
 + nlp.begin_training(lambda: examples)
 ```
 
-#### Packaging models {#migrating-training-packaging}
+#### Packaging trained pipelines {#migrating-training-packaging}
 
 The [`spacy package`](/api/cli#package) command now automatically builds the
 installable `.tar.gz` sdist of the Python package, so you don't have to run this

@@ -736,8 +736,8 @@ step manually anymore. You can disable the behavior by setting the `--no-sdist`
 flag.
 
 ```diff
-python -m spacy package ./model ./packages
-- cd /output/en_model-0.0.0
+python -m spacy package ./output ./packages
+- cd /output/en_pipeline-0.0.0
 - python setup.py sdist
 ```
 

@@ -23,10 +23,10 @@ The quickest way to visualize `Doc` is to use
 [`displacy.serve`](/api/top-level#displacy.serve). This will spin up a simple
 web server and let you view the result straight from your browser. displaCy can
 either take a single `Doc` or a list of `Doc` objects as its first argument.
-This lets you construct them however you like – using any model or modifications
-you like. If you're using [Streamlit](https://streamlit.io), check out the
-[`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) package that
-helps you integrate spaCy visualizations into your apps!
+This lets you construct them however you like – using any pipeline or
+modifications you like. If you're using [Streamlit](https://streamlit.io), check
+out the [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)
+package that helps you integrate spaCy visualizations into your apps!
 
 ## Visualizing the dependency parse {#dep}
 
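Note: passing multiple docs works like this (a sketch, assuming `en_core_web_sm` is installed; `displacy.serve` starts a local web server):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
texts = ["This is a sentence.", "Each doc is rendered separately."]
docs = list(nlp.pipe(texts))
displacy.serve(docs, style="dep")
```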
@@ -131,8 +131,8 @@ example, you can choose to display `PERSON` entities. Internally, the visualizer
 knows nothing about available entity types and will render whichever spans and
 labels it receives. This makes it especially easy to work with custom entity
 types. By default, displaCy comes with colors for all entity types used by
-[spaCy models](/models). If you're using custom entity types, you can use the
-`colors` setting to add your own colors for them.
+[trained spaCy pipelines](/models). If you're using custom entity types, you can
+use the `colors` setting to add your own colors for them.
 
 > #### Options example
 >
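Note: the `colors` setting mentioned above might be used like this (a sketch; the label name and color value are placeholders):

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("ACME launched a new product")
doc.ents = [Span(doc, 0, 1, label="COMPANY")]  # custom entity type

options = {"ents": ["COMPANY"], "colors": {"COMPANY": "#ffcc00"}}
displacy.serve(doc, style="ent", options=options)
```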
@@ -176,7 +176,7 @@ visualizations will be included as HTML.
 
 ```python
 ### Jupyter example
-# Don't forget to install a model, e.g.: python -m spacy download en
+# Don't forget to install a trained pipeline, e.g.: python -m spacy download en
 
 # In[1]:
 import spacy

@@ -50,11 +50,11 @@
         "items": [{ "text": "Overview", "url": "/models" }]
       },
       {
-        "label": "Core Models",
+        "label": "Trained Pipelines",
         "items": []
       },
      {
-        "label": "Starter Models",
+        "label": "Starter Packages",
         "items": []
       }
     ]

@@ -23,7 +23,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) {
         )
     }
     if (variant === 'model') {
-        const tooltipText = `To use this functionality, spaCy needs a model to be installed that supports the following capabilities: ${children}`
+        const tooltipText = `To use this functionality, spaCy needs a trained pipeline that supports the following capabilities: ${children}`
         return (
             <TagTemplate spaced={spaced} tooltip={tooltipText}>
                 Needs model

@@ -37,14 +37,15 @@ const MODEL_META = {
     ents_r: 'Entities (recall)',
     cpu: 'words per second on CPU',
     gpu: 'words per second on GPU',
-    pipeline: 'Processing pipeline components in order',
+    pipeline: 'Active processing pipeline components in order',
+    components: 'All processing pipeline components (including disabled components)',
     sources: 'Sources of training data',
     vecs:
-        'Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.',
+        'Word vectors included in the package. Packages that only support context vectors compute similarity via the tensors shared with the pipeline.',
     benchmark_parser: 'Syntax accuracy',
     benchmark_ner: 'NER accuracy',
     benchmark_speed: 'Speed',
-    compat: 'Latest compatible model version for your spaCy installation',
+    compat: 'Latest compatible package version for your spaCy installation',
 }
 
 const LABEL_SCHEME_META = {

@@ -178,6 +179,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
         { label: 'Type', tag: type, content: MODEL_META[type] },
         { label: 'Genre', tag: genre, content: MODEL_META[genre] },
         { label: 'Size', tag: size, content: meta.sizeFull },
+        { label: 'Components', content: components, help: MODEL_META.components },
         { label: 'Pipeline', content: pipeline, help: MODEL_META.pipeline },
         { label: 'Vectors', content: meta.vectors, help: MODEL_META.vecs },
         { label: 'Sources', content: sources, help: MODEL_META.sources },

@@ -355,7 +357,7 @@ const Models = ({ pageContext, repo, children }) => {
     }, [initialized, baseUrl])
 
     const modelTitle = title
-    const modelTeaser = `Available pretrained statistical models for ${title}`
+    const modelTeaser = `Available trained pipelines for ${title}`
 
     const starterTitle = `${title} starters`
     const starterTeaser = `Available transfer learning starter packs for ${title}`

@@ -43,7 +43,7 @@ const DATA = [
     {
         id: 'transformers',
         title: 'Transformers',
-        help: 'Use transformers like BERT to train your spaCy models',
+        help: 'Use transformers like BERT to train your spaCy pipelines',
     },
     {
         id: 'lookups',

@@ -63,7 +63,7 @@ const QuickstartInstall = ({ id, title }) => (
         ...DATA,
         {
             id: 'models',
-            title: 'Models',
+            title: 'Trained Pipelines',
             multiple: true,
             options: models.map(({ code, name }) => ({ id: code, title: name })),
         },

@@ -15,13 +15,13 @@ const data = [
     {
         id: 'spacy',
         title: 'Use spacy.load()',
-        help: "Use spaCy's built-in loader to load the model by name.",
+        help: "Use spaCy's built-in loader to load the package by name",
         checked: true,
     },
     {
         id: 'module',
         title: 'Import as module',
-        help: 'Import the model explicitly as a Python module.',
+        help: 'Import the package explicitly as a Python module',
     },
 ],
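Note: the two loading options offered by this quickstart widget correspond to the following Python (a sketch, assuming the `en_core_web_sm` package is installed):

```python
import spacy

# Option 1: use spaCy's built-in loader with the full package name.
nlp = spacy.load("en_core_web_sm")

# Option 2: import the installed package as a Python module.
import en_core_web_sm
nlp = en_core_web_sm.load()
```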