Mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' into feature/dependency-matcher-v3
Commit: b927893309
@@ -36,7 +36,7 @@ max_length = 0
 limit = 0

 [training.batcher]
-@batchers = "batch_by_words.v1"
+@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2


@@ -35,7 +35,7 @@ max_length = 0
 limit = 0

 [training.batcher]
-@batchers = "batch_by_words.v1"
+@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2


@@ -24,7 +24,7 @@ redirects = [
 {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
 {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
 {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
-{from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
+{from = "/docs/usage/training-ner", to = "/usage/training", force = true},
 {from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
 {from = "/docs/usage/data-model", to = "/api", force = true},
 {from = "/docs/usage/cli", to = "/api/cli", force = true},
@@ -29,9 +29,9 @@ from .project.document import project_document # noqa: F401

 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 def link(*args, **kwargs):
-"""As of spaCy v3.0, model symlinks are deprecated. You can load models
-using their full names or from a directory path."""
+"""As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
+pipeline packages using their full names or from a directory path."""
 msg.warn(
-"As of spaCy v3.0, model symlinks are deprecated. You can load models "
-"using their full names or from a directory path."
+"As of spaCy v3.0, model symlinks are deprecated. You can load trained "
+"pipeline packages using their full names or from a directory path."
 )
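The new wording points at the two remaining ways to load a pipeline: by installed package name or by directory path. As a rough illustration (the en_core_web_sm package and the local path are placeholders, not part of this commit):

$ python -m spacy download en_core_web_sm
$ python -c "import spacy; nlp = spacy.load('en_core_web_sm')"
$ python -c "import spacy; nlp = spacy.load('./path/to/pipeline')"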
@@ -25,7 +25,7 @@ COMMAND = "python -m spacy"
 NAME = "spacy"
 HELP = """spaCy Command-line Interface

-DOCS: https://spacy.io/api/cli
+DOCS: https://nightly.spacy.io/api/cli
 """
 PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
 You'd typically start by cloning a project template to a local directory and

@@ -36,7 +36,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
-INIT_HELP = """Commands for initializing configs and models."""
+INIT_HELP = """Commands for initializing configs and pipeline packages."""

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -44,7 +44,7 @@ def convert_cli(
 file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
 n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
 seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
-model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
+model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
 morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
 merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
 converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),

@@ -61,6 +61,8 @@ def convert_cli(
 If no output_dir is specified and the output format is JSON, the data
 is written to stdout, so you can pipe them forward to a JSON file:
 $ spacy convert some_file.conllu --file-type json > some_file.json
+
+DOCS: https://nightly.spacy.io/api/cli#convert
 """
 if isinstance(file_type, FileTypes):
 # We get an instance of the FileTypes from the CLI so we need its string value

@@ -261,6 +263,6 @@ def _get_converter(msg, converter, input_path):
 msg.warn(
 "Can't automatically detect NER format. "
 "Conversion may not succeed. "
-"See https://spacy.io/api/cli#convert"
+"See https://nightly.spacy.io/api/cli#convert"
 )
 return converter
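For reference, a sketch of a convert call that uses the renamed --base option together with -c ner and --seg-sents, as the updated help strings describe (the file paths and the en_core_web_sm base package are illustrative assumptions):

$ python -m spacy convert ./annotations.iob ./corpus -c ner --seg-sents --base en_core_web_sm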
@@ -31,6 +31,8 @@ def debug_config_cli(
 Similar as with the 'train' command, you can override settings from the config
 as command line options. For instance, --training.batch_size 128 overrides
 the value of "batch_size" in the block "[training]".
+
+DOCS: https://nightly.spacy.io/api/cli#debug-config
 """
 overrides = parse_config_overrides(ctx.args)
 import_code(code_path)
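The override syntax mentioned in the docstring maps dotted config sections to values on the command line, roughly like this (the config path is a placeholder):

$ python -m spacy debug config ./config.cfg --training.batch_size 128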
@@ -18,7 +18,7 @@ from .. import util
 NEW_LABEL_THRESHOLD = 50
 # Minimum number of expected occurrences of dependency labels
 DEP_LABEL_THRESHOLD = 20
-# Minimum number of expected examples to train a blank model
+# Minimum number of expected examples to train a new pipeline
 BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000


@@ -47,6 +47,8 @@ def debug_data_cli(
 Analyze, debug and validate your training and development data. Outputs
 useful stats, and can help you find problems like invalid entity annotations,
 cyclic dependencies, low data labels and more.
+
+DOCS: https://nightly.spacy.io/api/cli#debug-data
 """
 if ctx.command.name == "debug-data":
 msg.warn(
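A typical invocation, assuming the corpora referenced by the config live at the paths shown (both paths are placeholders, and the override syntax mirrors the train command):

$ python -m spacy debug data ./config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy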
@@ -148,7 +150,7 @@ def debug_data(
 msg.text(f"Language: {config['nlp']['lang']}")
 msg.text(f"Training pipeline: {', '.join(pipeline)}")
 if resume_components:
-msg.text(f"Components from other models: {', '.join(resume_components)}")
+msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
 if frozen_components:
 msg.text(f"Frozen components: {', '.join(frozen_components)}")
 msg.text(f"{len(train_dataset)} training docs")

@@ -164,9 +166,7 @@ def debug_data(
 # TODO: make this feedback more fine-grained and report on updated
 # components vs. blank components
 if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
-text = (
-f"Low number of examples to train from a blank model ({len(train_dataset)})"
-)
+text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
 if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
 msg.fail(text)
 else:

@@ -214,7 +214,7 @@ def debug_data(
 show=verbose,
 )
 else:
-msg.info("No word vectors present in the model")
+msg.info("No word vectors present in the package")

 if "ner" in factory_names:
 # Get all unique NER labels present in the data

@@ -30,6 +30,8 @@ def debug_model_cli(
 """
 Analyze a Thinc model implementation. Includes checks for internal structure
 and activations during training.
+
+DOCS: https://nightly.spacy.io/api/cli#debug-model
 """
 if use_gpu >= 0:
 msg.info("Using GPU")
@@ -17,16 +17,19 @@ from ..errors import OLD_MODEL_SHORTCUTS
 def download_cli(
 # fmt: off
 ctx: typer.Context,
-model: str = Arg(..., help="Name of model to download"),
+model: str = Arg(..., help="Name of pipeline package to download"),
 direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
 # fmt: on
 ):
 """
-Download compatible model from default download path using pip. If --direct
-flag is set, the command expects the full model name with version.
-For direct downloads, the compatibility check will be skipped. All
+Download compatible trained pipeline from the default download path using
+pip. If --direct flag is set, the command expects the full package name with
+version. For direct downloads, the compatibility check will be skipped. All
 additional arguments provided to this command will be passed to `pip install`
-on model installation.
+on package installation.
+
+DOCS: https://nightly.spacy.io/api/cli#download
+AVAILABLE PACKAGES: https://spacy.io/models
 """
 download(model, direct, *ctx.args)


@@ -34,11 +37,11 @@ def download_cli(
 def download(model: str, direct: bool = False, *pip_args) -> None:
 if not is_package("spacy") and "--no-deps" not in pip_args:
 msg.warn(
-"Skipping model package dependencies and setting `--no-deps`. "
+"Skipping pipeline package dependencies and setting `--no-deps`. "
 "You don't seem to have the spaCy package itself installed "
 "(maybe because you've built from source?), so installing the "
-"model dependencies would cause spaCy to be downloaded, which "
-"probably isn't what you want. If the model package has other "
+"package dependencies would cause spaCy to be downloaded, which "
+"probably isn't what you want. If the pipeline package has other "
 "dependencies, you'll have to install them manually."
 )
 pip_args = pip_args + ("--no-deps",)

@@ -53,7 +56,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
 if model in OLD_MODEL_SHORTCUTS:
 msg.warn(
 f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
-f"use the full model name '{OLD_MODEL_SHORTCUTS[model]}' instead."
+f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
 )
 model_name = OLD_MODEL_SHORTCUTS[model]
 compatibility = get_compatibility()

@@ -61,7 +64,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
 download_model(dl_tpl.format(m=model_name, v=version), pip_args)
 msg.good(
 "Download and installation successful",
-f"You can now load the model via spacy.load('{model_name}')",
+f"You can now load the package via spacy.load('{model_name}')",
 )

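In practice the two download modes described in the docstring look roughly like this (the package name and version are illustrative; --direct skips the compatibility check):

$ python -m spacy download en_core_web_sm
$ python -m spacy download en_core_web_sm-3.0.0 --direct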
@@ -71,16 +74,16 @@ def get_compatibility() -> dict:
 if r.status_code != 200:
 msg.fail(
 f"Server error ({r.status_code})",
-f"Couldn't fetch compatibility table. Please find a model for your spaCy "
+f"Couldn't fetch compatibility table. Please find a package for your spaCy "
 f"installation (v{about.__version__}), and download it manually. "
 f"For more details, see the documentation: "
-f"https://spacy.io/usage/models",
+f"https://nightly.spacy.io/usage/models",
 exits=1,
 )
 comp_table = r.json()
 comp = comp_table["spacy"]
 if version not in comp:
-msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
+msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
 return comp[version]


@@ -88,7 +91,7 @@ def get_version(model: str, comp: dict) -> str:
 model = get_base_version(model)
 if model not in comp:
 msg.fail(
-f"No compatible model found for '{model}' (spaCy v{about.__version__})",
+f"No compatible package found for '{model}' (spaCy v{about.__version__})",
 exits=1,
 )
 return comp[model][0]
@@ -26,13 +26,16 @@ def evaluate_cli(
 # fmt: on
 ):
 """
-Evaluate a model. Expects a loadable spaCy model and evaluation data in the
-binary .spacy format. The --gold-preproc option sets up the evaluation
-examples with gold-standard sentences and tokens for the predictions. Gold
-preprocessing helps the annotations align to the tokenization, and may
-result in sequences of more consistent length. However, it may reduce
-runtime accuracy due to train/test skew. To render a sample of dependency
-parses in a HTML file, set as output directory as the displacy_path argument.
+Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
+data in the binary .spacy format. The --gold-preproc option sets up the
+evaluation examples with gold-standard sentences and tokens for the
+predictions. Gold preprocessing helps the annotations align to the
+tokenization, and may result in sequences of more consistent length. However,
+it may reduce runtime accuracy due to train/test skew. To render a sample of
+dependency parses in a HTML file, set as output directory as the
+displacy_path argument.
+
+DOCS: https://nightly.spacy.io/api/cli#evaluate
 """
 evaluate(
 model,
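A sketch of an evaluation run with gold preprocessing and a rendered displaCy sample, as the docstring describes (the pipeline and corpus paths are placeholders, and the --displacy-path spelling is assumed from the displacy_path argument):

$ python -m spacy evaluate ./training/model-best ./corpus/dev.spacy --gold-preproc --displacy-path ./parses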
@@ -12,15 +12,17 @@ from .. import about
 @app.command("info")
 def info_cli(
 # fmt: off
-model: Optional[str] = Arg(None, help="Optional model name"),
+model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
 markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
 silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
 # fmt: on
 ):
 """
-Print info about spaCy installation. If a model is speficied as an argument,
-print model information. Flag --markdown prints details in Markdown for easy
+Print info about spaCy installation. If a pipeline is speficied as an argument,
+print its meta information. Flag --markdown prints details in Markdown for easy
 copy-pasting to GitHub issues.
+
+DOCS: https://nightly.spacy.io/api/cli#info
 """
 info(model, markdown=markdown, silent=silent)


@@ -30,14 +32,16 @@ def info(
 ) -> Union[str, dict]:
 msg = Printer(no_print=silent, pretty=not silent)
 if model:
-title = f"Info about model '{model}'"
+title = f"Info about pipeline '{model}'"
 data = info_model(model, silent=silent)
 else:
 title = "Info about spaCy"
 data = info_spacy()
 raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
-if "Models" in data and isinstance(data["Models"], dict):
-data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
+if "Pipelines" in data and isinstance(data["Pipelines"], dict):
+data["Pipelines"] = ", ".join(
+f"{n} ({v})" for n, v in data["Pipelines"].items()
+)
 markdown_data = get_markdown(data, title=title)
 if markdown:
 if not silent:

@@ -63,7 +67,7 @@ def info_spacy() -> Dict[str, any]:
 "Location": str(Path(__file__).parent.parent),
 "Platform": platform.platform(),
 "Python version": platform.python_version(),
-"Models": all_models,
+"Pipelines": all_models,
 }


@@ -81,7 +85,7 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
 model_path = model
 meta_path = model_path / "meta.json"
 if not meta_path.is_file():
-msg.fail("Can't find model meta.json", meta_path, exits=1)
+msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
 meta = srsly.read_json(meta_path)
 if model_path.resolve() != model_path:
 meta["source"] = str(model_path.resolve())
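Both call styles still work after the rename, with or without a package argument (en_core_web_sm is an illustrative package name):

$ python -m spacy info
$ python -m spacy info en_core_web_sm --markdown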
@@ -27,7 +27,7 @@ def init_config_cli(
 # fmt: off
 output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
 lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
-pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
+pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
 optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
 cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
 # fmt: on

@@ -37,6 +37,8 @@ def init_config_cli(
 specified via the CLI arguments, this command generates a config with the
 optimal settings for you use case. This includes the choice of architecture,
 pretrained weights and related hyperparameters.
+
+DOCS: https://nightly.spacy.io/api/cli#init-config
 """
 if isinstance(optimize, Optimizations): # instance of enum from the CLI
 optimize = optimize.value

@@ -59,6 +61,8 @@ def init_fill_config_cli(
 functions for their default values and update the base config. This command
 can be used with a config generated via the training quickstart widget:
 https://nightly.spacy.io/usage/training#quickstart
+
+DOCS: https://nightly.spacy.io/api/cli#init-fill-config
 """
 fill_config(output_file, base_path, pretraining=pretraining, diff=diff)


@@ -168,7 +172,7 @@ def save_config(
 output_file.parent.mkdir(parents=True)
 config.to_disk(output_file, interpolate=False)
 msg.good("Saved config", output_file)
-msg.text("You can now add your data and train your model:")
+msg.text("You can now add your data and train your pipeline:")
 variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
 if not no_print:
 print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
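Taken together, the hint printed by save_config corresponds to a workflow along these lines (the config file name is a placeholder; the --paths overrides match the variables shown above):

$ python -m spacy init config config.cfg --lang en --pipeline tagger,parser,ner --optimize efficiency
$ python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy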
@@ -28,7 +28,7 @@ except ImportError:
 DEFAULT_OOV_PROB = -20


-@init_cli.command("model")
+@init_cli.command("vocab")
 @app.command(
 "init-model",
 context_settings={"allow_extra_args": True, "ignore_unknown_options": True},

@@ -37,8 +37,8 @@ DEFAULT_OOV_PROB = -20
 def init_model_cli(
 # fmt: off
 ctx: typer.Context, # This is only used to read additional arguments
-lang: str = Arg(..., help="Model language"),
-output_dir: Path = Arg(..., help="Model output directory"),
+lang: str = Arg(..., help="Pipeline language"),
+output_dir: Path = Arg(..., help="Pipeline output directory"),
 freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
 clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
 jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),

@@ -46,19 +46,22 @@ def init_model_cli(
 prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
 truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
 vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
-base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
+model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
+base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
 # fmt: on
 ):
 """
-Create a new model from raw data. If vectors are provided in Word2Vec format,
-they can be either a .txt or zipped as a .zip or .tar.gz.
+Create a new blank pipeline directory with vocab and vectors from raw data.
+If vectors are provided in Word2Vec format, they can be either a .txt or
+zipped as a .zip or .tar.gz.
+
+DOCS: https://nightly.spacy.io/api/cli#init-vocab
 """
 if ctx.command.name == "init-model":
 msg.warn(
-"The init-model command is now available via the 'init model' "
-"subcommand (without the hyphen). You can run python -m spacy init "
-"--help for an overview of the other available initialization commands."
+"The init-model command is now called 'init vocab'. You can run "
+"'python -m spacy init --help' for an overview of the other "
+"available initialization commands."
 )
 init_model(
 lang,

@@ -115,10 +118,10 @@ def init_model(
 msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
 lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)

-with msg.loading("Creating model..."):
+with msg.loading("Creating blank pipeline..."):
 nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

-msg.good("Successfully created model")
+msg.good("Successfully created blank pipeline")
 if vectors_loc is not None:
 add_vectors(
 msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name

@@ -242,7 +245,8 @@ def add_vectors(
 if vectors_data is not None:
 nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
 if name is None:
-nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
+# TODO: Is this correct? Does this matter?
+nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
 else:
 nlp.vocab.vectors.name = name
 nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
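After the rename the command reads roughly as follows (the output directory, package meta name and base package are illustrative assumptions):

$ python -m spacy init vocab en ./blank_pipeline --meta-name my_pipeline --base en_core_web_sm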
@@ -14,23 +14,25 @@ from .. import about
 @app.command("package")
 def package_cli(
 # fmt: off
-input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
+input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
 output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
 meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
 create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
 version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
 no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
-force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
+force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
 # fmt: on
 ):
 """
-Generate an installable Python package for a model. Includes model data,
+Generate an installable Python package for a pipeline. Includes binary data,
 meta and required installation files. A new directory will be created in the
-specified output directory, and model data will be copied over. If
+specified output directory, and the data will be copied over. If
 --create-meta is set and a meta.json already exists in the output directory,
 the existing values will be used as the defaults in the command-line prompt.
 After packaging, "python setup.py sdist" is run in the package directory,
 which will create a .tar.gz archive that can be installed via "pip install".
+
+DOCS: https://nightly.spacy.io/api/cli#package
 """
 package(
 input_dir,

@@ -59,14 +61,14 @@ def package(
 output_path = util.ensure_path(output_dir)
 meta_path = util.ensure_path(meta_path)
 if not input_path or not input_path.exists():
-msg.fail("Can't locate model data", input_path, exits=1)
+msg.fail("Can't locate pipeline data", input_path, exits=1)
 if not output_path or not output_path.exists():
 msg.fail("Output directory not found", output_path, exits=1)
 if meta_path and not meta_path.exists():
-msg.fail("Can't find model meta.json", meta_path, exits=1)
+msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
 meta_path = meta_path or input_dir / "meta.json"
 if not meta_path.exists() or not meta_path.is_file():
-msg.fail("Can't load model meta.json", meta_path, exits=1)
+msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
 meta = srsly.read_json(meta_path)
 meta = get_meta(input_dir, meta)
 if version is not None:

@@ -77,7 +79,7 @@ def package(
 meta = generate_meta(meta, msg)
 errors = validate(ModelMetaSchema, meta)
 if errors:
-msg.fail("Invalid model meta.json")
+msg.fail("Invalid pipeline meta.json")
 print("\n".join(errors))
 sys.exit(1)
 model_name = meta["lang"] + "_" + meta["name"]

@@ -118,7 +120,7 @@ def get_meta(
 ) -> Dict[str, Any]:
 meta = {
 "lang": "en",
-"name": "model",
+"name": "pipeline",
 "version": "0.0.0",
 "description": "",
 "author": "",

@@ -143,10 +145,10 @@ def get_meta(
 def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
 meta = existing_meta or {}
 settings = [
-("lang", "Model language", meta.get("lang", "en")),
-("name", "Model name", meta.get("name", "model")),
-("version", "Model version", meta.get("version", "0.0.0")),
-("description", "Model description", meta.get("description", None)),
+("lang", "Pipeline language", meta.get("lang", "en")),
+("name", "Pipeline name", meta.get("name", "pipeline")),
+("version", "Package version", meta.get("version", "0.0.0")),
+("description", "Package description", meta.get("description", None)),
 ("author", "Author", meta.get("author", None)),
 ("email", "Author email", meta.get("email", None)),
 ("url", "Author website", meta.get("url", None)),

@@ -154,8 +156,8 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
 ]
 msg.divider("Generating meta.json")
 msg.text(
-"Enter the package settings for your model. The following information "
-"will be read from your model data: pipeline, vectors."
+"Enter the package settings for your pipeline. The following information "
+"will be read from your pipeline data: pipeline, vectors."
 )
 for setting, desc, default in settings:
 response = get_raw_input(desc, default)
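A representative packaging run following the docstring (all paths are placeholders; the sdist file name depends on the lang, name and version prompted for above, and setup.py sdist conventionally writes it to a dist/ subdirectory):

$ python -m spacy package ./training/model-best ./packages --create-meta --version 1.0.0
$ pip install ./packages/<lang_name-version>/dist/<lang_name-version>.tar.gz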
@@ -31,7 +31,7 @@ def pretrain_cli(
 # fmt: off
 ctx: typer.Context, # This is only used to read additional arguments
 texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
-output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
+output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
 config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
 code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),

@@ -57,6 +57,8 @@ def pretrain_cli(
 To load the weights back in during 'spacy train', you need to ensure
 all settings are the same between pretraining and training. Ideally,
 this is done by using the same config file for both commands.
+
+DOCS: https://nightly.spacy.io/api/cli#pretrain
 """
 overrides = parse_config_overrides(ctx.args)
 import_code(code_path)

@@ -376,10 +378,9 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
 if output_dir.exists() and [p for p in output_dir.iterdir()]:
 if resume_path:
 msg.warn(
-"Output directory is not empty. ",
-"If you're resuming a run from a previous model in this directory, "
-"the old models for the consecutive epochs will be overwritten "
-"with the new ones.",
+"Output directory is not empty.",
+"If you're resuming a run in this directory, the old weights "
+"for the consecutive epochs will be overwritten with the new ones.",
 )
 else:
 msg.warn(
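A minimal pretraining invocation matching the positional arguments above (all file paths are placeholders); --resume-path can point at previously written weights to continue a run:

$ python -m spacy pretrain ./raw_texts.jsonl ./pretrain_output ./config.cfg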
@@ -19,7 +19,7 @@ from ..util import load_model
 def profile_cli(
 # fmt: off
 ctx: typer.Context, # This is only used to read current calling context
-model: str = Arg(..., help="Model to load"),
+model: str = Arg(..., help="Trained pipeline to load"),
 inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
 n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
 # fmt: on

@@ -29,6 +29,8 @@ def profile_cli(
 Input should be formatted as one JSON object per line with a key "text".
 It can either be provided as a JSONL file, or be read from sys.sytdin.
 If no input file is specified, the IMDB dataset is loaded via Thinc.
+
+DOCS: https://nightly.spacy.io/api/cli#debug-profile
 """
 if ctx.parent.command.name == NAME: # called as top-level command
 msg.warn(

@@ -60,9 +62,9 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
 inputs, _ = zip(*imdb_train)
 msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
 inputs = inputs[:n_inputs]
-with msg.loading(f"Loading model '{model}'..."):
+with msg.loading(f"Loading pipeline '{model}'..."):
 nlp = load_model(model)
-msg.good(f"Loaded model '{model}'")
+msg.good(f"Loaded pipeline '{model}'")
 texts = list(itertools.islice(inputs, n_texts))
 cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
 s = pstats.Stats("Profile.prof")
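Profiling is documented under the debug namespace (the DOCS anchor above is #debug-profile); a sketch of a run, with the package name and JSONL input file as placeholders:

$ python -m spacy debug profile en_core_web_sm ./texts.jsonl --n-texts 500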
@@ -20,6 +20,8 @@ def project_assets_cli(
 defined in the "assets" section of the project.yml. If a checksum is
 provided in the project.yml, the file is only downloaded if no local file
 with the same checksum exists.
+
+DOCS: https://nightly.spacy.io/api/cli#project-assets
 """
 project_assets(project_dir)


@@ -22,6 +22,8 @@ def project_clone_cli(
 only download the files from the given subdirectory. The GitHub repo
 defaults to the official spaCy template repo, but can be customized
 (including using a private repo).
+
+DOCS: https://nightly.spacy.io/api/cli#project-clone
 """
 if dest is None:
 dest = Path.cwd() / name

@@ -43,6 +43,8 @@ def project_document_cli(
 hidden markers are added so you can add custom content before or after the
 auto-generated section and only the auto-generated docs will be replaced
 when you re-run the command.
+
+DOCS: https://nightly.spacy.io/api/cli#project-document
 """
 project_document(project_dir, output_file, no_emoji=no_emoji)


@@ -31,7 +31,10 @@ def project_update_dvc_cli(
 """Auto-generate Data Version Control (DVC) config. A DVC
 project can only define one pipeline, so you need to specify one workflow
 defined in the project.yml. If no workflow is specified, the first defined
-workflow is used. The DVC config will only be updated if the project.yml changed.
+workflow is used. The DVC config will only be updated if the project.yml
+changed.
+
+DOCS: https://nightly.spacy.io/api/cli#project-dvc
 """
 project_update_dvc(project_dir, workflow, verbose=verbose, force=force)


@@ -17,7 +17,9 @@ def project_pull_cli(
 """Retrieve available precomputed outputs from a remote storage.
 You can alias remotes in your project.yml by mapping them to storage paths.
 A storage can be anything that the smart-open library can upload to, e.g.
-gcs, aws, ssh, local directories etc
+AWS, Google Cloud Storage, SSH, local directories etc.
+
+DOCS: https://nightly.spacy.io/api/cli#project-pull
 """
 for url, output_path in project_pull(project_dir, remote):
 if url is not None:

@@ -38,5 +40,5 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
 url = storage.pull(output_path, command_hash=cmd_hash)
 yield url, output_path

-if cmd.get("outptus") and all(loc.exists() for loc in cmd["outputs"]):
+if cmd.get("outputs") and all(loc.exists() for loc in cmd["outputs"]):
 update_lockfile(project_dir, cmd)

@@ -13,9 +13,12 @@ def project_push_cli(
 project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
 # fmt: on
 ):
-"""Persist outputs to a remote storage. You can alias remotes in your project.yml
-by mapping them to storage paths. A storage can be anything that the smart-open
-library can upload to, e.g. gcs, aws, ssh, local directories etc
+"""Persist outputs to a remote storage. You can alias remotes in your
+project.yml by mapping them to storage paths. A storage can be anything that
+the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
+local directories etc.
+
+DOCS: https://nightly.spacy.io/api/cli#project-push
 """
 for output_path, url in project_push(project_dir, remote):
 if url is None:

@@ -24,6 +24,8 @@ def project_run_cli(
 name is specified, all commands in the workflow are run, in order. If
 commands define dependencies and/or outputs, they will only be re-run if
 state has changed.
+
+DOCS: https://nightly.spacy.io/api/cli#project-run
 """
 if show_help or not subcommand:
 print_run_help(project_dir, subcommand)
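The documented project commands chain together roughly as follows (the template name, project directory and workflow name are placeholders that would come from the cloned project.yml, not from this commit):

$ python -m spacy project clone example_template ./my_project
$ python -m spacy project assets ./my_project
$ python -m spacy project run all ./my_project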
@@ -29,7 +29,7 @@ name = "{{ transformer["name"] }}"
 tokenizer_config = {"use_fast": true}

 [components.transformer.model.get_spans]
-@span_getters = "strided_spans.v1"
+@span_getters = "spacy-transformers.strided_spans.v1"
 window = 128
 stride = 96


@@ -204,13 +204,13 @@ max_length = 0

 {% if use_transformer %}
 [training.batcher]
-@batchers = "batch_by_padded.v1"
+@batchers = "spacy.batch_by_padded.v1"
 discard_oversize = true
 size = 2000
 buffer = 256
 {%- else %}
 [training.batcher]
-@batchers = "batch_by_words.v1"
+@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2

@@ -26,7 +26,7 @@ def train_cli(
 # fmt: off
 ctx: typer.Context, # This is only used to read additional arguments
 config_path: Path = Arg(..., help="Path to config file", exists=True),
-output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
+output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
 code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),

@@ -34,7 +34,7 @@ def train_cli(
 # fmt: on
 ):
 """
-Train or update a spaCy model. Requires data in spaCy's binary format. To
+Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
 convert data from other formats, use the `spacy convert` command. The
 config file includes all settings and hyperparameters used during traing.
 To override settings in the config, e.g. settings that point to local

@@ -44,6 +44,8 @@ def train_cli(
 lets you pass in a Python file that's imported before training. It can be
 used to register custom functions and architectures that can then be
 referenced in the config.
+
+DOCS: https://nightly.spacy.io/api/cli#train
 """
 util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
 verify_cli_args(config_path, output_path)
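A typical training run combining the options above with config overrides (the output directory is a placeholder; the --paths overrides match the hint printed by init config):

$ python -m spacy train ./config.cfg --output ./training --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0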
@@ -113,12 +115,12 @@ def train(
 # Load morph rules
 nlp.vocab.morphology.load_morph_exceptions(morph_rules)

-# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
+# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
 if weights_data is not None:
 tok2vec_path = config["pretraining"].get("tok2vec_model", None)
 if tok2vec_path is None:
 msg.fail(
-f"To use a pretrained tok2vec model, the config needs to specify which "
+f"To pretrained tok2vec weights, the config needs to specify which "
 f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
 exits=1,
 )

@@ -159,6 +161,7 @@ def train(
 print_row(info)
 if is_best_checkpoint and output_path is not None:
 update_meta(T_cfg, nlp, info)
+with nlp.use_params(optimizer.averages):
 nlp.to_disk(output_path / "model-best")
 progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
 progress.set_description(f"Epoch {info['epoch']}")

@@ -182,7 +185,7 @@ def train(
 nlp.to_disk(final_model_path)
 else:
 nlp.to_disk(final_model_path)
-msg.good(f"Saved model to output directory {final_model_path}")
+msg.good(f"Saved pipeline to output directory {final_model_path}")


 def create_train_batches(iterator, batcher, max_epochs: int):
@ -13,9 +13,11 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||||
@app.command("validate")
|
@app.command("validate")
|
||||||
def validate_cli():
|
def validate_cli():
|
||||||
"""
|
"""
|
||||||
Validate the currently installed models and spaCy version. Checks if the
|
Validate the currently installed pipeline packages and spaCy version. Checks
|
||||||
installed models are compatible and shows upgrade instructions if available.
|
if the installed packages are compatible and shows upgrade instructions if
|
||||||
Should be run after `pip install -U spacy`.
|
available. Should be run after `pip install -U spacy`.
|
||||||
|
|
||||||
|
DOCS: https://nightly.spacy.io/api/cli#validate
|
||||||
"""
|
"""
|
||||||
validate()
|
validate()
|
||||||
|
|
||||||
|
@ -25,13 +27,13 @@ def validate() -> None:
|
||||||
spacy_version = get_base_version(about.__version__)
|
spacy_version = get_base_version(about.__version__)
|
||||||
current_compat = compat.get(spacy_version, {})
|
current_compat = compat.get(spacy_version, {})
|
||||||
if not current_compat:
|
if not current_compat:
|
||||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
|
||||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
spacy_dir = Path(__file__).parent.parent
|
spacy_dir = Path(__file__).parent.parent
|
||||||
|
|
||||||
msg.divider(f"Installed models (spaCy v{about.__version__})")
|
msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
|
||||||
msg.info(f"spaCy installation: {spacy_dir}")
|
msg.info(f"spaCy installation: {spacy_dir}")
|
||||||
|
|
||||||
if model_pkgs:
|
if model_pkgs:
|
||||||
|
@ -47,15 +49,15 @@ def validate() -> None:
|
||||||
rows.append((data["name"], data["spacy"], version, comp))
|
rows.append((data["name"], data["spacy"], version, comp))
|
||||||
msg.table(rows, header=header)
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
msg.text("No models found in your current environment.", exits=0)
|
msg.text("No pipeline packages found in your current environment.", exits=0)
|
||||||
if update_models:
|
if update_models:
|
||||||
msg.divider("Install updates")
|
msg.divider("Install updates")
|
||||||
msg.text("Use the following commands to update the model packages:")
|
msg.text("Use the following commands to update the packages:")
|
||||||
cmd = "python -m spacy download {}"
|
cmd = "python -m spacy download {}"
|
||||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
msg.info(
|
msg.info(
|
||||||
f"The following models are custom spaCy models or not "
|
f"The following packages are custom spaCy pipelines or not "
|
||||||
f"available for spaCy v{about.__version__}:",
|
f"available for spaCy v{about.__version__}:",
|
||||||
", ".join(na_models),
|
", ".join(na_models),
|
||||||
)
|
)
|
||||||
|
|
|
@ -69,7 +69,7 @@ max_length = 2000
|
||||||
limit = 0
|
limit = 0
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
"""
|
"""
|
||||||
spaCy's built-in visualization suite for dependencies and named entities.
|
spaCy's built-in visualization suite for dependencies and named entities.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy
|
DOCS: https://nightly.spacy.io/api/top-level#displacy
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
||||||
import warnings
|
import warnings
|
||||||
|
@ -37,8 +37,8 @@ def render(
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (str): Rendered HTML markup.
|
RETURNS (str): Rendered HTML markup.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
DOCS: https://nightly.spacy.io/api/top-level#displacy.render
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
factories = {
|
factories = {
|
||||||
"dep": (DependencyRenderer, parse_deps),
|
"dep": (DependencyRenderer, parse_deps),
|
||||||
|
@ -88,8 +88,8 @@ def serve(
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
host (str): Host to serve visualisation.
|
host (str): Host to serve visualisation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
DOCS: https://nightly.spacy.io/api/top-level#displacy.serve
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
|
|
|
@ -249,6 +249,12 @@ class EntityRenderer:
|
||||||
colors = dict(DEFAULT_LABEL_COLORS)
|
colors = dict(DEFAULT_LABEL_COLORS)
|
||||||
user_colors = registry.displacy_colors.get_all()
|
user_colors = registry.displacy_colors.get_all()
|
||||||
for user_color in user_colors.values():
|
for user_color in user_colors.values():
|
||||||
|
if callable(user_color):
|
||||||
|
# Since this comes from the function registry, we want to make
|
||||||
|
# sure we support functions that *return* a dict of colors
|
||||||
|
user_color = user_color()
|
||||||
|
if not isinstance(user_color, dict):
|
||||||
|
raise ValueError(Errors.E925.format(obj=type(user_color)))
|
||||||
colors.update(user_color)
|
colors.update(user_color)
|
||||||
colors.update(options.get("colors", {}))
|
colors.update(options.get("colors", {}))
|
||||||
self.default_color = DEFAULT_ENTITY_COLOR
|
self.default_color = DEFAULT_ENTITY_COLOR
|
||||||
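The hunk above lets values in the `displacy_colors` registry be either dicts or callables that return a dict (anything else raises error E925). A sketch of the callable form, registered directly in code; the registry name "my_colors" is an illustrative assumption, and packages would more typically provide this via entry points:

from spacy.util import registry


@registry.displacy_colors("my_colors")
def make_label_colors():
    # Called by EntityRenderer above; must return a label -> color mapping.
    return {"FRUIT": "#ffcc00", "VEHICLE": "#7aecec"}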
|
|
|
@ -22,7 +22,7 @@ class Warnings:
|
||||||
"generate a dependency visualization for it. Make sure the Doc "
|
"generate a dependency visualization for it. Make sure the Doc "
|
||||||
"was processed with a model that supports dependency parsing, and "
|
"was processed with a model that supports dependency parsing, and "
|
||||||
"not just a language class like `English()`. For more info, see "
|
"not just a language class like `English()`. For more info, see "
|
||||||
"the docs:\nhttps://spacy.io/usage/models")
|
"the docs:\nhttps://nightly.spacy.io/usage/models")
|
||||||
W006 = ("No entities to visualize found in Doc object. If this is "
|
W006 = ("No entities to visualize found in Doc object. If this is "
|
||||||
"surprising to you, make sure the Doc was processed using a model "
|
"surprising to you, make sure the Doc was processed using a model "
|
||||||
"that supports named entity recognition, and check the `doc.ents` "
|
"that supports named entity recognition, and check the `doc.ents` "
|
||||||
|
@ -147,7 +147,7 @@ class Errors:
|
||||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||||
"a model installed or loaded, or because your model doesn't "
|
"a model installed or loaded, or because your model doesn't "
|
||||||
"include word vectors. For more info, see the docs:\n"
|
"include word vectors. For more info, see the docs:\n"
|
||||||
"https://spacy.io/usage/models")
|
"https://nightly.spacy.io/usage/models")
|
||||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||||
E014 = ("Unknown tag ID: {tag}")
|
E014 = ("Unknown tag ID: {tag}")
|
||||||
|
@ -181,7 +181,7 @@ class Errors:
|
||||||
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
||||||
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
||||||
"statistical model to be installed and loaded. For more info, see "
|
"statistical model to be installed and loaded. For more info, see "
|
||||||
"the documentation:\nhttps://spacy.io/usage/models")
|
"the documentation:\nhttps://nightly.spacy.io/usage/models")
|
||||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||||
"component to the pipeline with: "
|
"component to the pipeline with: "
|
||||||
"nlp.add_pipe('sentencizer'). "
|
"nlp.add_pipe('sentencizer'). "
|
||||||
|
@ -294,7 +294,7 @@ class Errors:
|
||||||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
||||||
"tokens to merge. If you want to find the longest non-overlapping "
|
"tokens to merge. If you want to find the longest non-overlapping "
|
||||||
"spans, you can use the util.filter_spans helper:\n"
|
"spans, you can use the util.filter_spans helper:\n"
|
||||||
"https://spacy.io/api/top-level#util.filter_spans")
|
"https://nightly.spacy.io/api/top-level#util.filter_spans")
|
||||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||||
"token can only be part of one entity, so make sure the entities "
|
"token can only be part of one entity, so make sure the entities "
|
||||||
"you're setting don't overlap.")
|
"you're setting don't overlap.")
|
||||||
|
@ -364,10 +364,10 @@ class Errors:
|
||||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||||
"to provide a valid JSON object as input with either the `text` "
|
"to provide a valid JSON object as input with either the `text` "
|
||||||
"or `tokens` key. For more info, see the docs:\n"
|
"or `tokens` key. For more info, see the docs:\n"
|
||||||
"https://spacy.io/api/cli#pretrain-jsonl")
|
"https://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||||
"includes either the `text` or `tokens` key. For more info, see "
|
"includes either the `text` or `tokens` key. For more info, see "
|
||||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
"the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||||
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
||||||
"kb.add_entity and kb.add_alias to add entries.")
|
"kb.add_entity and kb.add_alias to add entries.")
|
||||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||||
|
@ -476,6 +476,8 @@ class Errors:
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
|
||||||
|
"mapping label names to colors but got: {obj}")
|
||||||
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
||||||
"doesn't work because it's an immutable computed property. If you "
|
"doesn't work because it's an immutable computed property. If you "
|
||||||
"need to modify the pipeline, use the built-in methods like "
|
"need to modify the pipeline, use the built-in methods like "
|
||||||
|
|
|
@ -11,7 +11,7 @@ ItemT = TypeVar("ItemT")
|
||||||
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_padded.v1")
|
@registry.batchers("spacy.batch_by_padded.v1")
|
||||||
def configure_minibatch_by_padded_size(
|
def configure_minibatch_by_padded_size(
|
||||||
*,
|
*,
|
||||||
size: Sizing,
|
size: Sizing,
|
||||||
|
@ -46,7 +46,7 @@ def configure_minibatch_by_padded_size(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_words.v1")
|
@registry.batchers("spacy.batch_by_words.v1")
|
||||||
def configure_minibatch_by_words(
|
def configure_minibatch_by_words(
|
||||||
*,
|
*,
|
||||||
size: Sizing,
|
size: Sizing,
|
||||||
|
@ -70,7 +70,7 @@ def configure_minibatch_by_words(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.batchers("batch_by_sequence.v1")
|
@registry.batchers("spacy.batch_by_sequence.v1")
|
||||||
def configure_minibatch(
|
def configure_minibatch(
|
||||||
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
|
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
|
||||||
) -> BatcherT:
|
) -> BatcherT:
|
||||||
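The registrations above move the built-in batchers into the `spacy.` namespace (e.g. `spacy.batch_by_words.v1`), which is why the config snippets in this commit now read `@batchers = "spacy.batch_by_words.v1"`. A custom batcher can be registered the same way and referenced from `[training.batcher]`; a minimal sketch, where the name "my_fixed_batcher.v1" is an illustrative assumption:

from typing import Iterable, List
from spacy.util import registry, minibatch


@registry.batchers("my_fixed_batcher.v1")
def configure_fixed_batcher(size: int):
    # Must match BatcherT above: take an iterable of items, yield lists of items.
    def batcher(items: Iterable) -> Iterable[List]:
        yield from minibatch(items, size=size)
    return batcher

In the config this would then be referenced as `@batchers = "my_fixed_batcher.v1"` with a `size` setting under `[training.batcher]`.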
|
|
|
@ -106,7 +106,7 @@ def conll_ner2docs(
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The token-per-line NER file is not formatted correctly. "
|
"The token-per-line NER file is not formatted correctly. "
|
||||||
"Try checking whitespace and delimiters. See "
|
"Try checking whitespace and delimiters. See "
|
||||||
"https://spacy.io/api/cli#convert"
|
"https://nightly.spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
length = len(cols[0])
|
length = len(cols[0])
|
||||||
words.extend(cols[0])
|
words.extend(cols[0])
|
||||||
|
|
|
@ -44,7 +44,7 @@ def read_iob(raw_sents, vocab, n_sents):
|
||||||
sent_tags = ["-"] * len(sent_words)
|
sent_tags = ["-"] * len(sent_words)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
words.extend(sent_words)
|
words.extend(sent_words)
|
||||||
tags.extend(sent_tags)
|
tags.extend(sent_tags)
|
||||||
|
|
|
@ -38,7 +38,7 @@ class Corpus:
|
||||||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||||
Defaults to 0, which indicates no limit.
|
Defaults to 0, which indicates no limit.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/corpus
|
DOCS: https://nightly.spacy.io/api/corpus
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -83,7 +83,7 @@ class Corpus:
|
||||||
nlp (Language): The current nlp object.
|
nlp (Language): The current nlp object.
|
||||||
YIELDS (Example): The examples.
|
YIELDS (Example): The examples.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/corpus#call
|
DOCS: https://nightly.spacy.io/api/corpus#call
|
||||||
"""
|
"""
|
||||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
||||||
if self.gold_preproc:
|
if self.gold_preproc:
|
||||||
|
|
|
@ -21,7 +21,7 @@ cdef class Candidate:
|
||||||
algorithm which will disambiguate the various candidates to the correct one.
|
algorithm which will disambiguate the various candidates to the correct one.
|
||||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
DOCS: https://nightly.spacy.io/api/kb/#candidate_init
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||||
|
@ -79,7 +79,7 @@ cdef class KnowledgeBase:
|
||||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||||
to support entity linking of named entities to real-world concepts.
|
to support entity linking of named entities to real-world concepts.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb
|
DOCS: https://nightly.spacy.io/api/kb
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, entity_vector_length):
|
def __init__(self, Vocab vocab, entity_vector_length):
|
||||||
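The `Candidate` and `KnowledgeBase` docstrings above describe a store of entities, aliases and prior probabilities. A small usage sketch, assuming the `kb.add_entity` / `kb.add_alias` methods named in error E139 and v3-style keyword arguments; the entity ID, frequency and vector values are illustrative:

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab, entity_vector_length=1)
# One entity with a 1-dimensional vector, and one alias pointing to it.
kb.add_entity(entity="Q42", freq=50, entity_vector=[1.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])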
|
|
|
@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿"
|
_currency = r"\$¢£€¥฿"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
_units = UNITS.replace("%", "")
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
@ -26,7 +27,7 @@ _suffixes = (
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||||
),
|
),
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
|
from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
|
||||||
from typing import Tuple, Iterator
|
from typing import Tuple, Iterator, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
|
@ -95,7 +95,7 @@ class Language:
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): Two-letter language ID, i.e. ISO code.
|
lang (str): Two-letter language ID, i.e. ISO code.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://nightly.spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Defaults = BaseDefaults
|
Defaults = BaseDefaults
|
||||||
|
@ -130,7 +130,7 @@ class Language:
|
||||||
create_tokenizer (Callable): Function that takes the nlp object and
|
create_tokenizer (Callable): Function that takes the nlp object and
|
||||||
returns a tokenizer.
|
returns a tokenizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#init
|
DOCS: https://nightly.spacy.io/api/language#init
|
||||||
"""
|
"""
|
||||||
# We're only calling this to import all factories provided via entry
|
# We're only calling this to import all factories provided via entry
|
||||||
# points. The factory decorator applied to these functions takes care
|
# points. The factory decorator applied to these functions takes care
|
||||||
|
@ -185,14 +185,14 @@ class Language:
|
||||||
|
|
||||||
RETURNS (Dict[str, Any]): The meta.
|
RETURNS (Dict[str, Any]): The meta.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#meta
|
DOCS: https://nightly.spacy.io/api/language#meta
|
||||||
"""
|
"""
|
||||||
spacy_version = util.get_model_version_range(about.__version__)
|
spacy_version = util.get_model_version_range(about.__version__)
|
||||||
if self.vocab.lang:
|
if self.vocab.lang:
|
||||||
self._meta.setdefault("lang", self.vocab.lang)
|
self._meta.setdefault("lang", self.vocab.lang)
|
||||||
else:
|
else:
|
||||||
self._meta.setdefault("lang", self.lang)
|
self._meta.setdefault("lang", self.lang)
|
||||||
self._meta.setdefault("name", "model")
|
self._meta.setdefault("name", "pipeline")
|
||||||
self._meta.setdefault("version", "0.0.0")
|
self._meta.setdefault("version", "0.0.0")
|
||||||
self._meta.setdefault("spacy_version", spacy_version)
|
self._meta.setdefault("spacy_version", spacy_version)
|
||||||
self._meta.setdefault("description", "")
|
self._meta.setdefault("description", "")
|
||||||
|
@ -225,7 +225,7 @@ class Language:
|
||||||
|
|
||||||
RETURNS (thinc.api.Config): The config.
|
RETURNS (thinc.api.Config): The config.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#config
|
DOCS: https://nightly.spacy.io/api/language#config
|
||||||
"""
|
"""
|
||||||
self._config.setdefault("nlp", {})
|
self._config.setdefault("nlp", {})
|
||||||
self._config.setdefault("training", {})
|
self._config.setdefault("training", {})
|
||||||
|
@ -433,7 +433,7 @@ class Language:
|
||||||
will be combined and normalized for the whole pipeline.
|
will be combined and normalized for the whole pipeline.
|
||||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#factory
|
DOCS: https://nightly.spacy.io/api/language#factory
|
||||||
"""
|
"""
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="factory"))
|
raise ValueError(Errors.E963.format(decorator="factory"))
|
||||||
|
@ -513,7 +513,7 @@ class Language:
|
||||||
Used for pipeline analysis.
|
Used for pipeline analysis.
|
||||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#component
|
DOCS: https://nightly.spacy.io/api/language#component
|
||||||
"""
|
"""
|
||||||
if name is not None and not isinstance(name, str):
|
if name is not None and not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="component"))
|
raise ValueError(Errors.E963.format(decorator="component"))
|
||||||
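The `component` decorator documented above registers a stateless pipeline function under a string name, which can then be added to any pipeline by that name. A minimal sketch (component name and body are illustrative):

import spacy
from spacy.language import Language
from spacy.tokens import Doc


@Language.component("debug_length")
def debug_length(doc: Doc) -> Doc:
    # A stateless component: receive the Doc, do something, return the Doc.
    print("tokens:", len(doc))
    return doc


nlp = spacy.blank("en")
nlp.add_pipe("debug_length")
nlp("One two three.")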
|
@ -579,7 +579,7 @@ class Language:
|
||||||
name (str): Name of pipeline component to get.
|
name (str): Name of pipeline component to get.
|
||||||
RETURNS (callable): The pipeline component.
|
RETURNS (callable): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#get_pipe
|
DOCS: https://nightly.spacy.io/api/language#get_pipe
|
||||||
"""
|
"""
|
||||||
for pipe_name, component in self._components:
|
for pipe_name, component in self._components:
|
||||||
if pipe_name == name:
|
if pipe_name == name:
|
||||||
|
@ -608,7 +608,7 @@ class Language:
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#create_pipe
|
DOCS: https://nightly.spacy.io/api/language#create_pipe
|
||||||
"""
|
"""
|
||||||
name = name if name is not None else factory_name
|
name = name if name is not None else factory_name
|
||||||
if not isinstance(config, dict):
|
if not isinstance(config, dict):
|
||||||
|
@ -722,7 +722,7 @@ class Language:
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#add_pipe
|
DOCS: https://nightly.spacy.io/api/language#add_pipe
|
||||||
"""
|
"""
|
||||||
if not isinstance(factory_name, str):
|
if not isinstance(factory_name, str):
|
||||||
bad_val = repr(factory_name)
|
bad_val = repr(factory_name)
|
||||||
|
@ -820,7 +820,7 @@ class Language:
|
||||||
name (str): Name of the component.
|
name (str): Name of the component.
|
||||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#has_pipe
|
DOCS: https://nightly.spacy.io/api/language#has_pipe
|
||||||
"""
|
"""
|
||||||
return name in self.pipe_names
|
return name in self.pipe_names
|
||||||
|
|
||||||
|
@ -841,7 +841,7 @@ class Language:
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#replace_pipe
|
DOCS: https://nightly.spacy.io/api/language#replace_pipe
|
||||||
"""
|
"""
|
||||||
if name not in self.pipe_names:
|
if name not in self.pipe_names:
|
||||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||||
|
@ -870,7 +870,7 @@ class Language:
|
||||||
old_name (str): Name of the component to rename.
|
old_name (str): Name of the component to rename.
|
||||||
new_name (str): New name of the component.
|
new_name (str): New name of the component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#rename_pipe
|
DOCS: https://nightly.spacy.io/api/language#rename_pipe
|
||||||
"""
|
"""
|
||||||
if old_name not in self.component_names:
|
if old_name not in self.component_names:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -891,7 +891,7 @@ class Language:
|
||||||
name (str): Name of the component to remove.
|
name (str): Name of the component to remove.
|
||||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#remove_pipe
|
DOCS: https://nightly.spacy.io/api/language#remove_pipe
|
||||||
"""
|
"""
|
||||||
if name not in self.component_names:
|
if name not in self.component_names:
|
||||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||||
|
@ -944,7 +944,7 @@ class Language:
|
||||||
keyword arguments for specific components.
|
keyword arguments for specific components.
|
||||||
RETURNS (Doc): A container for accessing the annotations.
|
RETURNS (Doc): A container for accessing the annotations.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#call
|
DOCS: https://nightly.spacy.io/api/language#call
|
||||||
"""
|
"""
|
||||||
if len(text) > self.max_length:
|
if len(text) > self.max_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -993,7 +993,7 @@ class Language:
|
||||||
disable (str or iterable): The name(s) of the pipes to disable
|
disable (str or iterable): The name(s) of the pipes to disable
|
||||||
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#select_pipes
|
DOCS: https://nightly.spacy.io/api/language#select_pipes
|
||||||
"""
|
"""
|
||||||
if enable is None and disable is None:
|
if enable is None and disable is None:
|
||||||
raise ValueError(Errors.E991)
|
raise ValueError(Errors.E991)
|
||||||
|
@ -1044,7 +1044,7 @@ class Language:
|
||||||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#update
|
DOCS: https://nightly.spacy.io/api/language#update
|
||||||
"""
|
"""
|
||||||
if _ is not None:
|
if _ is not None:
|
||||||
raise ValueError(Errors.E989)
|
raise ValueError(Errors.E989)
|
||||||
|
@ -1106,7 +1106,7 @@ class Language:
|
||||||
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
||||||
>>> nlp.rehearse(raw_batch)
|
>>> nlp.rehearse(raw_batch)
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#rehearse
|
DOCS: https://nightly.spacy.io/api/language#rehearse
|
||||||
"""
|
"""
|
||||||
if len(examples) == 0:
|
if len(examples) == 0:
|
||||||
return
|
return
|
||||||
|
@ -1153,7 +1153,7 @@ class Language:
|
||||||
create_optimizer if it doesn't exist.
|
create_optimizer if it doesn't exist.
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#begin_training
|
DOCS: https://nightly.spacy.io/api/language#begin_training
|
||||||
"""
|
"""
|
||||||
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
||||||
if get_examples is None:
|
if get_examples is None:
|
||||||
|
@ -1200,7 +1200,7 @@ class Language:
|
||||||
sgd (Optional[Optimizer]): An optimizer.
|
sgd (Optional[Optimizer]): An optimizer.
|
||||||
RETURNS (Optimizer): The optimizer.
|
RETURNS (Optimizer): The optimizer.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#resume_training
|
DOCS: https://nightly.spacy.io/api/language#resume_training
|
||||||
"""
|
"""
|
||||||
if device >= 0: # TODO: do we need this here?
|
if device >= 0: # TODO: do we need this here?
|
||||||
require_gpu(device)
|
require_gpu(device)
|
||||||
|
@ -1236,7 +1236,7 @@ class Language:
|
||||||
for the scorer.
|
for the scorer.
|
||||||
RETURNS (Scorer): The scorer containing the evaluation results.
|
RETURNS (Scorer): The scorer containing the evaluation results.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#evaluate
|
DOCS: https://nightly.spacy.io/api/language#evaluate
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Language.evaluate")
|
validate_examples(examples, "Language.evaluate")
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
|
@ -1275,7 +1275,7 @@ class Language:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_params(self, params: dict):
|
def use_params(self, params: Optional[dict]):
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
"""Replace weights of models in the pipeline with those provided in the
|
||||||
params dictionary. Can be used as a contextmanager, in which case,
|
params dictionary. Can be used as a contextmanager, in which case,
|
||||||
models go back to their original weights after the block.
|
models go back to their original weights after the block.
|
||||||
|
@ -1286,8 +1286,11 @@ class Language:
|
||||||
>>> with nlp.use_params(optimizer.averages):
|
>>> with nlp.use_params(optimizer.averages):
|
||||||
>>> nlp.to_disk("/tmp/checkpoint")
|
>>> nlp.to_disk("/tmp/checkpoint")
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#use_params
|
DOCS: https://nightly.spacy.io/api/language#use_params
|
||||||
"""
|
"""
|
||||||
|
if not params:
|
||||||
|
yield
|
||||||
|
else:
|
||||||
contexts = [
|
contexts = [
|
||||||
pipe.use_params(params)
|
pipe.use_params(params)
|
||||||
for name, pipe in self.pipeline
|
for name, pipe in self.pipeline
|
||||||
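With the change above, `use_params` accepts `None` (or an empty dict) and simply yields without swapping any weights, so callers such as the training loop's `nlp.use_params(optimizer.averages)` no longer need to special-case a missing value. A tiny sketch of the no-op form:

import spacy

nlp = spacy.blank("en")
# No parameter averages available yet: use_params(None) now just yields.
with nlp.use_params(None):
    doc = nlp("Runs with the current model weights.")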
|
@ -1330,7 +1333,7 @@ class Language:
|
||||||
n_process (int): Number of processes to use when processing texts. If -1, `multiprocessing.cpu_count()` is used.
|
n_process (int): Number of processes to use when processing texts. If -1, `multiprocessing.cpu_count()` is used.
|
||||||
YIELDS (Doc): Documents in the order of the original text.
|
YIELDS (Doc): Documents in the order of the original text.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#pipe
|
DOCS: https://nightly.spacy.io/api/language#pipe
|
||||||
"""
|
"""
|
||||||
if n_process == -1:
|
if n_process == -1:
|
||||||
n_process = mp.cpu_count()
|
n_process = mp.cpu_count()
|
||||||
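As the `pipe` docstring above notes, `n_process=-1` resolves to `multiprocessing.cpu_count()`. A short usage sketch with illustrative texts:

import spacy

nlp = spacy.blank("en")
texts = ["First text.", "Second text.", "Third text."]
# Docs are yielded in input order; -1 uses all available CPU cores.
for doc in nlp.pipe(texts, n_process=-1):
    print(len(doc))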
|
@ -1466,7 +1469,7 @@ class Language:
|
||||||
the types expected by the factory.
|
the types expected by the factory.
|
||||||
RETURNS (Language): The initialized Language class.
|
RETURNS (Language): The initialized Language class.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_config
|
DOCS: https://nightly.spacy.io/api/language#from_config
|
||||||
"""
|
"""
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
config = Config(
|
config = Config(
|
||||||
|
@ -1579,7 +1582,7 @@ class Language:
|
||||||
it doesn't exist.
|
it doesn't exist.
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_disk
|
DOCS: https://nightly.spacy.io/api/language#to_disk
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = {}
|
serializers = {}
|
||||||
|
@ -1608,7 +1611,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (Language): The modified `Language` object.
|
RETURNS (Language): The modified `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_disk
|
DOCS: https://nightly.spacy.io/api/language#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def deserialize_meta(path: Path) -> None:
|
def deserialize_meta(path: Path) -> None:
|
||||||
|
@ -1656,7 +1659,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized form of the `Language` object.
|
RETURNS (bytes): The serialized form of the `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_bytes
|
DOCS: https://nightly.spacy.io/api/language#to_bytes
|
||||||
"""
|
"""
|
||||||
serializers = {}
|
serializers = {}
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
|
@ -1680,7 +1683,7 @@ class Language:
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
RETURNS (Language): The `Language` object.
|
RETURNS (Language): The `Language` object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#from_bytes
|
DOCS: https://nightly.spacy.io/api/language#from_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def deserialize_meta(b):
|
def deserialize_meta(b):
|
||||||
|
|
|
@ -30,7 +30,7 @@ cdef class Lexeme:
|
||||||
tag, dependency parse, or lemma (lemmatization depends on the
|
tag, dependency parse, or lemma (lemmatization depends on the
|
||||||
part-of-speech tag).
|
part-of-speech tag).
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lexeme
|
DOCS: https://nightly.spacy.io/api/lexeme
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, attr_t orth):
|
def __init__(self, Vocab vocab, attr_t orth):
|
||||||
"""Create a Lexeme object.
|
"""Create a Lexeme object.
|
||||||
|
|
|
@ -57,7 +57,7 @@ class Table(OrderedDict):
|
||||||
data (dict): The dictionary.
|
data (dict): The dictionary.
|
||||||
name (str): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.from_dict
|
DOCS: https://nightly.spacy.io/api/lookups#table.from_dict
|
||||||
"""
|
"""
|
||||||
self = cls(name=name)
|
self = cls(name=name)
|
||||||
self.update(data)
|
self.update(data)
|
||||||
|
@ -69,7 +69,7 @@ class Table(OrderedDict):
|
||||||
name (str): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
data (dict): Initial data, used to hint Bloom Filter.
|
data (dict): Initial data, used to hint Bloom Filter.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.init
|
DOCS: https://nightly.spacy.io/api/lookups#table.init
|
||||||
"""
|
"""
|
||||||
OrderedDict.__init__(self)
|
OrderedDict.__init__(self)
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -135,7 +135,7 @@ class Table(OrderedDict):
|
||||||
|
|
||||||
RETURNS (bytes): The serialized table.
|
RETURNS (bytes): The serialized table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.to_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#table.to_bytes
|
||||||
"""
|
"""
|
||||||
data = {
|
data = {
|
||||||
"name": self.name,
|
"name": self.name,
|
||||||
|
@ -150,7 +150,7 @@ class Table(OrderedDict):
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
RETURNS (Table): The loaded table.
|
RETURNS (Table): The loaded table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.from_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#table.from_bytes
|
||||||
"""
|
"""
|
||||||
loaded = srsly.msgpack_loads(bytes_data)
|
loaded = srsly.msgpack_loads(bytes_data)
|
||||||
data = loaded.get("dict", {})
|
data = loaded.get("dict", {})
|
||||||
|
@ -172,7 +172,7 @@ class Lookups:
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize the Lookups object.
|
"""Initialize the Lookups object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#init
|
DOCS: https://nightly.spacy.io/api/lookups#init
|
||||||
"""
|
"""
|
||||||
self._tables = {}
|
self._tables = {}
|
||||||
|
|
||||||
|
@ -201,7 +201,7 @@ class Lookups:
|
||||||
data (dict): Optional data to add to the table.
|
data (dict): Optional data to add to the table.
|
||||||
RETURNS (Table): The newly added table.
|
RETURNS (Table): The newly added table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#add_table
|
DOCS: https://nightly.spacy.io/api/lookups#add_table
|
||||||
"""
|
"""
|
||||||
if name in self.tables:
|
if name in self.tables:
|
||||||
raise ValueError(Errors.E158.format(name=name))
|
raise ValueError(Errors.E158.format(name=name))
|
||||||
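The `Lookups.add_table` docstring above takes a name plus optional data and returns the new `Table`; combined with `to_bytes`/`from_bytes` this gives a simple round trip. A minimal sketch with an illustrative table name and entries:

from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("lemma_exceptions", {"geese": "goose", "mice": "mouse"})
print(table.get("geese"))                      # "goose"

data = lookups.to_bytes()                      # serialize all tables
restored = Lookups().from_bytes(data)
print(restored.has_table("lemma_exceptions"))  # True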
|
@ -215,7 +215,7 @@ class Lookups:
|
||||||
name (str): Name of the table to set.
|
name (str): Name of the table to set.
|
||||||
table (Table): The Table to set.
|
table (Table): The Table to set.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#set_table
|
DOCS: https://nightly.spacy.io/api/lookups#set_table
|
||||||
"""
|
"""
|
||||||
self._tables[name] = table
|
self._tables[name] = table
|
||||||
|
|
||||||
|
@ -227,7 +227,7 @@ class Lookups:
|
||||||
default (Any): Optional default value to return if table doesn't exist.
|
default (Any): Optional default value to return if table doesn't exist.
|
||||||
RETURNS (Table): The table.
|
RETURNS (Table): The table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#get_table
|
DOCS: https://nightly.spacy.io/api/lookups#get_table
|
||||||
"""
|
"""
|
||||||
if name not in self._tables:
|
if name not in self._tables:
|
||||||
if default == UNSET:
|
if default == UNSET:
|
||||||
|
@ -241,7 +241,7 @@ class Lookups:
|
||||||
name (str): Name of the table to remove.
|
name (str): Name of the table to remove.
|
||||||
RETURNS (Table): The removed table.
|
RETURNS (Table): The removed table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#remove_table
|
DOCS: https://nightly.spacy.io/api/lookups#remove_table
|
||||||
"""
|
"""
|
||||||
if name not in self._tables:
|
if name not in self._tables:
|
||||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||||
|
@ -253,7 +253,7 @@ class Lookups:
|
||||||
name (str): Name of the table.
|
name (str): Name of the table.
|
||||||
RETURNS (bool): Whether a table of that name exists.
|
RETURNS (bool): Whether a table of that name exists.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#has_table
|
DOCS: https://nightly.spacy.io/api/lookups#has_table
|
||||||
"""
|
"""
|
||||||
return name in self._tables
|
return name in self._tables
|
||||||
|
|
||||||
|
@ -262,7 +262,7 @@ class Lookups:
|
||||||
|
|
||||||
RETURNS (bytes): The serialized Lookups.
|
RETURNS (bytes): The serialized Lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#to_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#to_bytes
|
||||||
"""
|
"""
|
||||||
return srsly.msgpack_dumps(self._tables)
|
return srsly.msgpack_dumps(self._tables)
|
||||||
|
|
||||||
|
@ -272,7 +272,7 @@ class Lookups:
|
||||||
bytes_data (bytes): The data to load.
|
bytes_data (bytes): The data to load.
|
||||||
RETURNS (Lookups): The loaded Lookups.
|
RETURNS (Lookups): The loaded Lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
DOCS: https://nightly.spacy.io/api/lookups#from_bytes
|
||||||
"""
|
"""
|
||||||
self._tables = {}
|
self._tables = {}
|
||||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||||
|
@ -287,7 +287,7 @@ class Lookups:
|
||||||
|
|
||||||
path (str / Path): The file path.
|
path (str / Path): The file path.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#to_disk
|
DOCS: https://nightly.spacy.io/api/lookups#to_disk
|
||||||
"""
|
"""
|
||||||
if len(self._tables):
|
if len(self._tables):
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
|
@ -306,7 +306,7 @@ class Lookups:
|
||||||
path (str / Path): The directory path.
|
path (str / Path): The directory path.
|
||||||
RETURNS (Lookups): The loaded lookups.
|
RETURNS (Lookups): The loaded lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#from_disk
|
DOCS: https://nightly.spacy.io/api/lookups#from_disk
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
filepath = path / filename
|
filepath = path / filename
|
||||||
|
|
|
@ -31,8 +31,8 @@ DEF PADDING = 5
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
"""Match sequences of tokens, based on pattern rules.
|
"""Match sequences of tokens, based on pattern rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/matcher
|
DOCS: https://nightly.spacy.io/api/matcher
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching
|
USAGE: https://nightly.spacy.io/usage/rule-based-matching
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, validate=True):
|
def __init__(self, vocab, validate=True):
|
||||||
|
|
|
@ -19,8 +19,8 @@ cdef class PhraseMatcher:
|
||||||
sequences based on lists of token descriptions, the `PhraseMatcher` accepts
|
sequences based on lists of token descriptions, the `PhraseMatcher` accepts
|
||||||
match patterns in the form of `Doc` objects.
|
match patterns in the form of `Doc` objects.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher
|
DOCS: https://nightly.spacy.io/api/phrasematcher
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher
|
USAGE: https://nightly.spacy.io/usage/rule-based-matching#phrasematcher
|
||||||
|
|
||||||
Adapted from FlashText: https://github.com/vi3k6i5/flashtext
|
Adapted from FlashText: https://github.com/vi3k6i5/flashtext
|
||||||
MIT License (see `LICENSE`)
|
MIT License (see `LICENSE`)
|
||||||
|
@ -34,7 +34,7 @@ cdef class PhraseMatcher:
|
||||||
attr (int / str): Token attribute to match on.
|
attr (int / str): Token attribute to match on.
|
||||||
validate (bool): Perform additional validation when patterns are added.
|
validate (bool): Perform additional validation when patterns are added.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#init
|
DOCS: https://nightly.spacy.io/api/phrasematcher#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
|
@ -61,7 +61,7 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
RETURNS (int): The number of rules.
|
RETURNS (int): The number of rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#len
|
DOCS: https://nightly.spacy.io/api/phrasematcher#len
|
||||||
"""
|
"""
|
||||||
return len(self._callbacks)
|
return len(self._callbacks)
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ cdef class PhraseMatcher:
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#contains
|
DOCS: https://nightly.spacy.io/api/phrasematcher#contains
|
||||||
"""
|
"""
|
||||||
return key in self._callbacks
|
return key in self._callbacks
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#remove
|
DOCS: https://nightly.spacy.io/api/phrasematcher#remove
|
||||||
"""
|
"""
|
||||||
if key not in self._docs:
|
if key not in self._docs:
|
||||||
raise KeyError(key)
|
raise KeyError(key)
|
||||||
|
@ -164,7 +164,7 @@ cdef class PhraseMatcher:
|
||||||
as variable arguments. Will be ignored if a list of patterns is
|
as variable arguments. Will be ignored if a list of patterns is
|
||||||
provided as the second argument.
|
provided as the second argument.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#add
|
DOCS: https://nightly.spacy.io/api/phrasematcher#add
|
||||||
"""
|
"""
|
||||||
if docs is None or hasattr(docs, "__call__"): # old API
|
if docs is None or hasattr(docs, "__call__"): # old API
|
||||||
on_match = docs
|
on_match = docs
|
||||||
|
@ -228,7 +228,7 @@ cdef class PhraseMatcher:
|
||||||
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
||||||
to True, a list of Span objects is returned.
|
to True, a list of Span objects is returned.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#call
|
DOCS: https://nightly.spacy.io/api/phrasematcher#call
|
||||||
"""
|
"""
|
||||||
matches = []
|
matches = []
|
||||||
if doc is None or len(doc) == 0:
|
if doc is None or len(doc) == 0:
|
||||||
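The `PhraseMatcher` docstrings above describe adding patterns as `Doc` objects and calling the matcher on a `Doc` (optionally returning `Span` objects via `as_spans`). A short sketch with illustrative terms:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer.
patterns = [nlp.make_doc(term) for term in ["machine learning", "phrase matcher"]]
matcher.add("TECH", patterns)

doc = nlp("The phrase matcher finds Machine Learning mentions.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)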
|
|
|
@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.KBFromFile.v1")
|
@registry.misc.register("spacy.KBFromFile.v1")
|
||||||
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
def kb_from_file(vocab):
|
def kb_from_file(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
||||||
|
@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
return kb_from_file
|
return kb_from_file
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.EmptyKB.v1")
|
@registry.misc.register("spacy.EmptyKB.v1")
|
||||||
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
def empty_kb_factory(vocab):
|
def empty_kb_factory(vocab):
|
||||||
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||||
|
@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
return empty_kb_factory
|
return empty_kb_factory
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.CandidateGenerator.v1")
|
@registry.misc.register("spacy.CandidateGenerator.v1")
|
||||||
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
||||||
return get_candidates
|
return get_candidates
|
||||||
|
|
|
@ -38,7 +38,7 @@ class AttributeRuler(Pipe):
|
||||||
"""Set token-level attributes for tokens matched by Matcher patterns.
|
"""Set token-level attributes for tokens matched by Matcher patterns.
|
||||||
Additionally supports importing patterns from tag maps and morph rules.
|
Additionally supports importing patterns from tag maps and morph rules.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler
|
DOCS: https://nightly.spacy.io/api/attributeruler
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -59,7 +59,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#init
|
DOCS: https://nightly.spacy.io/api/attributeruler#init
|
||||||
"""
|
"""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -77,7 +77,7 @@ class AttributeRuler(Pipe):
|
||||||
doc (Doc): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#call
|
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
||||||
"""
|
"""
|
||||||
matches = sorted(self.matcher(doc))
|
matches = sorted(self.matcher(doc))
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ class AttributeRuler(Pipe):
|
||||||
tag_map (dict): The tag map that maps fine-grained tags to
|
tag_map (dict): The tag map that maps fine-grained tags to
|
||||||
coarse-grained tags and morphological features.
|
coarse-grained tags and morphological features.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#load_from_tag_map
|
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_tag_map
|
||||||
"""
|
"""
|
||||||
for tag, attrs in tag_map.items():
|
for tag, attrs in tag_map.items():
|
||||||
pattern = [{"TAG": tag}]
|
pattern = [{"TAG": tag}]
|
||||||
|
@ -139,7 +139,7 @@ class AttributeRuler(Pipe):
|
||||||
fine-grained tags to coarse-grained tags, lemmas and morphological
|
fine-grained tags to coarse-grained tags, lemmas and morphological
|
||||||
features.
|
features.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
|
||||||
"""
|
"""
|
||||||
for tag in morph_rules:
|
for tag in morph_rules:
|
||||||
for word in morph_rules[tag]:
|
for word in morph_rules[tag]:
|
||||||
|
@ -163,7 +163,7 @@ class AttributeRuler(Pipe):
|
||||||
index (int): The index of the token in the matched span to modify. May
|
index (int): The index of the token in the matched span to modify. May
|
||||||
be negative to index from the end of the span. Defaults to 0.
|
be negative to index from the end of the span. Defaults to 0.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#add
|
DOCS: https://nightly.spacy.io/api/attributeruler#add
|
||||||
"""
|
"""
|
||||||
self.matcher.add(len(self.attrs), patterns)
|
self.matcher.add(len(self.attrs), patterns)
|
||||||
self._attrs_unnormed.append(attrs)
|
self._attrs_unnormed.append(attrs)
|
||||||
|
@ -178,7 +178,7 @@ class AttributeRuler(Pipe):
|
||||||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||||
add as patterns.
|
add as patterns.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#add_patterns
|
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
||||||
"""
|
"""
|
||||||
for p in pattern_dicts:
|
for p in pattern_dicts:
|
||||||
self.add(**p)
|
self.add(**p)
|
||||||
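`AttributeRuler.add_patterns` above expects dicts with the same keys as `add` (patterns, attrs and an optional index). A minimal sketch, assuming the component is created through an "attribute_ruler" factory name, which is not shown in this diff:

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")  # factory name assumed
ruler.add_patterns([
    {
        "patterns": [[{"ORTH": "Dr."}]],  # Matcher patterns
        "attrs": {"LEMMA": "doctor"},     # attributes to set on the matched token
        "index": 0,                       # which token in the matched span
    }
])
doc = nlp("Dr. Smith arrived.")
print(doc[0].lemma_)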
|
@ -203,7 +203,7 @@ class AttributeRuler(Pipe):
|
||||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
||||||
and "lemma" for the target token attributes.
|
and "lemma" for the target token attributes.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#score
|
DOCS: https://nightly.spacy.io/api/attributeruler#score
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "AttributeRuler.score")
|
validate_examples(examples, "AttributeRuler.score")
|
||||||
results = {}
|
results = {}
|
||||||
|
@ -227,7 +227,7 @@ class AttributeRuler(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
RETURNS (bytes): The serialized object.
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#to_bytes
|
DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -243,7 +243,7 @@ class AttributeRuler(Pipe):
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
returns (AttributeRuler): The loaded object.
|
returns (AttributeRuler): The loaded object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#from_bytes
|
DOCS: https://nightly.spacy.io/api/attributeruler#from_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_patterns(b):
|
def load_patterns(b):
|
||||||
|
@ -264,7 +264,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
DOCS: https://spacy.io/api/attributeruler#to_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
||||||
"""
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -279,7 +279,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
path (Union[Path, str]): A path to a directory.
|
path (Union[Path, str]): A path to a directory.
|
||||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
DOCS: https://spacy.io/api/attributeruler#from_disk
|
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def load_patterns(p):
|
def load_patterns(p):
|
||||||
|
|
|
@@ -105,7 +105,7 @@ def make_parser(
cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing.

-DOCS: https://spacy.io/api/dependencyparser
+DOCS: https://nightly.spacy.io/api/dependencyparser
"""
TransitionSystem = ArcEager

@@ -146,7 +146,7 @@ cdef class DependencyParser(Parser):
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.

-DOCS: https://spacy.io/api/dependencyparser#score
+DOCS: https://nightly.spacy.io/api/dependencyparser#score
"""
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
@@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
-"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
+"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"incl_prior": True,
"incl_context": True,
-"get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
+"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
},
)
def make_entity_linker(
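Note: the two `@assets` to `@misc` replacements above mean these pluggable entries in the default config are now resolved from spaCy's `misc` registry. A minimal usage sketch, not part of this commit (the blank English pipeline and the explicit override are illustrative assumptions):

    import spacy

    nlp = spacy.blank("en")
    # The same registered reference the default config uses can be passed
    # explicitly as a config override when adding the component.
    entity_linker = nlp.add_pipe(
        "entity_linker",
        config={"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}},
    )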
@@ -83,7 +83,7 @@ def make_entity_linker(
class EntityLinker(Pipe):
"""Pipeline component for named entity linking.

-DOCS: https://spacy.io/api/entitylinker
+DOCS: https://nightly.spacy.io/api/entitylinker
"""

NIL = "NIL" # string used to refer to a non-existing link
@@ -111,7 +111,7 @@ class EntityLinker(Pipe):
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.

-DOCS: https://spacy.io/api/entitylinker#init
+DOCS: https://nightly.spacy.io/api/entitylinker#init
"""
self.vocab = vocab
self.model = model
@@ -151,7 +151,7 @@ class EntityLinker(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/entitylinker#begin_training
+DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
"""
self.require_kb()
nO = self.kb.entity_vector_length
@@ -182,7 +182,7 @@ class EntityLinker(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

-DOCS: https://spacy.io/api/entitylinker#update
+DOCS: https://nightly.spacy.io/api/entitylinker#update
"""
self.require_kb()
if losses is None:
@@ -264,7 +264,7 @@ class EntityLinker(Pipe):
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.

-DOCS: https://spacy.io/api/entitylinker#call
+DOCS: https://nightly.spacy.io/api/entitylinker#call
"""
kb_ids = self.predict([doc])
self.set_annotations([doc], kb_ids)
@@ -279,7 +279,7 @@ class EntityLinker(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

-DOCS: https://spacy.io/api/entitylinker#pipe
+DOCS: https://nightly.spacy.io/api/entitylinker#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
kb_ids = self.predict(docs)
@@ -294,7 +294,7 @@ class EntityLinker(Pipe):
docs (Iterable[Doc]): The documents to predict.
RETURNS (List[int]): The models prediction for each document.

-DOCS: https://spacy.io/api/entitylinker#predict
+DOCS: https://nightly.spacy.io/api/entitylinker#predict
"""
self.require_kb()
entity_count = 0
@@ -391,7 +391,7 @@ class EntityLinker(Pipe):
docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

-DOCS: https://spacy.io/api/entitylinker#set_annotations
+DOCS: https://nightly.spacy.io/api/entitylinker#set_annotations
"""
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
@@ -412,7 +412,7 @@ class EntityLinker(Pipe):
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/entitylinker#to_disk
+DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
"""
serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
@@ -430,7 +430,7 @@ class EntityLinker(Pipe):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (EntityLinker): The modified EntityLinker object.

-DOCS: https://spacy.io/api/entitylinker#from_disk
+DOCS: https://nightly.spacy.io/api/entitylinker#from_disk
"""

def load_model(p):
@@ -53,8 +53,8 @@ class EntityRuler:
purely rule-based entity recognition system. After initialization, the
component is typically added to the pipeline using `nlp.add_pipe`.

-DOCS: https://spacy.io/api/entityruler
+DOCS: https://nightly.spacy.io/api/entityruler
-USAGE: https://spacy.io/usage/rule-based-matching#entityruler
+USAGE: https://nightly.spacy.io/usage/rule-based-matching#entityruler
"""

def __init__(
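Note: as the docstring above says, the component is typically added via `nlp.add_pipe`. A short usage sketch, assuming spaCy v3 and a blank English pipeline (the example patterns are illustrative only, not part of this commit):

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([
        {"label": "ORG", "pattern": "spaCy"},
        {"label": "GPE", "pattern": [{"LOWER": "new"}, {"LOWER": "york"}]},
    ])
    doc = nlp("spaCy was built in New York")
    print([(ent.text, ent.label_) for ent in doc.ents])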
@@ -88,7 +88,7 @@ class EntityRuler:
added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.

-DOCS: https://spacy.io/api/entityruler#init
+DOCS: https://nightly.spacy.io/api/entityruler#init
"""
self.nlp = nlp
self.name = name
@@ -127,7 +127,7 @@ class EntityRuler:
doc (Doc): The Doc object in the pipeline.
RETURNS (Doc): The Doc with added entities, if available.

-DOCS: https://spacy.io/api/entityruler#call
+DOCS: https://nightly.spacy.io/api/entityruler#call
"""
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set(
@@ -165,7 +165,7 @@ class EntityRuler:

RETURNS (set): The string labels.

-DOCS: https://spacy.io/api/entityruler#labels
+DOCS: https://nightly.spacy.io/api/entityruler#labels
"""
keys = set(self.token_patterns.keys())
keys.update(self.phrase_patterns.keys())
@@ -185,7 +185,7 @@ class EntityRuler:

RETURNS (set): The string entity ids.

-DOCS: https://spacy.io/api/entityruler#ent_ids
+DOCS: https://nightly.spacy.io/api/entityruler#ent_ids
"""
keys = set(self.token_patterns.keys())
keys.update(self.phrase_patterns.keys())
@@ -203,7 +203,7 @@ class EntityRuler:

RETURNS (list): The original patterns, one dictionary per pattern.

-DOCS: https://spacy.io/api/entityruler#patterns
+DOCS: https://nightly.spacy.io/api/entityruler#patterns
"""
all_patterns = []
for label, patterns in self.token_patterns.items():
@@ -230,7 +230,7 @@ class EntityRuler:

patterns (list): The patterns to add.

-DOCS: https://spacy.io/api/entityruler#add_patterns
+DOCS: https://nightly.spacy.io/api/entityruler#add_patterns
"""

# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
@@ -324,7 +324,7 @@ class EntityRuler:
patterns_bytes (bytes): The bytestring to load.
RETURNS (EntityRuler): The loaded entity ruler.

-DOCS: https://spacy.io/api/entityruler#from_bytes
+DOCS: https://nightly.spacy.io/api/entityruler#from_bytes
"""
cfg = srsly.msgpack_loads(patterns_bytes)
self.clear()
@@ -346,7 +346,7 @@ class EntityRuler:

RETURNS (bytes): The serialized patterns.

-DOCS: https://spacy.io/api/entityruler#to_bytes
+DOCS: https://nightly.spacy.io/api/entityruler#to_bytes
"""
serial = {
"overwrite": self.overwrite,
@@ -365,7 +365,7 @@ class EntityRuler:
path (str / Path): The JSONL file to load.
RETURNS (EntityRuler): The loaded entity ruler.

-DOCS: https://spacy.io/api/entityruler#from_disk
+DOCS: https://nightly.spacy.io/api/entityruler#from_disk
"""
path = ensure_path(path)
self.clear()
@@ -401,7 +401,7 @@ class EntityRuler:

path (str / Path): The JSONL file to save.

-DOCS: https://spacy.io/api/entityruler#to_disk
+DOCS: https://nightly.spacy.io/api/entityruler#to_disk
"""
path = ensure_path(path)
cfg = {
@@ -15,7 +15,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged noun chunks.

-DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks
+DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
"""
if not doc.is_parsed:
return doc
@@ -37,7 +37,7 @@ def merge_entities(doc: Doc):
doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged entities.

-DOCS: https://spacy.io/api/pipeline-functions#merge_entities
+DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_entities
"""
with doc.retokenize() as retokenizer:
for ent in doc.ents:
@@ -54,7 +54,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens.

-DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
+DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_subtokens
"""
# TODO: make stateful component with "label" config
merger = Matcher(doc.vocab)
@@ -43,7 +43,7 @@ class Lemmatizer(Pipe):
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
lookup tables.

-DOCS: https://spacy.io/api/lemmatizer
+DOCS: https://nightly.spacy.io/api/lemmatizer
"""

@classmethod
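Note: a hedged sketch of enabling the lookup mode mentioned above (assumes the separate spacy-lookups-data package is installed to provide the tables; neither the package nor this snippet is part of this commit):

    import spacy

    nlp = spacy.blank("en")
    # mode may be "lookup" or "rule"; lookup tables come from spacy-lookups-data
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"})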
@@ -54,7 +54,7 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode.
RETURNS (dict): The lookups configuration settings for this mode.

-DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
+DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
"""
if mode == "lookup":
return {
@@ -80,7 +80,7 @@ class Lemmatizer(Pipe):
lookups should be loaded.
RETURNS (Lookups): The Lookups object.

-DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
+DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
"""
config = cls.get_lookups_config(mode)
required_tables = config.get("required_tables", [])
@@ -123,7 +123,7 @@ class Lemmatizer(Pipe):
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.

-DOCS: https://spacy.io/api/lemmatizer#init
+DOCS: https://nightly.spacy.io/api/lemmatizer#init
"""
self.vocab = vocab
self.model = model
@@ -152,7 +152,7 @@ class Lemmatizer(Pipe):
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.

-DOCS: https://spacy.io/api/lemmatizer#call
+DOCS: https://nightly.spacy.io/api/lemmatizer#call
"""
for token in doc:
if self.overwrite or token.lemma == 0:
@@ -168,7 +168,7 @@ class Lemmatizer(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

-DOCS: https://spacy.io/api/lemmatizer#pipe
+DOCS: https://nightly.spacy.io/api/lemmatizer#pipe
"""
for doc in stream:
doc = self(doc)
@@ -180,7 +180,7 @@ class Lemmatizer(Pipe):
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.

-DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
+DOCS: https://nightly.spacy.io/api/lemmatizer#lookup_lemmatize
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
result = lookup_table.get(token.text, token.text)
@@ -194,7 +194,7 @@ class Lemmatizer(Pipe):
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.

-DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
+DOCS: https://nightly.spacy.io/api/lemmatizer#rule_lemmatize
"""
cache_key = (token.orth, token.pos, token.morph)
if cache_key in self.cache:
@@ -260,7 +260,7 @@ class Lemmatizer(Pipe):
token (Token): The token.
RETURNS (bool): Whether the token is a base form.

-DOCS: https://spacy.io/api/lemmatizer#is_base_form
+DOCS: https://nightly.spacy.io/api/lemmatizer#is_base_form
"""
return False

@@ -270,7 +270,7 @@ class Lemmatizer(Pipe):
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.

-DOCS: https://spacy.io/api/lemmatizer#score
+DOCS: https://nightly.spacy.io/api/lemmatizer#score
"""
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
@@ -282,7 +282,7 @@ class Lemmatizer(Pipe):
it doesn't exist.
exclude (list): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/vocab#to_disk
+DOCS: https://nightly.spacy.io/api/vocab#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
@@ -297,7 +297,7 @@ class Lemmatizer(Pipe):
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.

-DOCS: https://spacy.io/api/vocab#to_disk
+DOCS: https://nightly.spacy.io/api/vocab#to_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
@@ -310,7 +310,7 @@ class Lemmatizer(Pipe):
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.

-DOCS: https://spacy.io/api/vocab#to_bytes
+DOCS: https://nightly.spacy.io/api/vocab#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
@@ -324,7 +324,7 @@ class Lemmatizer(Pipe):
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.

-DOCS: https://spacy.io/api/vocab#from_bytes
+DOCS: https://nightly.spacy.io/api/vocab#from_bytes
"""
deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
@@ -79,7 +79,7 @@ class Morphologizer(Tagger):
labels_morph (dict): Mapping of morph + POS tags to morph labels.
labels_pos (dict): Mapping of morph + POS tags to POS tags.

-DOCS: https://spacy.io/api/morphologizer#init
+DOCS: https://nightly.spacy.io/api/morphologizer#init
"""
self.vocab = vocab
self.model = model
@@ -106,7 +106,7 @@ class Morphologizer(Tagger):
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.

-DOCS: https://spacy.io/api/morphologizer#add_label
+DOCS: https://nightly.spacy.io/api/morphologizer#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
@@ -139,7 +139,7 @@ class Morphologizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/morphologizer#begin_training
+DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
"""
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
@@ -169,7 +169,7 @@ class Morphologizer(Tagger):
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.

-DOCS: https://spacy.io/api/morphologizer#set_annotations
+DOCS: https://nightly.spacy.io/api/morphologizer#set_annotations
"""
if isinstance(docs, Doc):
docs = [docs]
@@ -194,7 +194,7 @@ class Morphologizer(Tagger):
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.

-DOCS: https://spacy.io/api/morphologizer#get_loss
+DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
"""
validate_examples(examples, "Morphologizer.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
@@ -231,7 +231,7 @@ class Morphologizer(Tagger):
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".

-DOCS: https://spacy.io/api/morphologizer#score
+DOCS: https://nightly.spacy.io/api/morphologizer#score
"""
validate_examples(examples, "Morphologizer.score")
results = {}
@@ -247,7 +247,7 @@ class Morphologizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.

-DOCS: https://spacy.io/api/morphologizer#to_bytes
+DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
"""
serialize = {}
serialize["model"] = self.model.to_bytes
@@ -262,7 +262,7 @@ class Morphologizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Morphologizer): The loaded Morphologizer.

-DOCS: https://spacy.io/api/morphologizer#from_bytes
+DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
"""
def load_model(b):
try:
@@ -284,7 +284,7 @@ class Morphologizer(Tagger):
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/morphologizer#to_disk
+DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
"""
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
@@ -300,7 +300,7 @@ class Morphologizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Morphologizer): The modified Morphologizer object.

-DOCS: https://spacy.io/api/morphologizer#from_disk
+DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
"""
def load_model(p):
with p.open("rb") as file_:
@@ -88,7 +88,7 @@ def make_ner(
cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.

-DOCS: https://spacy.io/api/entityrecognizer
+DOCS: https://nightly.spacy.io/api/entityrecognizer
"""
TransitionSystem = BiluoPushDown

@@ -119,7 +119,7 @@ cdef class EntityRecognizer(Parser):
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.

-DOCS: https://spacy.io/api/entityrecognizer#score
+DOCS: https://nightly.spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return Scorer.score_spans(examples, "ents", **kwargs)
@@ -15,7 +15,7 @@ cdef class Pipe:
from it and it defines the interface that components should follow to
function as trainable components in a spaCy pipeline.

-DOCS: https://spacy.io/api/pipe
+DOCS: https://nightly.spacy.io/api/pipe
"""
def __init__(self, vocab, model, name, **cfg):
"""Initialize a pipeline component.
@@ -25,7 +25,7 @@ cdef class Pipe:
name (str): The component instance name.
**cfg: Additonal settings and config parameters.

-DOCS: https://spacy.io/api/pipe#init
+DOCS: https://nightly.spacy.io/api/pipe#init
"""
self.vocab = vocab
self.model = model
@@ -40,7 +40,7 @@ cdef class Pipe:
docs (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.

-DOCS: https://spacy.io/api/pipe#call
+DOCS: https://nightly.spacy.io/api/pipe#call
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
@@ -55,7 +55,7 @@ cdef class Pipe:
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

-DOCS: https://spacy.io/api/pipe#pipe
+DOCS: https://nightly.spacy.io/api/pipe#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
scores = self.predict(docs)
@@ -69,7 +69,7 @@ cdef class Pipe:
docs (Iterable[Doc]): The documents to predict.
RETURNS: Vector representations for each token in the documents.

-DOCS: https://spacy.io/api/pipe#predict
+DOCS: https://nightly.spacy.io/api/pipe#predict
"""
raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))

@@ -79,7 +79,7 @@ cdef class Pipe:
docs (Iterable[Doc]): The documents to modify.
scores: The scores to assign.

-DOCS: https://spacy.io/api/pipe#set_annotations
+DOCS: https://nightly.spacy.io/api/pipe#set_annotations
"""
raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))

@@ -96,7 +96,7 @@ cdef class Pipe:
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

-DOCS: https://spacy.io/api/pipe#update
+DOCS: https://nightly.spacy.io/api/pipe#update
"""
if losses is None:
losses = {}
@@ -132,7 +132,7 @@ cdef class Pipe:
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

-DOCS: https://spacy.io/api/pipe#rehearse
+DOCS: https://nightly.spacy.io/api/pipe#rehearse
"""
pass

@@ -144,7 +144,7 @@ cdef class Pipe:
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.

-DOCS: https://spacy.io/api/pipe#get_loss
+DOCS: https://nightly.spacy.io/api/pipe#get_loss
"""
raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))

@@ -156,7 +156,7 @@ cdef class Pipe:
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.

-DOCS: https://spacy.io/api/pipe#add_label
+DOCS: https://nightly.spacy.io/api/pipe#add_label
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))

@@ -165,7 +165,7 @@ cdef class Pipe:

RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/pipe#create_optimizer
+DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
"""
return util.create_default_optimizer()

@@ -181,7 +181,7 @@ cdef class Pipe:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/pipe#begin_training
+DOCS: https://nightly.spacy.io/api/pipe#begin_training
"""
self.model.initialize()
if sgd is None:
@@ -200,7 +200,7 @@ cdef class Pipe:

params (dict): The parameter values to use in the model.

-DOCS: https://spacy.io/api/pipe#use_params
+DOCS: https://nightly.spacy.io/api/pipe#use_params
"""
with self.model.use_params(params):
yield
@@ -211,7 +211,7 @@ cdef class Pipe:
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.

-DOCS: https://spacy.io/api/pipe#score
+DOCS: https://nightly.spacy.io/api/pipe#score
"""
return {}

@@ -221,7 +221,7 @@ cdef class Pipe:
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.

-DOCS: https://spacy.io/api/pipe#to_bytes
+DOCS: https://nightly.spacy.io/api/pipe#to_bytes
"""
serialize = {}
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
@@ -236,7 +236,7 @@ cdef class Pipe:
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Pipe): The loaded object.

-DOCS: https://spacy.io/api/pipe#from_bytes
+DOCS: https://nightly.spacy.io/api/pipe#from_bytes
"""

def load_model(b):
@@ -259,7 +259,7 @@ cdef class Pipe:
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/pipe#to_disk
+DOCS: https://nightly.spacy.io/api/pipe#to_disk
"""
serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
@@ -274,7 +274,7 @@ cdef class Pipe:
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Pipe): The loaded object.

-DOCS: https://spacy.io/api/pipe#from_disk
+DOCS: https://nightly.spacy.io/api/pipe#from_disk
"""

def load_model(p):
@@ -29,7 +29,7 @@ def make_sentencizer(
class Sentencizer(Pipe):
"""Segment the Doc into sentences using a rule-based strategy.

-DOCS: https://spacy.io/api/sentencizer
+DOCS: https://nightly.spacy.io/api/sentencizer
"""

default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
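Note: a minimal usage sketch for the rule-based sentencizer (illustrative only, not part of this commit):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")
    doc = nlp("This is a sentence. This is another one.")
    print([sent.text for sent in doc.sents])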
@@ -51,7 +51,7 @@ class Sentencizer(Pipe):
serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component.

-DOCS: https://spacy.io/api/sentencizer#init
+DOCS: https://nightly.spacy.io/api/sentencizer#init
"""
self.name = name
if punct_chars:
@@ -68,7 +68,7 @@ class Sentencizer(Pipe):
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.

-DOCS: https://spacy.io/api/sentencizer#call
+DOCS: https://nightly.spacy.io/api/sentencizer#call
"""
start = 0
seen_period = False
@@ -94,7 +94,7 @@ class Sentencizer(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

-DOCS: https://spacy.io/api/sentencizer#pipe
+DOCS: https://nightly.spacy.io/api/sentencizer#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
@@ -157,7 +157,7 @@ class Sentencizer(Pipe):
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.

-DOCS: https://spacy.io/api/sentencizer#score
+DOCS: https://nightly.spacy.io/api/sentencizer#score
"""
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
@@ -169,7 +169,7 @@ class Sentencizer(Pipe):

RETURNS (bytes): The serialized object.

-DOCS: https://spacy.io/api/sentencizer#to_bytes
+DOCS: https://nightly.spacy.io/api/sentencizer#to_bytes
"""
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})

@@ -179,7 +179,7 @@ class Sentencizer(Pipe):
bytes_data (bytes): The data to load.
returns (Sentencizer): The loaded object.

-DOCS: https://spacy.io/api/sentencizer#from_bytes
+DOCS: https://nightly.spacy.io/api/sentencizer#from_bytes
"""
cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
@@ -188,7 +188,7 @@ class Sentencizer(Pipe):
def to_disk(self, path, *, exclude=tuple()):
"""Serialize the sentencizer to disk.

-DOCS: https://spacy.io/api/sentencizer#to_disk
+DOCS: https://nightly.spacy.io/api/sentencizer#to_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
@@ -198,7 +198,7 @@ class Sentencizer(Pipe):
def from_disk(self, path, *, exclude=tuple()):
"""Load the sentencizer from disk.

-DOCS: https://spacy.io/api/sentencizer#from_disk
+DOCS: https://nightly.spacy.io/api/sentencizer#from_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
@@ -44,7 +44,7 @@ def make_senter(nlp: Language, name: str, model: Model):
class SentenceRecognizer(Tagger):
"""Pipeline component for sentence segmentation.

-DOCS: https://spacy.io/api/sentencerecognizer
+DOCS: https://nightly.spacy.io/api/sentencerecognizer
"""
def __init__(self, vocab, model, name="senter"):
"""Initialize a sentence recognizer.
@@ -54,7 +54,7 @@ class SentenceRecognizer(Tagger):
name (str): The component instance name, used to add entries to the
losses during training.

-DOCS: https://spacy.io/api/sentencerecognizer#init
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#init
"""
self.vocab = vocab
self.model = model
@@ -76,7 +76,7 @@ class SentenceRecognizer(Tagger):
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.

-DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#set_annotations
"""
if isinstance(docs, Doc):
docs = [docs]
@@ -101,7 +101,7 @@ class SentenceRecognizer(Tagger):
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.

-DOCS: https://spacy.io/api/sentencerecognizer#get_loss
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
"""
validate_examples(examples, "SentenceRecognizer.get_loss")
labels = self.labels
@@ -135,7 +135,7 @@ class SentenceRecognizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/sentencerecognizer#begin_training
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
"""
self.set_output(len(self.labels))
self.model.initialize()
@@ -151,7 +151,7 @@ class SentenceRecognizer(Tagger):

examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
-DOCS: https://spacy.io/api/sentencerecognizer#score
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
"""
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
@@ -164,7 +164,7 @@ class SentenceRecognizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.

-DOCS: https://spacy.io/api/sentencerecognizer#to_bytes
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
"""
serialize = {}
serialize["model"] = self.model.to_bytes
@@ -179,7 +179,7 @@ class SentenceRecognizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The loaded SentenceRecognizer.

-DOCS: https://spacy.io/api/sentencerecognizer#from_bytes
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
"""
def load_model(b):
try:
@@ -201,7 +201,7 @@ class SentenceRecognizer(Tagger):
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/sentencerecognizer#to_disk
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
"""
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
@@ -217,7 +217,7 @@ class SentenceRecognizer(Tagger):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The modified SentenceRecognizer object.

-DOCS: https://spacy.io/api/sentencerecognizer#from_disk
+DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
"""
def load_model(p):
with p.open("rb") as file_:
@@ -78,7 +78,7 @@ class SimpleNER(Pipe):
def add_label(self, label: str) -> None:
"""Add a new label to the pipe.
label (str): The label to add.
-DOCS: https://spacy.io/api/simplener#add_label
+DOCS: https://nightly.spacy.io/api/simplener#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
@@ -58,7 +58,7 @@ def make_tagger(nlp: Language, name: str, model: Model):
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.

-DOCS: https://spacy.io/api/tagger
+DOCS: https://nightly.spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger", *, labels=None):
"""Initialize a part-of-speech tagger.
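Note: the tagger is a trainable component, so in practice it is used from a trained pipeline rather than added to a blank one. A rough sketch, assuming the en_core_web_sm package is installed (an assumption, not part of this commit):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The quick brown fox jumps over the lazy dog")
    print([(token.text, token.tag_) for token in doc])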
@@ -69,7 +69,7 @@ class Tagger(Pipe):
losses during training.
labels (List): The set of labels. Defaults to None.

-DOCS: https://spacy.io/api/tagger#init
+DOCS: https://nightly.spacy.io/api/tagger#init
"""
self.vocab = vocab
self.model = model
@@ -86,7 +86,7 @@ class Tagger(Pipe):

RETURNS (Tuple[str]): The labels.

-DOCS: https://spacy.io/api/tagger#labels
+DOCS: https://nightly.spacy.io/api/tagger#labels
"""
return tuple(self.cfg["labels"])

@@ -96,7 +96,7 @@ class Tagger(Pipe):
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.

-DOCS: https://spacy.io/api/tagger#call
+DOCS: https://nightly.spacy.io/api/tagger#call
"""
tags = self.predict([doc])
self.set_annotations([doc], tags)
@@ -111,7 +111,7 @@ class Tagger(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

-DOCS: https://spacy.io/api/tagger#pipe
+DOCS: https://nightly.spacy.io/api/tagger#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs)
@@ -124,7 +124,7 @@ class Tagger(Pipe):
docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.

-DOCS: https://spacy.io/api/tagger#predict
+DOCS: https://nightly.spacy.io/api/tagger#predict
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
@@ -153,7 +153,7 @@ class Tagger(Pipe):
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by Tagger.predict.

-DOCS: https://spacy.io/api/tagger#set_annotations
+DOCS: https://nightly.spacy.io/api/tagger#set_annotations
"""
if isinstance(docs, Doc):
docs = [docs]
@@ -182,7 +182,7 @@ class Tagger(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

-DOCS: https://spacy.io/api/tagger#update
+DOCS: https://nightly.spacy.io/api/tagger#update
"""
if losses is None:
losses = {}
@@ -220,7 +220,7 @@ class Tagger(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

-DOCS: https://spacy.io/api/tagger#rehearse
+DOCS: https://nightly.spacy.io/api/tagger#rehearse
"""
validate_examples(examples, "Tagger.rehearse")
docs = [eg.predicted for eg in examples]
@@ -247,7 +247,7 @@ class Tagger(Pipe):
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.

-DOCS: https://spacy.io/api/tagger#get_loss
+DOCS: https://nightly.spacy.io/api/tagger#get_loss
"""
validate_examples(examples, "Tagger.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
@@ -269,7 +269,7 @@ class Tagger(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

-DOCS: https://spacy.io/api/tagger#begin_training
+DOCS: https://nightly.spacy.io/api/tagger#begin_training
"""
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
@@ -307,7 +307,7 @@ class Tagger(Pipe):
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.

-DOCS: https://spacy.io/api/tagger#add_label
+DOCS: https://nightly.spacy.io/api/tagger#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
@@ -324,7 +324,7 @@ class Tagger(Pipe):
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".

-DOCS: https://spacy.io/api/tagger#score
+DOCS: https://nightly.spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
@@ -335,7 +335,7 @@ class Tagger(Pipe):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.

-DOCS: https://spacy.io/api/tagger#to_bytes
+DOCS: https://nightly.spacy.io/api/tagger#to_bytes
"""
serialize = {}
serialize["model"] = self.model.to_bytes
@@ -350,7 +350,7 @@ class Tagger(Pipe):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The loaded Tagger.

-DOCS: https://spacy.io/api/tagger#from_bytes
+DOCS: https://nightly.spacy.io/api/tagger#from_bytes
"""
def load_model(b):
try:
@@ -372,7 +372,7 @@ class Tagger(Pipe):
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/tagger#to_disk
+DOCS: https://nightly.spacy.io/api/tagger#to_disk
"""
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
@@ -388,7 +388,7 @@ class Tagger(Pipe):
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The modified Tagger object.

-DOCS: https://spacy.io/api/tagger#from_disk
+DOCS: https://nightly.spacy.io/api/tagger#from_disk
"""
def load_model(p):
with p.open("rb") as file_:
@@ -92,7 +92,7 @@ def make_textcat(
class TextCategorizer(Pipe):
"""Pipeline component for text classification.

DOCS: https://spacy.io/api/textcategorizer
DOCS: https://nightly.spacy.io/api/textcategorizer
"""

def __init__(
@@ -111,7 +111,7 @@ class TextCategorizer(Pipe):
losses during training.
labels (Iterable[str]): The labels to use.

DOCS: https://spacy.io/api/textcategorizer#init
DOCS: https://nightly.spacy.io/api/textcategorizer#init
"""
self.vocab = vocab
self.model = model
@@ -124,7 +124,7 @@ class TextCategorizer(Pipe):
def labels(self) -> Tuple[str]:
"""RETURNS (Tuple[str]): The labels currently added to the component.

DOCS: https://spacy.io/api/textcategorizer#labels
DOCS: https://nightly.spacy.io/api/textcategorizer#labels
"""
return tuple(self.cfg.setdefault("labels", []))

@@ -146,7 +146,7 @@ class TextCategorizer(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

DOCS: https://spacy.io/api/textcategorizer#pipe
DOCS: https://nightly.spacy.io/api/textcategorizer#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
scores = self.predict(docs)
@@ -159,7 +159,7 @@ class TextCategorizer(Pipe):
docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.

DOCS: https://spacy.io/api/textcategorizer#predict
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
"""
tensors = [doc.tensor for doc in docs]
if not any(len(doc) for doc in docs):
@@ -177,7 +177,7 @@ class TextCategorizer(Pipe):
docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by TextCategorizer.predict.

DOCS: https://spacy.io/api/textcategorizer#set_annotations
DOCS: https://nightly.spacy.io/api/textcategorizer#set_annotations
"""
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
@@ -204,7 +204,7 @@ class TextCategorizer(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/textcategorizer#update
DOCS: https://nightly.spacy.io/api/textcategorizer#update
"""
if losses is None:
losses = {}
@@ -245,7 +245,7 @@ class TextCategorizer(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/textcategorizer#rehearse
DOCS: https://nightly.spacy.io/api/textcategorizer#rehearse
"""
if losses is not None:
losses.setdefault(self.name, 0.0)
@@ -289,7 +289,7 @@ class TextCategorizer(Pipe):
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.

DOCS: https://spacy.io/api/textcategorizer#get_loss
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
"""
validate_examples(examples, "TextCategorizer.get_loss")
truths, not_missing = self._examples_to_truth(examples)
@@ -305,7 +305,7 @@ class TextCategorizer(Pipe):
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.

DOCS: https://spacy.io/api/textcategorizer#add_label
DOCS: https://nightly.spacy.io/api/textcategorizer#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
@@ -343,7 +343,7 @@ class TextCategorizer(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://spacy.io/api/textcategorizer#begin_training
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
"""
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
@@ -378,7 +378,7 @@ class TextCategorizer(Pipe):
positive_label (str): Optional positive label.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

DOCS: https://spacy.io/api/textcategorizer#score
DOCS: https://nightly.spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
return Scorer.score_cats(

@@ -56,7 +56,7 @@ class Tok2Vec(Pipe):
a list of Doc objects as input, and output a list of 2d float arrays.
name (str): The component instance name.

DOCS: https://spacy.io/api/tok2vec#init
DOCS: https://nightly.spacy.io/api/tok2vec#init
"""
self.vocab = vocab
self.model = model
@@ -91,7 +91,7 @@ class Tok2Vec(Pipe):
docs (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.

DOCS: https://spacy.io/api/tok2vec#call
DOCS: https://nightly.spacy.io/api/tok2vec#call
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
@@ -106,7 +106,7 @@ class Tok2Vec(Pipe):
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.

DOCS: https://spacy.io/api/tok2vec#pipe
DOCS: https://nightly.spacy.io/api/tok2vec#pipe
"""
for docs in minibatch(stream, batch_size):
docs = list(docs)
@@ -121,7 +121,7 @@ class Tok2Vec(Pipe):
docs (Iterable[Doc]): The documents to predict.
RETURNS: Vector representations for each token in the documents.

DOCS: https://spacy.io/api/tok2vec#predict
DOCS: https://nightly.spacy.io/api/tok2vec#predict
"""
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
@@ -135,7 +135,7 @@ class Tok2Vec(Pipe):
docs (Iterable[Doc]): The documents to modify.
tokvecses: The tensors to set, produced by Tok2Vec.predict.

DOCS: https://spacy.io/api/tok2vec#set_annotations
DOCS: https://nightly.spacy.io/api/tok2vec#set_annotations
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
@@ -162,7 +162,7 @@ class Tok2Vec(Pipe):
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/tok2vec#update
DOCS: https://nightly.spacy.io/api/tok2vec#update
"""
if losses is None:
losses = {}
@@ -220,7 +220,7 @@ class Tok2Vec(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://spacy.io/api/tok2vec#begin_training
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
"""
docs = [Doc(self.vocab, words=["hello"])]
self.model.initialize(X=docs)

@@ -6,6 +6,7 @@ from itertools import islice
from libcpp.vector cimport vector
from libc.string cimport memset
from libc.stdlib cimport calloc, free
import random

import srsly
from thinc.api import set_dropout_rate
@@ -275,22 +276,22 @@ cdef class Parser(Pipe):
# Prepare the stepwise model, and get the callback for finishing the batch
model, backprop_tok2vec = self.model.begin_update(
[eg.predicted for eg in examples])
if self.cfg["update_with_oracle_cut_size"] >= 1:
max_moves = self.cfg["update_with_oracle_cut_size"]
# Chop sequences into lengths of this many transitions, to make the
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length.
# We used to randomize this, but it's not clear that actually helps?
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
cut_size = self.cfg["update_with_oracle_cut_size"]
states, golds, _ = self._init_gold_batch(
states, golds, max_steps = self._init_gold_batch(
examples,
max_length=cut_size
max_length=max_moves
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
max_steps = max([len(eg.x) for eg in examples])
if not states:
return losses
all_states = list(states)
states_golds = list(zip(states, golds))
n_moves = 0
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
@@ -303,6 +304,9 @@ cdef class Parser(Pipe):
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
if max_moves >= 1 and n_moves >= max_moves:
break
n_moves += 1

backprop_tok2vec(golds)
if sgd not in (None, False):
@@ -498,7 +502,7 @@ cdef class Parser(Pipe):
raise ValueError(Errors.E149) from None
return self

def _init_gold_batch(self, examples, min_length=5, max_length=500):
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition
sequence or a cap. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -511,8 +515,7 @@ cdef class Parser(Pipe):
all_states = self.moves.init_batch([eg.predicted for eg in examples])
states = []
golds = []
kept = []
to_cut = []
max_length_seen = 0
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
@@ -522,30 +525,22 @@ cdef class Parser(Pipe):
else:
oracle_actions = self.moves.get_oracle_sequence_from_state(
state.copy(), gold)
kept.append((eg, state, gold, oracle_actions))
to_cut.append((eg, state, gold, oracle_actions))
min_length = min(min_length, len(oracle_actions))
if not to_cut:
max_length_seen = max(max_length, len(oracle_actions))
if not kept:
return states, golds, 0
max_length = max(min_length, min(max_length, max_length_seen))
cdef int clas
max_moves = 0
for eg, state, gold, oracle_actions in to_cut:
for eg, state, gold, oracle_actions in kept:
for i in range(0, len(oracle_actions), max_length):
start_state = state.copy()
n_moves = 0
for clas in oracle_actions[i:i+max_length]:
action = self.moves.c[clas]
action.do(state.c, action.label)
state.c.push_hist(action.clas)
n_moves += 1
if state.is_final():
break
max_moves = max(max_moves, n_moves)
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
states.append(start_state)
golds.append(gold)
max_moves = max(max_moves, n_moves)
if state.is_final():
break
return states, golds, max_moves
return states, golds, max_length

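Note on the hunk above: the oracle-cut change caps how far each training batch is stepped. Long transition sequences are chopped into segments of at most max_moves actions (with the cut length randomized around the configured update_with_oracle_cut_size), and the update loop breaks once n_moves reaches that cap. A rough pure-Python sketch of the chopping idea, using hypothetical helper names rather than the Cython implementation:

import random

def chop_oracle_actions(oracle_action_seqs, cut_size):
    # Randomize the cut length around the configured size, as the diff does
    # (guarded to at least 1 for this sketch), then split each long action
    # sequence into segments of at most that length.
    max_moves = max(1, int(random.uniform(cut_size // 2, cut_size * 2)))
    segments = []
    for actions in oracle_action_seqs:
        for i in range(0, len(actions), max_moves):
            segments.append(actions[i:i + max_moves])
    return segments, max_moves
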
@@ -85,7 +85,7 @@ class Scorer:
) -> None:
"""Initialize the Scorer.

DOCS: https://spacy.io/api/scorer#init
DOCS: https://nightly.spacy.io/api/scorer#init
"""
self.nlp = nlp
self.cfg = cfg
@@ -101,7 +101,7 @@ class Scorer:
examples (Iterable[Example]): The predicted annotations + correct annotations.
RETURNS (Dict): A dictionary of scores.

DOCS: https://spacy.io/api/scorer#score
DOCS: https://nightly.spacy.io/api/scorer#score
"""
scores = {}
if hasattr(self.nlp.tokenizer, "score"):
@@ -121,7 +121,7 @@ class Scorer:
RETURNS (Dict[str, float]): A dictionary containing the scores
token_acc/p/r/f.

DOCS: https://spacy.io/api/scorer#score_tokenization
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
"""
acc_score = PRFScore()
prf_score = PRFScore()
@@ -169,7 +169,7 @@ class Scorer:
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
under the key attr_acc.

DOCS: https://spacy.io/api/scorer#score_token_attr
DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
"""
tag_score = PRFScore()
for example in examples:
@@ -263,7 +263,7 @@ class Scorer:
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.

DOCS: https://spacy.io/api/scorer#score_spans
DOCS: https://nightly.spacy.io/api/scorer#score_spans
"""
score = PRFScore()
score_per_type = dict()
@@ -350,7 +350,7 @@ class Scorer:
attr_f_per_type,
attr_auc_per_type

DOCS: https://spacy.io/api/scorer#score_cats
DOCS: https://nightly.spacy.io/api/scorer#score_cats
"""
if threshold is None:
threshold = 0.5 if multi_label else 0.0
@@ -467,7 +467,7 @@ class Scorer:
RETURNS (Dict[str, Any]): A dictionary containing the scores:
attr_uas, attr_las, and attr_las_per_type.

DOCS: https://spacy.io/api/scorer#score_deps
DOCS: https://nightly.spacy.io/api/scorer#score_deps
"""
unlabelled = PRFScore()
labelled = PRFScore()

@@ -91,7 +91,7 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
cdef class StringStore:
"""Look up strings by 64-bit hashes.

DOCS: https://spacy.io/api/stringstore
DOCS: https://nightly.spacy.io/api/stringstore
"""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.

@@ -317,7 +317,8 @@ def test_doc_from_array_morph(en_vocab):


def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "They don't think alike."]
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
docs_idx = en_texts[0].index("docs")
@@ -338,14 +339,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
Doc.from_docs(en_docs + [de_doc])

m_doc = Doc.from_docs(en_docs)
assert len(en_docs) == len(list(m_doc.sents))
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts)
assert str(m_doc) == " ".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
assert m_doc[9].idx == think_idx
with pytest.raises(AttributeError):
# not callable, because it was not set via set_extension
@@ -353,14 +354,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there

m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_docs) == len(list(m_doc.sents))
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
assert m_doc[9].idx == think_idx

m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
@@ -369,12 +370,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert list(m_doc.sents)
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts)
assert str(m_doc) == " ".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
assert m_doc[9].idx == think_idx

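For context, a minimal usage sketch of the behavior the updated test asserts: Doc.from_docs skips empty docs when merging. The pipeline setup here is illustrative.

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Merging the docs is fun."), nlp(""), nlp("They don't think alike.")]
merged = Doc.from_docs(docs)
print(merged.text)  # the two non-empty texts, joined by a space
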
@@ -14,7 +14,7 @@ LANGUAGES = ["el", "en", "fr", "nl"]

@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_initialize(lang, capfd):
@registry.assets("lemmatizer_init_lookups")
@registry.misc("lemmatizer_init_lookups")
def lemmatizer_init_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -25,9 +25,7 @@ def test_lemmatizer_initialize(lang, capfd):

"""Test that languages can be initialized."""
nlp = get_lang_class(lang)()
nlp.add_pipe(
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
"lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
)
# Check for stray print statements (see #3342)
doc = nlp("test")  # noqa: F841
captured = capfd.readouterr()

@@ -31,7 +31,7 @@ def pattern_dicts():
]


@registry.assets("attribute_ruler_patterns")
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
@@ -86,7 +86,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns from asset
nlp.add_pipe(
"attribute_ruler",
config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"

@@ -137,7 +137,7 @@ def test_kb_undefined(nlp):

def test_kb_empty(nlp):
"""Test that the EL can't train with an empty KB"""
config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
@@ -183,7 +183,7 @@ def test_el_pipe_configuration(nlp):
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([pattern])

@registry.assets.register("myAdamKB.v1")
@registry.misc.register("myAdamKB.v1")
def mykb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -199,7 +199,7 @@ def test_el_pipe_configuration(nlp):
# run an EL pipe without a trained context encoder, to check the candidate generation step only
nlp.add_pipe(
"entity_linker",
config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
)
# With the default get_candidates function, matching is case-sensitive
text = "Douglas and douglas are not the same."
@@ -211,7 +211,7 @@ def test_el_pipe_configuration(nlp):
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())

@registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
@registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
return get_lowercased_candidates

@@ -220,9 +220,9 @@ def test_el_pipe_configuration(nlp):
"entity_linker",
"entity_linker",
config={
"kb_loader": {"@assets": "myAdamKB.v1"},
"kb_loader": {"@misc": "myAdamKB.v1"},
"incl_context": False,
"get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
},
)
doc = nlp(text)
@@ -282,7 +282,7 @@ def test_append_invalid_alias(nlp):
def test_preserving_links_asdoc(nlp):
"""Test that Span.as_doc preserves the existing entity links"""

@registry.assets.register("myLocationsKB.v1")
@registry.misc.register("myLocationsKB.v1")
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab):
mykb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -304,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
el_pipe.begin_training(lambda: [])
el_pipe.incl_context = False
@@ -387,7 +387,7 @@ def test_overfitting_IO():
doc = nlp(text)
train_examples.append(Example.from_dict(doc, annotation))

@registry.assets.register("myOverfittingKB.v1")
@registry.misc.register("myOverfittingKB.v1")
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab):
# create artificial KB - assign same prior weight to the two russ cochran's
@@ -408,7 +408,7 @@ def test_overfitting_IO():
# Create the Entity Linker component and add it to the pipeline
nlp.add_pipe(
"entity_linker",
config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
last=True,
config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
last=True,
)

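The hunks above rename the registry from @registry.assets to @registry.misc. A minimal sketch of registering a knowledge-base loader under the new name, mirroring the test fixtures (the "myKB.v1" name is illustrative):

from typing import Callable

from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


@registry.misc.register("myKB.v1")
def create_my_kb() -> Callable[[Vocab], KnowledgeBase]:
    def create_kb(vocab: Vocab) -> KnowledgeBase:
        # A tiny KB; real loaders would add entities and aliases here.
        return KnowledgeBase(vocab, entity_vector_length=1)
    return create_kb

# The loader is then referenced from the entity_linker config as
# {"kb_loader": {"@misc": "myKB.v1"}}.
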
@@ -13,7 +13,7 @@ def nlp():

@pytest.fixture
def lemmatizer(nlp):
@registry.assets("cope_lookups")
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -23,13 +23,13 @@ def lemmatizer(nlp):
return lookups

lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
)
return lemmatizer


def test_lemmatizer_init(nlp):
@registry.assets("cope_lookups")
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -39,7 +39,7 @@ def test_lemmatizer_init(nlp):
return lookups

lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
"lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
)
assert isinstance(lemmatizer.lookups, Lookups)
assert lemmatizer.mode == "lookup"
@@ -51,14 +51,14 @@ def test_lemmatizer_init(nlp):

nlp.remove_pipe("lemmatizer")

@registry.assets("empty_lookups")
@registry.misc("empty_lookups")
def empty_lookups():
return Lookups()

with pytest.raises(ValueError):
nlp.add_pipe(
"lemmatizer",
config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
)


@@ -79,7 +79,7 @@ def test_lemmatizer_config(nlp, lemmatizer):


def test_lemmatizer_serialize(nlp, lemmatizer):
@registry.assets("cope_lookups")
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -90,7 +90,7 @@ def test_lemmatizer_serialize(nlp, lemmatizer):

nlp2 = English()
lemmatizer2 = nlp2.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
)
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()

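The same rename applies to the lemmatizer lookups used in these tests: factories are registered with @registry.misc and referenced via "@misc" in the pipe config. A minimal sketch, with an illustrative registry name:

from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.util import registry


@registry.misc("my_lookups")
def my_lookups() -> Lookups:
    lookups = Lookups()
    # "lookup" mode needs a lemma_lookup table; other modes need more tables.
    lookups.add_table("lemma_lookup", {"cope": "cope"})
    return lookups


nlp = English()
nlp.add_pipe("lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "my_lookups"}})
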
@@ -28,8 +28,6 @@ def test_tagger_begin_training_tag_map():

TAGS = ("N", "V", "J")

MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}

TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),

@@ -84,9 +84,8 @@ def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
fix_random_seed(0)
nlp = English()
textcat = nlp.add_pipe("textcat")
# Set exclusive labels
textcat.model.attrs["multi_label"] = False
textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -103,9 +102,8 @@ def test_overfitting_IO():
test_text = "I am happy."
doc = nlp(test_text)
cats = doc.cats
# note that by default, exclusive_classes = false so we need a bigger error margin
assert cats["POSITIVE"] > 0.9
assert cats["POSITIVE"] > 0.8
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)

# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
@@ -113,8 +111,8 @@ def test_overfitting_IO():
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
cats2 = doc2.cats
assert cats2["POSITIVE"] > 0.8
assert cats2["POSITIVE"] > 0.9
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)

# Test scoring
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})

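A minimal sketch of the updated textcat setup used in this test: exclusive classes are now requested through the component's model config instead of mutating model.attrs after construction. Labels and pipeline here are illustrative.

import spacy

nlp = spacy.blank("en")
# Request mutually exclusive classes via the model config.
textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
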
@@ -71,7 +71,7 @@ def tagger():
def entity_linker():
nlp = Language()

@registry.assets.register("TestIssue5230KB.v1")
@registry.misc.register("TestIssue5230KB.v1")
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -80,7 +80,7 @@ def entity_linker():

return create_kb

config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
entity_linker = nlp.add_pipe("entity_linker", config=config)
# need to add model for two reasons:
# 1. no model leads to error in serialization,

@@ -28,7 +28,7 @@ path = ${paths.train}
path = ${paths.dev}

[training.batcher]
@batchers = "batch_by_words.v1"
@batchers = "spacy.batch_by_words.v1"
size = 666

[nlp]

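The batcher is now registered under the spacy. namespace, as the config hunk above shows. A small sketch of resolving the same function from the registry in Python, with illustrative argument values:

from spacy.util import registry

# Resolve the renamed batcher and build a word-count-based batching function.
make_batcher = registry.batchers.get("spacy.batch_by_words.v1")
batcher = make_batcher(size=666, discard_oversize=False, tolerance=0.2)
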
@@ -85,7 +85,7 @@ def test_serialize_subclassed_kb():
super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field

@registry.assets.register("spacy.CustomKB.v1")
@registry.misc.register("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
) -> Callable[["Vocab"], KnowledgeBase]:
@@ -101,7 +101,7 @@ def test_serialize_subclassed_kb():
nlp = English()
config = {
"kb_loader": {
"@assets": "spacy.CustomKB.v1",
"@misc": "spacy.CustomKB.v1",
"entity_vector_length": 342,
"custom_field": 666,
}

@@ -34,9 +34,9 @@ cdef class Tokenizer:
vector[SpanC] &filtered)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
object span_data)
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
cdef int _try_specials(self, hash_t key, Doc tokens,
int* has_special,
int* has_special) except -1
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef unicode _split_affixes(self, Pool mem, unicode string,

@ -31,7 +31,7 @@ cdef class Tokenizer:
|
||||||
"""Segment text, and create Doc objects with the discovered segment
|
"""Segment text, and create Doc objects with the discovered segment
|
||||||
boundaries.
|
boundaries.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer
|
DOCS: https://nightly.spacy.io/api/tokenizer
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||||
suffix_search=None, infix_finditer=None, token_match=None,
|
suffix_search=None, infix_finditer=None, token_match=None,
|
||||||
|
@ -54,7 +54,7 @@ cdef class Tokenizer:
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
>>> tokenizer = Tokenizer(nlp.vocab)
|
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#init
|
DOCS: https://nightly.spacy.io/api/tokenizer#init
|
||||||
"""
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._cache = PreshMap()
|
self._cache = PreshMap()
|
||||||
|
@ -147,7 +147,7 @@ cdef class Tokenizer:
|
||||||
string (str): The string to tokenize.
|
string (str): The string to tokenize.
|
||||||
RETURNS (Doc): A container for linguistic annotations.
|
RETURNS (Doc): A container for linguistic annotations.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#call
|
DOCS: https://nightly.spacy.io/api/tokenizer#call
|
||||||
"""
|
"""
|
||||||
doc = self._tokenize_affixes(string, True)
|
doc = self._tokenize_affixes(string, True)
|
||||||
self._apply_special_cases(doc)
|
self._apply_special_cases(doc)
|
||||||
|
@ -169,8 +169,6 @@ cdef class Tokenizer:
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
cdef int has_special = 0
|
cdef int has_special = 0
|
||||||
cdef bint specials_hit = 0
|
|
||||||
cdef bint cache_hit = 0
|
|
||||||
cdef bint in_ws = string[0].isspace()
|
cdef bint in_ws = string[0].isspace()
|
||||||
cdef unicode span
|
cdef unicode span
|
||||||
# The task here is much like string.split, but not quite
|
# The task here is much like string.split, but not quite
|
||||||
|
@ -186,13 +184,7 @@ cdef class Tokenizer:
|
||||||
# we don't have to create the slice when we hit the cache.
|
# we don't have to create the slice when we hit the cache.
|
||||||
span = string[start:i]
|
span = string[start:i]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
specials_hit = 0
|
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
|
||||||
cache_hit = 0
|
|
||||||
if with_special_cases:
|
|
||||||
specials_hit = self._try_specials(key, doc, &has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(key, doc)
|
|
||||||
if not specials_hit and not cache_hit:
|
|
||||||
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
||||||
if uc == ' ':
|
if uc == ' ':
|
||||||
doc.c[doc.length - 1].spacy = True
|
doc.c[doc.length - 1].spacy = True
|
||||||
|
@ -204,13 +196,7 @@ cdef class Tokenizer:
|
||||||
if start < i:
|
if start < i:
|
||||||
span = string[start:]
|
span = string[start:]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
specials_hit = 0
|
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
|
||||||
cache_hit = 0
|
|
||||||
if with_special_cases:
|
|
||||||
specials_hit = self._try_specials(key, doc, &has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(key, doc)
|
|
||||||
if not specials_hit and not cache_hit:
|
|
||||||
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
||||||
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
||||||
return doc
|
return doc
|
||||||
|
@ -223,7 +209,7 @@ cdef class Tokenizer:
|
||||||
Defaults to 1000.
|
Defaults to 1000.
|
||||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#pipe
|
DOCS: https://nightly.spacy.io/api/tokenizer#pipe
|
||||||
"""
|
"""
|
||||||
for text in texts:
|
for text in texts:
|
||||||
yield self(text)
|
yield self(text)
|
||||||
|
@ -364,27 +350,33 @@ cdef class Tokenizer:
|
||||||
offset += span[3]
|
offset += span[3]
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
|
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
|
||||||
|
cdef bint specials_hit = 0
|
||||||
|
cdef bint cache_hit = 0
|
||||||
|
cdef int i
|
||||||
|
if with_special_cases:
|
||||||
|
cached = <_Cached*>self._specials.get(key)
|
||||||
|
if cached == NULL:
|
||||||
|
specials_hit = False
|
||||||
|
else:
|
||||||
|
for i in range(cached.length):
|
||||||
|
tokens.push_back(&cached.data.tokens[i], False)
|
||||||
|
has_special[0] = 1
|
||||||
|
specials_hit = True
|
||||||
|
if not specials_hit:
|
||||||
cached = <_Cached*>self._cache.get(key)
|
cached = <_Cached*>self._cache.get(key)
|
||||||
if cached == NULL:
|
if cached == NULL:
|
||||||
return False
|
cache_hit = False
|
||||||
cdef int i
|
else:
|
||||||
if cached.is_lex:
|
if cached.is_lex:
|
||||||
for i in range(cached.length):
|
for i in range(cached.length):
|
||||||
tokens.push_back(cached.data.lexemes[i], False)
|
tokens.push_back(cached.data.lexemes[i], False)
|
||||||
else:
|
else:
|
||||||
for i in range(cached.length):
|
for i in range(cached.length):
|
||||||
tokens.push_back(&cached.data.tokens[i], False)
|
tokens.push_back(&cached.data.tokens[i], False)
|
||||||
return True
|
cache_hit = True
|
||||||
|
if not specials_hit and not cache_hit:
|
||||||
cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
|
|
||||||
cached = <_Cached*>self._specials.get(key)
|
|
||||||
if cached == NULL:
|
|
||||||
return False
|
return False
|
||||||
cdef int i
|
|
||||||
for i in range(cached.length):
|
|
||||||
tokens.push_back(&cached.data.tokens[i], False)
|
|
||||||
has_special[0] = 1
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
|
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
|
||||||
|
@ -462,12 +454,7 @@ cdef class Tokenizer:
|
||||||
for i in range(prefixes.size()):
|
for i in range(prefixes.size()):
|
||||||
tokens.push_back(prefixes[0][i], False)
|
tokens.push_back(prefixes[0][i], False)
|
||||||
if string:
|
if string:
|
||||||
if with_special_cases:
|
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
|
||||||
specials_hit = self._try_specials(hash_string(string), tokens,
|
|
||||||
has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
|
||||||
if specials_hit or cache_hit:
|
|
||||||
pass
|
pass
|
||||||
elif (self.token_match and self.token_match(string)) or \
|
elif (self.token_match and self.token_match(string)) or \
|
||||||
(self.url_match and \
|
(self.url_match and \
|
||||||
|
The remaining Tokenizer hunks (@@ -542,7 +529,7 @@ through @@ -777,7 +764,7 @@) are documentation-only: each one changes a single docstring line, moving the DOCS links for `#find_infix`, `#find_prefix`, `#find_suffix`, `#add_special_case`, `#explain`, `#to_disk`, `#from_disk`, `#to_bytes` and `#from_bytes` from `https://spacy.io/api/tokenizer` to `https://nightly.spacy.io/api/tokenizer`. The surrounding docstring text is unchanged.
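As a quick reference for two of the Tokenizer methods whose docstrings are touched above, the sketch below shows the documented usage of `add_special_case` and `explain`. It is an illustrative example based on the documented signatures, not part of this diff.

```python
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")

# A special case: the ORTH values must concatenate back to the original string.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

print([t.text for t in nlp("gimme that")])  # ['gim', 'me', 'that']

# explain() returns (pattern_string, token_string) tuples describing which rule
# (special case, prefix, suffix, plain token, ...) produced each token.
print(nlp.tokenizer.explain("gimme that"))
```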
In the Retokenizer hunks (@@ -24,8 +24,8 @@, @@ -47,7 +47,7 @@ and @@ -73,7 +73,7 @@), only the documentation links change: the class docstring's DOCS and USAGE links and the DOCS links in the `merge` and `split` docstrings now point to `https://nightly.spacy.io/api/doc#retokenize`, `https://nightly.spacy.io/usage/linguistic-features#retokenization`, `#retokenizer.merge` and `#retokenizer.split` instead of the corresponding `https://spacy.io/...` URLs.
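For context on the retokenization API referenced above, the following is a minimal, illustrative use of the `doc.retokenize()` context manager (not part of the diff):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is busy")

# Merge the first two tokens; attrs are applied to the resulting merged token.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"LEMMA": "New York"})

print([t.text for t in doc])  # ['New York', 'is', 'busy']
```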
The DocBin hunks (@@ -61,7 +61,7 @@ through @@ -223,7 +223,7 @@) likewise only swap docstring links: the DOCS URLs for `#init`, `#add`, `#get_docs`, `#merge`, `#to_bytes`, `#from_bytes` and `#to_disk` (the latter appears in both the `to_disk` and `from_disk` docstrings) move from `https://spacy.io/api/docbin` to `https://nightly.spacy.io/api/docbin`.
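Since several of those hunks touch the DocBin serialization docstrings, here is a short, illustrative round-trip based on the documented methods (the attribute list and sentences are arbitrary):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["ORTH", "TAG"], store_user_data=True)
doc_bin.add(nlp("Give it back"))
doc_bin.add(nlp("He pleaded"))

# Serialize to bytes (or to disk) and restore against a shared vocab.
data = doc_bin.to_bytes()
restored = DocBin().from_bytes(data)
docs = list(restored.get_docs(nlp.vocab))
print(len(docs))  # 2
```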
The bulk of the Doc hunks (@@ -104,7 +104,7 @@ through @@ -872,7 +872,7 @@) are documentation-only as well: the DOCS links for the class docstring and for `#set_extension`, `#get_extension`, `#has_extension`, `#remove_extension`, `#init`, `#getitem`, `#iter`, `#len`, `#char_span`, `#similarity`, `#has_vector`, `#vector`, `#vector_norm`, `#ents`, `#noun_chunks`, `#sents`, `#count_by`, `#from_array` and `#from_docs` move from `https://spacy.io/api/doc` to `https://nightly.spacy.io/api/doc`, and the USAGE link in `set_extension` now points to `https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes`.
Two hunks in `Doc.from_docs` change behavior, guarding the whitespace handling against empty docs:

@@ -920,7 +920,9 @@ cdef class Doc:
                     warnings.warn(Warnings.W101.format(name=name))
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
-            char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
+            char_offset += len(doc.text)
+            if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
+                char_offset += 1

         arrays = [doc.to_array(attrs) for doc in docs]

@@ -932,7 +934,7 @@ cdef class Doc:
             token_offset = -1
             for doc in docs[:-1]:
                 token_offset += len(doc)
-                if not doc[-1].is_space:
+                if not (len(doc) > 0 and doc[-1].is_space):
                     concat_spaces[token_offset] = True

         concat_array = numpy.concatenate(arrays)
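The effect of the new `len(doc) > 0` guard is easiest to see as a plain-Python restatement of the offset logic above. This sketch only mirrors the lines in the hunk for readability; it is not part of the diff, and `ensure_whitespace` is the keyword documented for `Doc.from_docs`.

```python
def advance_char_offset(char_offset, doc, ensure_whitespace=True):
    # Advance by the doc's text length, then add one for the inserted
    # separating space unless the doc already ends in whitespace.
    char_offset += len(doc.text)
    if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
        # An empty doc no longer triggers doc[-1] (which would raise IndexError);
        # it is treated like any doc that does not end in whitespace.
        char_offset += 1
    return char_offset
```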
The remaining Doc hunks (@@ -951,7 +953,7 @@ through @@ -1200,7 +1202,7 @@) return to documentation-only changes: the DOCS links for `#get_lca_matrix`, `#to_disk`, `#from_disk`, `#to_bytes` (which appears in two docstrings), `#from_bytes`, `#from_dict`, `#retokenize` and `#to_json` switch to `https://nightly.spacy.io/api/doc`, and the USAGE link in `retokenize` now points to `https://nightly.spacy.io/usage/linguistic-features#retokenization`.
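Several of the docstrings above describe the custom-attribute API (`set_extension` and friends). A minimal, illustrative example of that API follows; the attribute name `is_greeting` is invented for this sketch.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# Register a custom attribute with a default value; force=True would be needed
# to overwrite an existing extension of the same name.
Doc.set_extension("is_greeting", default=False)

doc = nlp("hello world")
doc._.is_greeting = True
print(Doc.has_extension("is_greeting"))  # True
print(doc._.is_greeting)                 # True
Doc.remove_extension("is_greeting")      # returns the (default, method, getter, setter) tuple
```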
Every Span hunk (@@ -27,7 +27,7 @@ through @@ -652,7 +652,7 @@) is documentation-only: the DOCS links for the class docstring and for `#set_extension`, `#get_extension`, `#has_extension`, `#remove_extension`, `#init`, `#len`, `#getitem`, `#iter`, `#as_doc`, `#get_lca_matrix`, `#similarity`, `#ents`, `#has_vector`, `#vector`, `#vector_norm`, `#noun_chunks`, `#root`, `#lefts` (which appears in two docstrings), `#rights`, `#n_lefts`, `#n_rights` and `#subtree` move from `https://spacy.io/api/span` to `https://nightly.spacy.io/api/span`, and the USAGE link in `set_extension` now points to the nightly processing-pipelines page.
The Token hunks (@@ -30,7 +30,7 @@ through @@ -729,7 +729,7 @@) follow the same pattern: the DOCS links for the class docstring and for `#set_extension`, `#get_extension`, `#has_extension`, `#remove_extension`, `#init`, `#len`, `#check_flag`, `#nbor`, `#similarity`, `#has_vector`, `#vector`, `#vector_norm`, `#n_lefts`, `#n_rights`, `#is_sent_start`, `#is_sent_end`, `#lefts`, `#rights`, `#children`, `#subtree`, `#ancestors`, `#is_ancestor` and `#conjuncts` now point to `https://nightly.spacy.io/api/token`, with the `set_extension` USAGE link updated to the nightly URL as well.
In the registry class, the `assets` catalogue is renamed to `misc`:

@@ -76,7 +76,7 @@ class registry(thinc.registry):
     lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-    assets = catalogue.create("spacy", "assets", entry_points=True)
+    misc = catalogue.create("spacy", "misc", entry_points=True)
     # Callback functions used to manipulate nlp object etc.
     callbacks = catalogue.create("spacy", "callbacks")
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
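After this rename, functions previously registered under `assets` would register under `misc` instead. A hedged sketch of what such a registration might look like (the name `my_resource.v1` is invented for illustration):

```python
from spacy.util import registry


@registry.misc("my_resource.v1")
def create_my_resource():
    # Any object that a config can later refer to by name, e.g. {"@misc": "my_resource.v1"}.
    return {"some": "resource"}


# Registered functions can be resolved again by name.
factory = registry.misc.get("my_resource.v1")
print(factory())
```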
@ -44,7 +44,7 @@ cdef class Vectors:
|
||||||
the table need to be assigned - so len(list(vectors.keys())) may be
|
the table need to be assigned - so len(list(vectors.keys())) may be
|
||||||
greater or smaller than vectors.shape[0].
|
greater or smaller than vectors.shape[0].
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors
|
DOCS: https://nightly.spacy.io/api/vectors
|
||||||
"""
|
"""
|
||||||
cdef public object name
|
cdef public object name
|
||||||
cdef public object data
|
cdef public object data
|
||||||
|
@ -59,7 +59,7 @@ cdef class Vectors:
|
||||||
keys (iterable): A sequence of keys, aligned with the data.
|
keys (iterable): A sequence of keys, aligned with the data.
|
||||||
name (str): A name to identify the vectors table.
|
name (str): A name to identify the vectors table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#init
|
DOCS: https://nightly.spacy.io/api/vectors#init
|
||||||
"""
|
"""
|
||||||
self.name = name
|
self.name = name
|
||||||
if data is None:
|
if data is None:
|
||||||
|
@ -83,7 +83,7 @@ cdef class Vectors:
|
||||||
|
|
||||||
RETURNS (tuple): A `(rows, dims)` pair.
|
RETURNS (tuple): A `(rows, dims)` pair.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#shape
|
DOCS: https://nightly.spacy.io/api/vectors#shape
|
||||||
"""
|
"""
|
||||||
return self.data.shape
|
return self.data.shape
|
||||||
|
|
||||||
|
@ -93,7 +93,7 @@ cdef class Vectors:
|
||||||
|
|
||||||
RETURNS (int): The vector size.
|
RETURNS (int): The vector size.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#size
|
DOCS: https://nightly.spacy.io/api/vectors#size
|
||||||
"""
|
"""
|
||||||
return self.data.shape[0] * self.data.shape[1]
|
return self.data.shape[0] * self.data.shape[1]
|
||||||
|
|
||||||
|
@ -103,7 +103,7 @@ cdef class Vectors:
|
||||||
|
|
||||||
RETURNS (bool): `True` if no slots are available for new keys.
|
RETURNS (bool): `True` if no slots are available for new keys.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#is_full
|
DOCS: https://nightly.spacy.io/api/vectors#is_full
|
||||||
"""
|
"""
|
||||||
return self._unset.size() == 0
|
return self._unset.size() == 0
|
||||||
|
|
||||||
|
@ -114,7 +114,7 @@ cdef class Vectors:
|
||||||
|
|
||||||
RETURNS (int): The number of keys in the table.
|
RETURNS (int): The number of keys in the table.

-DOCS: https://spacy.io/api/vectors#n_keys
+DOCS: https://nightly.spacy.io/api/vectors#n_keys
"""
return len(self.key2row)

@@ -127,7 +127,7 @@ cdef class Vectors:
key (int): The key to get the vector for.
RETURNS (ndarray): The vector for the key.

-DOCS: https://spacy.io/api/vectors#getitem
+DOCS: https://nightly.spacy.io/api/vectors#getitem
"""
i = self.key2row[key]
if i is None:
@@ -141,7 +141,7 @@ cdef class Vectors:
key (int): The key to set the vector for.
vector (ndarray): The vector to set.

-DOCS: https://spacy.io/api/vectors#setitem
+DOCS: https://nightly.spacy.io/api/vectors#setitem
"""
i = self.key2row[key]
self.data[i] = vector
@@ -153,7 +153,7 @@ cdef class Vectors:

YIELDS (int): A key in the table.

-DOCS: https://spacy.io/api/vectors#iter
+DOCS: https://nightly.spacy.io/api/vectors#iter
"""
yield from self.key2row

@@ -162,7 +162,7 @@ cdef class Vectors:

RETURNS (int): The number of vectors in the data.

-DOCS: https://spacy.io/api/vectors#len
+DOCS: https://nightly.spacy.io/api/vectors#len
"""
return self.data.shape[0]

@@ -172,7 +172,7 @@ cdef class Vectors:
key (int): The key to check.
RETURNS (bool): Whether the key has a vector entry.

-DOCS: https://spacy.io/api/vectors#contains
+DOCS: https://nightly.spacy.io/api/vectors#contains
"""
return key in self.key2row

@@ -189,7 +189,7 @@ cdef class Vectors:
inplace (bool): Reallocate the memory.
RETURNS (list): The removed items as a list of `(key, row)` tuples.

-DOCS: https://spacy.io/api/vectors#resize
+DOCS: https://nightly.spacy.io/api/vectors#resize
"""
xp = get_array_module(self.data)
if inplace:
@@ -224,7 +224,7 @@ cdef class Vectors:

YIELDS (ndarray): A vector in the table.

-DOCS: https://spacy.io/api/vectors#values
+DOCS: https://nightly.spacy.io/api/vectors#values
"""
for row, vector in enumerate(range(self.data.shape[0])):
    if not self._unset.count(row):
@@ -235,7 +235,7 @@ cdef class Vectors:

YIELDS (tuple): A key/vector pair.

-DOCS: https://spacy.io/api/vectors#items
+DOCS: https://nightly.spacy.io/api/vectors#items
"""
for key, row in self.key2row.items():
    yield key, self.data[row]
@@ -281,7 +281,7 @@ cdef class Vectors:
row (int / None): The row number of a vector to map the key to.
RETURNS (int): The row the vector was added to.

-DOCS: https://spacy.io/api/vectors#add
+DOCS: https://nightly.spacy.io/api/vectors#add
"""
# use int for all keys and rows in key2row for more efficient access
# and serialization
@@ -368,7 +368,7 @@ cdef class Vectors:
path (str / Path): A path to a directory, which will be created if
    it doesn't exists.

-DOCS: https://spacy.io/api/vectors#to_disk
+DOCS: https://nightly.spacy.io/api/vectors#to_disk
"""
xp = get_array_module(self.data)
if xp is numpy:
@@ -396,7 +396,7 @@ cdef class Vectors:
path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.

-DOCS: https://spacy.io/api/vectors#from_disk
+DOCS: https://nightly.spacy.io/api/vectors#from_disk
"""
def load_key2row(path):
    if path.exists():
@@ -432,7 +432,7 @@ cdef class Vectors:
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vectors` object.

-DOCS: https://spacy.io/api/vectors#to_bytes
+DOCS: https://nightly.spacy.io/api/vectors#to_bytes
"""
def serialize_weights():
    if hasattr(self.data, "to_bytes"):
@@ -453,7 +453,7 @@ cdef class Vectors:
exclude (list): String names of serialization fields to exclude.
RETURNS (Vectors): The `Vectors` object.

-DOCS: https://spacy.io/api/vectors#from_bytes
+DOCS: https://nightly.spacy.io/api/vectors#from_bytes
"""
def deserialize_weights(b):
    if hasattr(self.data, "from_bytes"):
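Taken together, the `Vectors` methods whose docstrings are touched above form a small mapping-like API. A minimal sketch of how they fit together (toy sizes and made-up keys, not taken from the diff):

```python
import numpy
from spacy.strings import get_string_id
from spacy.vectors import Vectors

# Toy table: three keys mapped to the rows of a (3, 2) float32 array.
data = numpy.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype="f")
vectors = Vectors(data=data, keys=["cat", "dog", "rat"])

assert len(vectors) == 3        # __len__: number of rows in the data
assert vectors.n_keys == 3      # n_keys: number of entries in key2row
key = get_string_id("cat")      # keys are stored as integer hashes, not strings
assert key in vectors           # __contains__
print(vectors[key])             # __getitem__: the row stored for "cat"
```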
@@ -54,7 +54,7 @@ cdef class Vocab:
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.

-DOCS: https://spacy.io/api/vocab
+DOCS: https://nightly.spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
    oov_prob=-20., vectors_name=None, writing_system={},
@@ -117,7 +117,7 @@ cdef class Vocab:
    available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.

-DOCS: https://spacy.io/api/vocab#add_flag
+DOCS: https://nightly.spacy.io/api/vocab#add_flag
"""
if flag_id == -1:
    for bit in range(1, 64):
@@ -201,7 +201,7 @@ cdef class Vocab:
string (unicode): The ID string.
RETURNS (bool) Whether the string has an entry in the vocabulary.

-DOCS: https://spacy.io/api/vocab#contains
+DOCS: https://nightly.spacy.io/api/vocab#contains
"""
cdef hash_t int_key
if isinstance(key, bytes):
@@ -218,7 +218,7 @@ cdef class Vocab:

YIELDS (Lexeme): An entry in the vocabulary.

-DOCS: https://spacy.io/api/vocab#iter
+DOCS: https://nightly.spacy.io/api/vocab#iter
"""
cdef attr_t key
cdef size_t addr
@@ -241,7 +241,7 @@ cdef class Vocab:
>>> apple = nlp.vocab.strings["apple"]
>>> assert nlp.vocab[apple] == nlp.vocab[u"apple"]

-DOCS: https://spacy.io/api/vocab#getitem
+DOCS: https://nightly.spacy.io/api/vocab#getitem
"""
cdef attr_t orth
if isinstance(id_or_string, unicode):
@@ -309,7 +309,7 @@ cdef class Vocab:
    word was mapped to, and `score` the similarity score between the
    two words.

-DOCS: https://spacy.io/api/vocab#prune_vectors
+DOCS: https://nightly.spacy.io/api/vocab#prune_vectors
"""
xp = get_array_module(self.vectors.data)
# Make prob negative so it sorts by rank ascending
@@ -349,7 +349,7 @@ cdef class Vocab:
    and shape determined by the `vocab.vectors` instance. Usually, a
    numpy ndarray of shape (300,) and dtype float32.

-DOCS: https://spacy.io/api/vocab#get_vector
+DOCS: https://nightly.spacy.io/api/vocab#get_vector
"""
if isinstance(orth, str):
    orth = self.strings.add(orth)
@@ -396,7 +396,7 @@ cdef class Vocab:
orth (int / unicode): The word.
vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set.

-DOCS: https://spacy.io/api/vocab#set_vector
+DOCS: https://nightly.spacy.io/api/vocab#set_vector
"""
if isinstance(orth, str):
    orth = self.strings.add(orth)
@@ -418,7 +418,7 @@ cdef class Vocab:
orth (int / unicode): The word.
RETURNS (bool): Whether the word has a vector.

-DOCS: https://spacy.io/api/vocab#has_vector
+DOCS: https://nightly.spacy.io/api/vocab#has_vector
"""
if isinstance(orth, str):
    orth = self.strings.add(orth)
@@ -431,7 +431,7 @@ cdef class Vocab:
    it doesn't exist.
exclude (list): String names of serialization fields to exclude.

-DOCS: https://spacy.io/api/vocab#to_disk
+DOCS: https://nightly.spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
if not path.exists():
@@ -452,7 +452,7 @@ cdef class Vocab:
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.

-DOCS: https://spacy.io/api/vocab#to_disk
+DOCS: https://nightly.spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
getters = ["strings", "vectors"]
@@ -477,7 +477,7 @@ cdef class Vocab:
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.

-DOCS: https://spacy.io/api/vocab#to_bytes
+DOCS: https://nightly.spacy.io/api/vocab#to_bytes
"""
def deserialize_vectors():
    if self.vectors is None:
@@ -499,7 +499,7 @@ cdef class Vocab:
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.

-DOCS: https://spacy.io/api/vocab#from_bytes
+DOCS: https://nightly.spacy.io/api/vocab#from_bytes
"""
def serialize_vectors(b):
    if self.vectors is None:
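The `Vocab` hunks above only retarget the DOCS links, but as a reminder of the API they document, a small sketch (a blank English pipeline, so no vectors are loaded):

```python
import spacy

nlp = spacy.blank("en")
lexeme = nlp.vocab["apple"]                  # __getitem__: looking up a string adds an entry
apple_hash = nlp.vocab.strings["apple"]      # the integer ID for the same string
assert nlp.vocab[apple_hash].orth == lexeme.orth
assert "apple" in nlp.vocab                  # __contains__
assert not nlp.vocab.has_vector("apple")     # blank vocab: no vectors to look up
```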
@@ -25,36 +25,6 @@ usage documentation on

## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}

-### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.HashEmbedCNN.v1"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-Build spaCy's "standard" embedding layer, which uses hash embedding with subword
-features and a CNN with layer-normalized maxout.
-
-| Name | Description |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
-| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
-| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
-| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
-| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
-| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
-
### spacy.Tok2Vec.v1 {#Tok2Vec}

> #### Example config

@@ -72,7 +42,8 @@ features and a CNN with layer-normalized maxout.
> # ...
> ```

-Construct a tok2vec model out of embedding and encoding subnetworks. See the
+Construct a tok2vec model out of two subnetworks: one for embedding and one for
+encoding. See the
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
blog post for background.

@@ -82,6 +53,39 @@ blog post for background.
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

+### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+Build spaCy's "standard" tok2vec layer. This layer is defined by a
+[MultiHashEmbed](/api/architectures#MultiHashEmbed) embedding layer that uses
+subword features, and a
+[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
+consisting of a CNN and a layer-normalized maxout activation function.
+
+| Name | Description |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
+| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
+| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
+| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
+| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
+| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
+| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+
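For readers who want to poke at the layer outside a config file: the `[model]` block in the re-homed section above resolves to a registered function that can also be looked up and called directly. A sketch with the same toy hyperparameters (model construction only, no training):

```python
from spacy.util import registry

# Look up the architecture registered under the name used in the config above.
build_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v1")
tok2vec = build_tok2vec(
    width=96,
    depth=4,
    embed_size=2000,
    window_size=1,
    maxout_pieces=3,
    subword_features=True,
    pretrained_vectors=None,
)
print(tok2vec.name)  # a Thinc Model[List[Doc], List[Floats2d]]
```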
### spacy.Tok2VecListener.v1 {#Tok2VecListener}

> #### Example config

@@ -316,7 +320,7 @@ for details and system requirements.
> tokenizer_config = {"use_fast": true}
>
> [model.get_spans]
-> @span_getters = "strided_spans.v1"
+> @span_getters = "spacy-transformers.strided_spans.v1"
> window = 128
> stride = 96
> ```

@@ -669,11 +673,11 @@ into the "real world". This requires 3 main components:
> subword_features = true
>
> [kb_loader]
-> @assets = "spacy.EmptyKB.v1"
+> @misc = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
-> @assets = "spacy.CandidateGenerator.v1"
+> @misc = "spacy.CandidateGenerator.v1"
> ```

The `EntityLinker` model architecture is a Thinc `Model` with a
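Since the registry for these helpers moves from `@assets` to `@misc` here, a sketch of what the two names in the config above resolve to; the curried calls reflect my reading of the documented usage, with the entity vector length taken from the example:

```python
import spacy
from spacy.util import registry

nlp = spacy.blank("en")
make_empty_kb = registry.misc.get("spacy.EmptyKB.v1")
kb = make_empty_kb(entity_vector_length=64)(nlp.vocab)   # an empty KnowledgeBase
get_candidates = registry.misc.get("spacy.CandidateGenerator.v1")()
print(type(kb).__name__, callable(get_candidates))
```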
@@ -1,6 +1,6 @@
---
title: Command Line Interface
-teaser: Download, train and package models, and debug spaCy
+teaser: Download, train and package pipelines, and debug spaCy
source: spacy/cli
menu:
  - ['download', 'download']

@@ -17,45 +17,47 @@ menu:
---

spaCy's CLI provides a range of helpful commands for downloading and training
-models, converting data and debugging your config, data and installation. For a
-list of available commands, you can type `python -m spacy --help`. You can also
-add the `--help` flag to any command or subcommand to see the description,
+pipelines, converting data and debugging your config, data and installation. For
+a list of available commands, you can type `python -m spacy --help`. You can
+also add the `--help` flag to any command or subcommand to see the description,
available arguments and usage.

## download {#download tag="command"}

-Download [models](/usage/models) for spaCy. The downloader finds the
-best-matching compatible version and uses `pip install` to download the model as
-a package. Direct downloads don't perform any compatibility checks and require
-the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
+Download [trained pipelines](/usage/models) for spaCy. The downloader finds the
+best-matching compatible version and uses `pip install` to download the Python
+package. Direct downloads don't perform any compatibility checks and require the
+pipeline name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).

> #### Downloading best practices
>
> The `download` command is mostly intended as a convenient, interactive wrapper
> – it performs compatibility checks and prints detailed messages in case things
> go wrong. It's **not recommended** to use this command as part of an automated
-> process. If you know which model your project needs, you should consider a
-> [direct download via pip](/usage/models#download-pip), or uploading the model
-> to a local PyPi installation and fetching it straight from there. This will
-> also allow you to add it as a versioned package dependency to your project.
+> process. If you know which package your project needs, you should consider a
+> [direct download via pip](/usage/models#download-pip), or uploading the
+> package to a local PyPi installation and fetching it straight from there. This
+> will also allow you to add it as a versioned package dependency to your
+> project.

```cli
$ python -m spacy download [model] [--direct] [pip_args]
```

| Name | Description |
-| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
-| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | Pipeline package name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
+| `--direct`, `-d` | Force direct download of exact package version. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ |
-| **CREATES** | The installed model package in your `site-packages` directory. |
+| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the pipeline package. For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ |
+| **CREATES** | The installed pipeline package in your `site-packages` directory. |

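The command is also exposed as a plain function, which is essentially all the CLI wraps; a sketch (assumes network access, and the same caveats about automated use apply):

```python
from spacy.cli import download

# Runs the compatibility check, then pip-installs the best-matching package.
download("en_core_web_sm")
```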
## info {#info tag="command"}

-Print information about your spaCy installation, models and local setup, and
-generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to
-copy-paste into [GitHub issues](https://github.com/explosion/spaCy/issues).
+Print information about your spaCy installation, trained pipelines and local
+setup, and generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted
+markup to copy-paste into
+[GitHub issues](https://github.com/explosion/spaCy/issues).

```cli
$ python -m spacy info [--markdown] [--silent]

@@ -66,8 +68,8 @@ $ python -m spacy info [model] [--markdown] [--silent]
```

| Name | Description |
-| ------------------------------------------------ | ------------------------------------------------------------------------------ |
-| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
+| ------------------------------------------------ | ----------------------------------------------------------------------------------------- |
+| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

@@ -75,31 +77,31 @@ $ python -m spacy info [model] [--markdown] [--silent]

## validate {#validate new="2" tag="command"}

-Find all models installed in the current environment and check whether they are
-compatible with the currently installed version of spaCy. Should be run after
-upgrading spaCy via `pip install -U spacy` to ensure that all installed models
-are can be used with the new version. It will show a list of models and their
-installed versions. If any model is out of date, the latest compatible versions
-and command for updating are shown.
+Find all trained pipeline packages installed in the current environment and
+check whether they are compatible with the currently installed version of spaCy.
+Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
+all installed packages are can be used with the new version. It will show a list
+of packages and their installed versions. If any package is out of date, the
+latest compatible versions and command for updating are shown.

> #### Automated validation
>
> You can also use the `validate` command as part of your build process or test
-> suite, to ensure all models are up to date before proceeding. If incompatible
-> models are found, it will return `1`.
+> suite, to ensure all packages are up to date before proceeding. If
+> incompatible packages are found, it will return `1`.

```cli
$ python -m spacy validate
```

| Name | Description |
-| ---------- | --------------------------------------------------------- |
-| **PRINTS** | Details about the compatibility of your installed models. |
+| ---------- | -------------------------------------------------------------------- |
+| **PRINTS** | Details about the compatibility of your installed pipeline packages. |

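A sketch of the automated-validation idea from the callout above: run the command in a build step and fail if it reports incompatible packages (non-zero exit code):

```python
import subprocess

result = subprocess.run(["python", "-m", "spacy", "validate"])
if result.returncode != 0:
    raise SystemExit("Incompatible spaCy pipeline packages found; update them first.")
```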
## init {#init new="3"}

The `spacy init` CLI includes helpful commands for initializing training config
-files and model directories.
+files and pipeline directories.

### init config {#init-config new="3" tag="command"}

@@ -125,7 +127,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
-| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
+| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

@@ -165,36 +167,38 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |

-### init model {#init-model new="2" tag="command"}
+### init vocab {#init-vocab new="3" tag="command"}

-Create a new model directory from raw data, like word frequencies, Brown
-clusters and word vectors. Note that in order to populate the model's vocab, you
+Create a blank pipeline directory from raw data, like word frequencies, Brown
+clusters and word vectors. Note that in order to populate the vocabulary, you
need to pass in a JSONL-formatted
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
`id` values that correspond to the vectors table. Just loading in vectors will
not automatically populate the vocab.

-<Infobox title="New in v3.0" variant="warning">
+<Infobox title="New in v3.0" variant="warning" id="init-model">

-The `init-model` command is now available as a subcommand of `spacy init`.
+This command was previously called `init-model`.

</Infobox>

```cli
-$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors]
+$ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base]
```

| Name | Description |
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
-| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
+| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
-| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ |
+| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
+| `--meta-name`, `-mn` | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~ |
+| `--base`, `-b` | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | A spaCy model containing the vocab and vectors. |
+| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |

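To make the `--jsonl-loc` requirement concrete, a sketch that writes a tiny, hypothetical lexical-attributes file and calls the renamed command with the documented arguments; the real field set is defined by the vocabulary file format linked above, so treat the field names here as placeholders:

```python
import json
import subprocess

# Hypothetical two-entry file; see the vocab JSONL format for the actual fields.
with open("vocab.jsonl", "w", encoding="utf8") as f:
    for entry in [{"orth": "the", "prob": -3.5}, {"orth": "cat", "prob": -8.1}]:
        f.write(json.dumps(entry) + "\n")

# Invocation mirroring the documented positional and optional arguments.
subprocess.run([
    "python", "-m", "spacy", "init", "vocab", "en", "./my_vocab_dir",
    "--jsonl-loc", "vocab.jsonl",
])
```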
## convert {#convert tag="command"}

@@ -205,7 +209,7 @@ management functions. The converter can be specified on the command line, or
chosen based on the file extension of the input file.

```cli
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
+$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--base] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
```

| Name | Description |
@@ -216,7 +220,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
-| `--model`, `-b` <Tag variant="new">2.2</Tag> | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str](option)~~ |
+| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~ |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |

@@ -267,7 +271,7 @@ training -> dropout field required
training -> optimizer field required
training -> optimize extra fields not permitted

-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
+{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}

If your config contains missing values, you can run the 'init fill-config'
command to fill in all the defaults, if possible:

@@ -357,7 +361,7 @@ Module spacy.gold.loggers
File /path/to/spacy/gold/loggers.py (line 8)
ℹ [training.batcher]
Registry @batchers
-Name batch_by_words.v1
+Name spacy.batch_by_words.v1
Module spacy.gold.batchers
File /path/to/spacy/gold/batchers.py (line 49)
ℹ [training.batcher.size]

@@ -594,11 +598,11 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]

| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------- |
-| `model` | A loadable spaCy model. ~~str (positional)~~ |
+| `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **PRINTS** | Profiling information for the model. |
+| **PRINTS** | Profiling information for the pipeline. |

### debug model {#debug-model new="3" tag="command"}

@@ -724,10 +728,10 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P

## train {#train tag="command"}

-Train a model. Expects data in spaCy's
+Train a pipeline. Expects data in spaCy's
[binary format](/api/data-formats#training) and a
[config file](/api/data-formats#config) with all settings and hyperparameters.
-Will save out the best model from all epochs, as well as the final model. The
+Will save out the best model from all epochs, as well as the final pipeline. The
`--code` argument can be used to provide a Python file that's imported before
the training process starts. This lets you register
[custom functions](/usage/training#custom-functions) and architectures and refer

@@ -753,12 +757,12 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
-| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
+| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
-| **CREATES** | The final model and the best model. |
+| **CREATES** | The final trained pipeline and the best trained pipeline. |

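The trailing overrides in the table above (`--paths.train ./train.spacy` and friends) map to dotted config keys; assuming a `config.cfg` produced by `init config`, the same overrides can be applied when loading the config programmatically. A sketch:

```python
from spacy import util

# Dotted keys address [section] -> value, exactly like the CLI overrides.
config = util.load_config(
    "config.cfg",
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
    interpolate=False,
)
print(config["training"]["max_steps"])
```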
## pretrain {#pretrain new="2.1" tag="command,experimental"}

@@ -769,7 +773,7 @@ a component like a CNN, BiLSTM, etc to predict vectors which match the
pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
-train your model. This technique may be especially helpful if you have little
+train your pipeline. This technique may be especially helpful if you have little
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
for more info.

@@ -792,7 +796,7 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re
| Name | Description |
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
-| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ |
+| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |

@@ -803,7 +807,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re

## evaluate {#evaluate new="2" tag="command"}

-Evaluate a model. Expects a loadable spaCy model and evaluation data in the
+Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
+path) and evaluation data in the
[binary `.spacy` format](/api/data-formats#binary-training). The
`--gold-preproc` option sets up the evaluation examples with gold-standard
sentences and tokens for the predictions. Gold preprocessing helps the

@@ -819,7 +824,7 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp

| Name | Description |
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ |
+| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |

@@ -831,13 +836,12 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp

## package {#package tag="command"}

-Generate an installable
-[model Python package](/usage/training#models-generating) from an existing model
-data directory. All data files are copied over. If the path to a
-[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
-the input directory, this file is used. Otherwise, the data can be entered
-directly from the command line. spaCy will then create a `.tar.gz` archive file
-that you can distribute and install with `pip install`.
+Generate an installable [Python package](/usage/training#models-generating) from
+an existing pipeline data directory. All data files are copied over. If the path
+to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is
+found in the input directory, this file is used. Otherwise, the data can be
+entered directly from the command line. spaCy will then create a `.tar.gz`
+archive file that you can distribute and install with `pip install`.

<Infobox title="New in v3.0" variant="warning">

@@ -855,13 +859,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
>
> ```cli
> $ python -m spacy package /input /output
-> $ cd /output/en_model-0.0.0
-> $ pip install dist/en_model-0.0.0.tar.gz
+> $ cd /output/en_pipeline-0.0.0
+> $ pip install dist/en_pipeline-0.0.0.tar.gz
> ```

| Name | Description |
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ |
+| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |

@@ -869,13 +873,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | A Python package containing the spaCy model. |
+| **CREATES** | A Python package containing the spaCy pipeline. |

## project {#project new="3"}
|
## project {#project new="3"}
|
||||||
|
|
||||||
The `spacy project` CLI includes subcommands for working with
|
The `spacy project` CLI includes subcommands for working with
|
||||||
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
||||||
deploying custom spaCy models.
|
deploying custom spaCy pipelines.
|
||||||
|
|
||||||
### project clone {#project-clone tag="command"}
|
### project clone {#project-clone tag="command"}
|
||||||
|
|
||||||
|
@ -1015,9 +1019,9 @@ Download all files or directories listed as `outputs` for commands, unless they
|
||||||
are already present locally. When searching for files in the remote, `pull`
|
are already present locally. When searching for files in the remote, `pull`
|
||||||
won't just look at the output path, but will also consider the **command
|
won't just look at the output path, but will also consider the **command
|
||||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||||
previously pushed a model checkpoint to the remote, but now you've changed some
|
previously pushed a checkpoint to the remote, but now you've changed some
|
||||||
hyper-parameters. Because you've changed the inputs to the command, if you run
|
hyper-parameters. Because you've changed the inputs to the command, if you run
|
||||||
`pull`, you won't retrieve the stale result. If you train your model and push
|
`pull`, you won't retrieve the stale result. If you train your pipeline and push
|
||||||
the outputs to the remote, the outputs will be saved alongside the prior
|
the outputs to the remote, the outputs will be saved alongside the prior
|
||||||
outputs, so if you change the config back, you'll be able to fetch back the
|
outputs, so if you change the config back, you'll be able to fetch back the
|
||||||
result.
|
result.
|
||||||
|
|
|
@ -6,18 +6,18 @@ menu:
|
||||||
- ['Training Data', 'training']
|
- ['Training Data', 'training']
|
||||||
- ['Pretraining Data', 'pretraining']
|
- ['Pretraining Data', 'pretraining']
|
||||||
- ['Vocabulary', 'vocab-jsonl']
|
- ['Vocabulary', 'vocab-jsonl']
|
||||||
- ['Model Meta', 'meta']
|
- ['Pipeline Meta', 'meta']
|
||||||
---
|
---
|
||||||
|
|
||||||
This section documents input and output formats of data used by spaCy, including
|
This section documents input and output formats of data used by spaCy, including
|
||||||
the [training config](/usage/training#config), training data and lexical
|
the [training config](/usage/training#config), training data and lexical
|
||||||
vocabulary data. For an overview of label schemes used by the models, see the
|
vocabulary data. For an overview of label schemes used by the models, see the
|
||||||
[models directory](/models). Each model documents the label schemes used in its
|
[models directory](/models). Each trained pipeline documents the label schemes
|
||||||
components, depending on the data it was trained on.
|
used in its components, depending on the data it was trained on.
|
||||||
|
|
||||||
## Training config {#config new="3"}
|
## Training config {#config new="3"}
|
||||||
|
|
||||||
Config files define the training process and model pipeline and can be passed to
|
Config files define the training process and pipeline and can be passed to
|
||||||
[`spacy train`](/api/cli#train). They use
|
[`spacy train`](/api/cli#train). They use
|
||||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||||
hood. For details on how to use training configs, see the
|
hood. For details on how to use training configs, see the
|
||||||
|
@ -75,10 +75,10 @@ Defines the `nlp` object, its tokenizer and
|
||||||
[processing pipeline](/usage/processing-pipelines) component names.
|
[processing pipeline](/usage/processing-pipelines) component names.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
|
@ -105,8 +105,8 @@ This section includes definitions of the
|
||||||
[pipeline components](/usage/processing-pipelines) and their models, if
|
[pipeline components](/usage/processing-pipelines) and their models, if
|
||||||
available. Components in this section can be referenced in the `pipeline` of the
|
available. Components in this section can be referenced in the `pipeline` of the
|
||||||
`[nlp]` block. Component blocks need to specify either a `factory` (named
|
`[nlp]` block. Component blocks need to specify either a `factory` (named
|
||||||
function to use to create component) or a `source` (name or path of pretrained
|
function to use to create component) or a `source` (name or path of trained
|
||||||
model to copy components from). See the docs on
|
pipeline to copy components from). See the docs on
|
||||||
[defining pipeline components](/usage/training#config-components) for details.
|
[defining pipeline components](/usage/training#config-components) for details.
|
||||||
|
|
||||||
### paths, system {#config-variables tag="variables"}
|
### paths, system {#config-variables tag="variables"}
|
||||||
|
@ -145,7 +145,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
||||||
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
|
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
|
||||||
|
@ -184,7 +184,7 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
||||||
|
|
||||||
The main data format used in spaCy v3.0 is a **binary format** created by
|
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||||
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
||||||
objects. This means that you can train spaCy models using the same format it
|
objects. This means that you can train spaCy pipelines using the same format it
|
||||||
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
||||||
storage**, especially when packing multiple documents together.
|
storage**, especially when packing multiple documents together.
|
||||||
|
|
||||||
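
For example, a minimal sketch of packing `Doc` objects into this binary format with [`DocBin`](/api/docbin) and writing a `.spacy` file (the example text and file name are illustrative, not part of the official docs):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("I like pizza")  # in practice, Docs carrying your gold annotations

# Pack one or more Doc objects and serialize them to a .spacy file
doc_bin = DocBin(docs=[doc])
doc_bin.to_disk("./train.spacy")
```

The resulting file can then be referenced from the training config, e.g. as the training corpus path.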
|
@ -286,8 +286,8 @@ a dictionary of gold-standard annotations.
|
||||||
[internal training API](/usage/training#api) and they're expected when you call
|
[internal training API](/usage/training#api) and they're expected when you call
|
||||||
[`nlp.update`](/api/language#update). However, for most use cases, you
|
[`nlp.update`](/api/language#update). However, for most use cases, you
|
||||||
**shouldn't** have to write your own training scripts. It's recommended to train
|
**shouldn't** have to write your own training scripts. It's recommended to train
|
||||||
your models via the [`spacy train`](/api/cli#train) command with a config file
|
your pipelines via the [`spacy train`](/api/cli#train) command with a config
|
||||||
to keep track of your settings and hyperparameters and your own
|
file to keep track of your settings and hyperparameters and your own
|
||||||
[registered functions](/usage/training/#custom-code) to customize the setup.
|
[registered functions](/usage/training/#custom-code) to customize the setup.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
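
If you do call the internal API directly, a minimal sketch of a single update step could look like this (the text, label and character offsets are made up for illustration, and `spacy.training` is assumed as the v3 import path for `Example`):

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("Apple is looking at buying a startup")
# Gold-standard annotations are passed in as a dict, here one entity span
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
# nlp.update accepts a batch of Example objects and returns the losses
losses = nlp.update([example])
```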
|
@ -406,15 +406,15 @@ in line-by-line, while still making it easy to represent newlines in the data.
|
||||||
|
|
||||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||||
|
|
||||||
To populate a model's vocabulary, you can use the
|
To populate a pipeline's vocabulary, you can use the
|
||||||
[`spacy init model`](/api/cli#init-model) command and load in a
|
[`spacy init vocab`](/api/cli#init-vocab) command and load in a
|
||||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
||||||
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
||||||
language and vocabulary settings. All other lines are expected to be JSON
|
language and vocabulary settings. All other lines are expected to be JSON
|
||||||
objects describing an individual lexeme. The lexical attributes will then be set
|
objects describing an individual lexeme. The lexical attributes will then be set
|
||||||
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
||||||
command outputs a ready-to-use spaCy model with a `Vocab` containing the lexical
|
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
|
||||||
data.
|
lexical data.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### First line
|
### First line
|
||||||
|
@ -459,11 +459,11 @@ Here's an example of the 20 most frequent lexemes in the English training data:
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
|
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
|
||||||
```
|
```
|
||||||
|
|
||||||
## Model meta {#meta}
|
## Pipeline meta {#meta}
|
||||||
|
|
||||||
The model meta is available as the file `meta.json` and exported automatically
|
The pipeline meta is available as the file `meta.json` and exported
|
||||||
when you save an `nlp` object to disk. Its contents are available as
|
automatically when you save an `nlp` object to disk. Its contents are available
|
||||||
[`nlp.meta`](/api/language#meta).
|
as [`nlp.meta`](/api/language#meta).
|
||||||
|
|
||||||
<Infobox variant="warning" title="Changed in v3.0">
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
@ -473,8 +473,8 @@ creating a Python package with [`spacy package`](/api/cli#package). How to set
|
||||||
up the `nlp` object is now defined in the
|
up the `nlp` object is now defined in the
|
||||||
[`config.cfg`](/api/data-formats#config), which includes detailed information
|
[`config.cfg`](/api/data-formats#config), which includes detailed information
|
||||||
about the pipeline components and their model architectures, and all other
|
about the pipeline components and their model architectures, and all other
|
||||||
settings and hyperparameters used to train the model. It's the **single source
|
settings and hyperparameters used to train the pipeline. It's the **single
|
||||||
of truth** used for loading a model.
|
source of truth** used for loading a pipeline.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -482,12 +482,12 @@ of truth** used for loading a model.
|
||||||
>
|
>
|
||||||
> ```json
|
> ```json
|
||||||
> {
|
> {
|
||||||
> "name": "example_model",
|
> "name": "example_pipeline",
|
||||||
> "lang": "en",
|
> "lang": "en",
|
||||||
> "version": "1.0.0",
|
> "version": "1.0.0",
|
||||||
> "spacy_version": ">=3.0.0,<3.1.0",
|
> "spacy_version": ">=3.0.0,<3.1.0",
|
||||||
> "parent_package": "spacy",
|
> "parent_package": "spacy",
|
||||||
> "description": "Example model for spaCy",
|
> "description": "Example pipeline for spaCy",
|
||||||
> "author": "You",
|
> "author": "You",
|
||||||
> "email": "you@example.com",
|
> "email": "you@example.com",
|
||||||
> "url": "https://example.com",
|
> "url": "https://example.com",
|
||||||
|
@ -511,22 +511,22 @@ of truth** used for loading a model.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
||||||
| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ |
|
| `name` | Pipeline name, e.g. `"core_web_sm"`. The final package name will be `{lang}_{name}`. Defaults to `"pipeline"`. ~~str~~ |
|
||||||
| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||||
| `spacy_version` | spaCy version range the model is compatible with. Defaults to the spaCy version used to create the model, up to next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||||
| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `url` | Pipeline author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `license` | Pipeline license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
| `sources` | Data sources used to train the pipeline. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
||||||
| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
| `vectors` | Information about the word vectors included with the pipeline. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
||||||
| `pipeline` | Names of pipeline components in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
| `pipeline` | Names of pipeline components, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
||||||
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||||
| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
| `speed` | Inference speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
||||||
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create model. ~~str~~ |
|
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create pipeline. ~~str~~ |
|
||||||
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
||||||
|
|
|
@ -13,8 +13,8 @@ An `EntityLinker` component disambiguates textual mentions (tagged as named
|
||||||
entities) to unique identifiers, grounding the named entities into the "real
|
entities) to unique identifiers, grounding the named entities into the "real
|
||||||
world". It requires a `KnowledgeBase`, as well as a function to generate
|
world". It requires a `KnowledgeBase`, as well as a function to generate
|
||||||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||||
and a ML model to pick the right candidate, given the local context of the
|
and a machine learning model to pick the right candidate, given the local
|
||||||
mention.
|
context of the mention.
|
||||||
|
|
||||||
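
As a rough sketch of the `KnowledgeBase` side of this setup (the entity ID, alias, frequency, probability and vector length below are made-up values):

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
# Create a small in-memory knowledge base with 64-dimensional entity vectors
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
# Register a textual alias and the prior probability of each candidate entity
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
```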
## Config and implementation {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
@ -34,8 +34,8 @@ architectures and their arguments and hyperparameters.
|
||||||
> "incl_prior": True,
|
> "incl_prior": True,
|
||||||
> "incl_context": True,
|
> "incl_context": True,
|
||||||
> "model": DEFAULT_NEL_MODEL,
|
> "model": DEFAULT_NEL_MODEL,
|
||||||
> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
|
> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
|
||||||
> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
|
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
|
||||||
> }
|
> }
|
||||||
> nlp.add_pipe("entity_linker", config=config)
|
> nlp.add_pipe("entity_linker", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
||||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||||
>
|
>
|
||||||
> # Construction via add_pipe with custom KB and candidate generation
|
> # Construction via add_pipe with custom KB and candidate generation
|
||||||
> config = {"kb": {"@assets": "my_kb.v1"}}
|
> config = {"kb": {"@misc": "my_kb.v1"}}
|
||||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
|
|
|
@ -7,9 +7,9 @@ source: spacy/language.py
|
||||||
|
|
||||||
Usually you'll load this once per process as `nlp` and pass the instance around
|
Usually you'll load this once per process as `nlp` and pass the instance around
|
||||||
your application. The `Language` class is created when you call
|
your application. The `Language` class is created when you call
|
||||||
[`spacy.load()`](/api/top-level#spacy.load) and contains the shared vocabulary
|
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
|
||||||
and [language data](/usage/adding-languages), optional model data loaded from a
|
[language data](/usage/adding-languages), optional binary weights, e.g. provided
|
||||||
[model package](/models) or a path, and a
|
by a [trained pipeline](/models), and the
|
||||||
[processing pipeline](/usage/processing-pipelines) containing components like
|
[processing pipeline](/usage/processing-pipelines) containing components like
|
||||||
the tagger or parser that are called on a document in order. You can also add
|
the tagger or parser that are called on a document in order. You can also add
|
||||||
your own processing pipeline components that take a `Doc` object, modify it and
|
your own processing pipeline components that take a `Doc` object, modify it and
|
||||||
|
@ -37,7 +37,7 @@ Initialize a `Language` object.
|
||||||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
||||||
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
|
| `meta` | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
|
||||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
||||||
|
|
||||||
## Language.from_config {#from_config tag="classmethod" new="3"}
|
## Language.from_config {#from_config tag="classmethod" new="3"}
|
||||||
|
@ -232,7 +232,7 @@ tuples of `Doc` and `GoldParse` objects.
|
||||||
|
|
||||||
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Continue training a pretrained model. Create and return an optimizer, and
|
Continue training a trained pipeline. Create and return an optimizer, and
|
||||||
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
|
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
|
||||||
Rehearsal is used to prevent models from "forgetting" their initialized
|
Rehearsal is used to prevent models from "forgetting" their initialized
|
||||||
"knowledge". To perform rehearsal, collect samples of text you want the models
|
"knowledge". To perform rehearsal, collect samples of text you want the models
|
||||||
|
@ -314,7 +314,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
|
|
||||||
## Language.evaluate {#evaluate tag="method"}
|
## Language.evaluate {#evaluate tag="method"}
|
||||||
|
|
||||||
Evaluate a model's pipeline components.
|
Evaluate a pipeline's components.
|
||||||
|
|
||||||
<Infobox variant="warning" title="Changed in v3.0">
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
@ -386,13 +386,13 @@ component, adds it to the pipeline and returns it.
|
||||||
> nlp.add_pipe("component", before="ner")
|
> nlp.add_pipe("component", before="ner")
|
||||||
> component = nlp.add_pipe("component", name="custom_name", last=True)
|
> component = nlp.add_pipe("component", name="custom_name", last=True)
|
||||||
>
|
>
|
||||||
> # Add component from source model
|
> # Add component from source pipeline
|
||||||
> source_nlp = spacy.load("en_core_web_sm")
|
> source_nlp = spacy.load("en_core_web_sm")
|
||||||
> nlp.add_pipe("ner", source=source_nlp)
|
> nlp.add_pipe("ner", source=source_nlp)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
|
@ -401,7 +401,7 @@ component, adds it to the pipeline and returns it.
|
||||||
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
|
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
|
||||||
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
|
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
|
||||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||||
| `source` <Tag variant="new">3</Tag> | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ |
|
| `source` <Tag variant="new">3</Tag> | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
|
||||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||||
|
|
||||||
|
@ -790,9 +790,10 @@ token.ent_iob, token.ent_type
|
||||||
|
|
||||||
## Language.meta {#meta tag="property"}
|
## Language.meta {#meta tag="property"}
|
||||||
|
|
||||||
Custom meta data for the Language class. If a model is loaded, contains meta
|
Custom meta data for the Language class. If a trained pipeline is loaded, this
|
||||||
data of the model. The `Language.meta` is also what's serialized as the
|
contains meta data of the pipeline. The `Language.meta` is also what's
|
||||||
[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk.
|
serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
|
||||||
|
object to disk.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -827,13 +828,15 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk`
|
||||||
|
|
||||||
## Language.to_disk {#to_disk tag="method" new="2"}
|
## Language.to_disk {#to_disk tag="method" new="2"}
|
||||||
|
|
||||||
Save the current state to a directory. If a model is loaded, this will **include
|
Save the current state to a directory. Under the hood, this method delegates to
|
||||||
the model**.
|
the `to_disk` methods of the individual pipeline components, if available. This
|
||||||
|
means that if a trained pipeline is loaded, all components and their weights
|
||||||
|
will be saved to disk.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp.to_disk("/path/to/models")
|
> nlp.to_disk("/path/to/pipeline")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -844,22 +847,28 @@ the model**.
|
||||||
|
|
||||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||||
|
|
||||||
Loads state from a directory. Modifies the object in place and returns it. If
|
Loads state from a directory, including all data that was saved with the
|
||||||
the saved `Language` object contains a model, the model will be loaded. Note
|
`Language` object. Modifies the object in place and returns it.
|
||||||
that this method is commonly used via the subclasses like `English` or `German`
|
|
||||||
to make language-specific functionality like the
|
<Infobox variant="warning" title="Important note">
|
||||||
[lexical attribute getters](/usage/adding-languages#lex-attrs) available to the
|
|
||||||
loaded object.
|
Keep in mind that this method **only loads serialized state** and doesn't set up
|
||||||
|
the `nlp` object. This means that it requires the correct language class to be
|
||||||
|
initialized and all pipeline components to be added to the pipeline. If you want
|
||||||
|
to load a serialized pipeline from a directory, you should use
|
||||||
|
[`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.language import Language
|
> from spacy.language import Language
|
||||||
> nlp = Language().from_disk("/path/to/model")
|
> nlp = Language().from_disk("/path/to/pipeline")
|
||||||
>
|
>
|
||||||
> # using language-specific subclass
|
> # Using language-specific subclass
|
||||||
> from spacy.lang.en import English
|
> from spacy.lang.en import English
|
||||||
> nlp = English().from_disk("/path/to/en_model")
|
> nlp = English().from_disk("/path/to/pipeline")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -924,7 +933,7 @@ available to the loaded object.
|
||||||
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
||||||
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
|
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
|
||||||
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
|
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
|
||||||
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
|
| `path` <Tag variant="new">2</Tag> | Path to the pipeline data directory, if a pipeline is loaded from a path or package. Otherwise `None`. ~~Optional[Path]~~ |
|
||||||
|
|
||||||
## Class attributes {#class-attributes}
|
## Class attributes {#class-attributes}
|
||||||
|
|
||||||
|
@ -1004,7 +1013,7 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
|
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
|
||||||
> nlp.from_disk("./model-data", exclude=["ner"])
|
> nlp.from_disk("/pipeline", exclude=["ner"])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
|
|
@ -286,7 +286,7 @@ context, the original parameters are restored.
|
||||||
|
|
||||||
## Pipe.add_label {#add_label tag="method"}
|
## Pipe.add_label {#add_label tag="method"}
|
||||||
|
|
||||||
Add a new label to the pipe. It's possible to extend pretrained models with new
|
Add a new label to the pipe. It's possible to extend trained models with new
|
||||||
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
|
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -12,14 +12,14 @@ menu:
|
||||||
|
|
||||||
## spaCy {#spacy hidden="true"}
|
## spaCy {#spacy hidden="true"}
|
||||||
|
|
||||||
### spacy.load {#spacy.load tag="function" model="any"}
|
### spacy.load {#spacy.load tag="function"}
|
||||||
|
|
||||||
Load a model using the name of an installed
|
Load a pipeline using the name of an installed
|
||||||
[model package](/usage/training#models-generating), a string path or a
|
[package](/usage/saving-loading#models), a string path or a `Path`-like object.
|
||||||
`Path`-like object. spaCy will try resolving the load argument in this order. If
|
spaCy will try resolving the load argument in this order. If a pipeline is
|
||||||
a model is loaded from a model name, spaCy will assume it's a Python package and
|
loaded from a string name, spaCy will assume it's a Python package and import it
|
||||||
import it and call the model's own `load()` method. If a model is loaded from a
|
and call the package's own `load()` method. If a pipeline is loaded from a path,
|
||||||
path, spaCy will assume it's a data directory, load its
|
spaCy will assume it's a data directory, load its
|
||||||
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
||||||
information to construct the `Language` class. The data will be loaded in via
|
information to construct the `Language` class. The data will be loaded in via
|
||||||
[`Language.from_disk`](/api/language#from_disk).
|
[`Language.from_disk`](/api/language#from_disk).
|
||||||
|
@ -36,38 +36,38 @@ specified separately using the new `exclude` keyword argument.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp = spacy.load("en_core_web_sm") # package
|
> nlp = spacy.load("en_core_web_sm") # package
|
||||||
> nlp = spacy.load("/path/to/en") # string path
|
> nlp = spacy.load("/path/to/pipeline") # string path
|
||||||
> nlp = spacy.load(Path("/path/to/en")) # pathlib Path
|
> nlp = spacy.load(Path("/path/to/pipeline")) # pathlib Path
|
||||||
>
|
>
|
||||||
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
|
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||||
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
|
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
|
||||||
|
|
||||||
Essentially, `spacy.load()` is a convenience wrapper that reads the model's
|
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
|
||||||
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
|
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
|
||||||
information to construct a `Language` object, loads in the model data and
|
information to construct a `Language` object, loads in the model data and
|
||||||
returns it.
|
weights, and returns it.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Abstract example
|
### Abstract example
|
||||||
cls = util.get_lang_class(lang) # get language for ID, e.g. "en"
|
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
|
||||||
nlp = cls() # initialize the language
|
nlp = cls() # 2. Initialize it
|
||||||
for name in pipeline:
|
for name in pipeline:
|
||||||
nlp.add_pipe(name) # add component to pipeline
|
nlp.add_pipe(name) # 3. Add the component to the pipeline
|
||||||
nlp.from_disk(model_data_path) # load in model data
|
nlp.from_disk(data_path) # 4. Load in the binary data
|
||||||
```
|
```
|
||||||
|
|
||||||
### spacy.blank {#spacy.blank tag="function" new="2"}
|
### spacy.blank {#spacy.blank tag="function" new="2"}
|
||||||
|
|
||||||
Create a blank model of a given language class. This function is the twin of
|
Create a blank pipeline of a given language class. This function is the twin of
|
||||||
`spacy.load()`.
|
`spacy.load()`.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -85,9 +85,7 @@ Create a blank model of a given language class. This function is the twin of
|
||||||
### spacy.info {#spacy.info tag="function"}
|
### spacy.info {#spacy.info tag="function"}
|
||||||
|
|
||||||
The same as the [`info` command](/api/cli#info). Pretty-print information about
|
The same as the [`info` command](/api/cli#info). Pretty-print information about
|
||||||
your installation, models and local setup from within spaCy. To get the model
|
your installation, installed pipelines and local setup from within spaCy.
|
||||||
meta data as a dictionary instead, you can use the `meta` attribute on your
|
|
||||||
`nlp` object with a loaded model, e.g. `nlp.meta`.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -98,8 +96,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------ |
|
| -------------- | ---------------------------------------------------------------------------- |
|
||||||
| `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ |
|
| `model` | Optional pipeline, i.e. a package name or path. ~~Optional[str]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `markdown` | Print information as Markdown. ~~bool~~ |
|
| `markdown` | Print information as Markdown. ~~bool~~ |
|
||||||
| `silent` | Don't print anything, just return. ~~bool~~ |
|
| `silent` | Don't print anything, just return. ~~bool~~ |
|
||||||
|
@ -133,7 +131,7 @@ list of available terms, see
|
||||||
Allocate data and perform operations on [GPU](/usage/#gpu), if available. If
|
Allocate data and perform operations on [GPU](/usage/#gpu), if available. If
|
||||||
data has already been allocated on CPU, it will not be moved. Ideally, this
|
data has already been allocated on CPU, it will not be moved. Ideally, this
|
||||||
function should be called right after importing spaCy and _before_ loading any
|
function should be called right after importing spaCy and _before_ loading any
|
||||||
models.
|
pipelines.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -152,7 +150,7 @@ models.
|
||||||
Allocate data and perform operations on [GPU](/usage/#gpu). Will raise an error
|
Allocate data and perform operations on [GPU](/usage/#gpu). Will raise an error
|
||||||
if no GPU is available. If data has already been allocated on CPU, it will not
|
if no GPU is available. If data has already been allocated on CPU, it will not
|
||||||
be moved. Ideally, this function should be called right after importing spaCy
|
be moved. Ideally, this function should be called right after importing spaCy
|
||||||
and _before_ loading any models.
|
and _before_ loading any pipelines.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -271,9 +269,9 @@ If a setting is not present in the options, the default value will be used.
|
||||||
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
||||||
|
|
||||||
By default, displaCy comes with colors for all entity types used by
|
By default, displaCy comes with colors for all entity types used by
|
||||||
[spaCy models](/models). If you're using custom entity types, you can use the
|
[spaCy's trained pipelines](/models). If you're using custom entity types, you
|
||||||
`colors` setting to add your own colors for them. Your application or model
|
can use the `colors` setting to add your own colors for them. Your application
|
||||||
package can also expose a
|
or pipeline package can also expose a
|
||||||
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
|
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
|
||||||
to add custom labels and their colors automatically.
|
to add custom labels and their colors automatically.
|
||||||
|
|
||||||
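
A minimal sketch of overriding colors at render time via the `options` (the label and hex value are arbitrary):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace worked with Charles Babbage.")
# Map entity labels to custom colors; other labels fall back to the defaults
options = {"colors": {"PERSON": "#bfe1d9"}}
html = displacy.render(doc, style="ent", options=options)
```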
|
@ -309,7 +307,6 @@ factories.
|
||||||
| Registry name | Description |
|
| Registry name | Description |
|
||||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||||
| `assets` | Registry for data assets, knowledge bases etc. |
|
|
||||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
|
@ -320,6 +317,7 @@ factories.
|
||||||
| `loggers` | Registry for functions that log [training results](/usage/training). |
|
| `loggers` | Registry for functions that log [training results](/usage/training). |
|
||||||
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||||
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
||||||
|
| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |
|
||||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||||
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
|
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||||
|
@ -366,7 +364,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
|
||||||
using one of the built-in loggers listed here, you can also
|
using one of the built-in loggers listed here, you can also
|
||||||
[implement your own](/usage/training#custom-logging).
|
[implement your own](/usage/training#custom-logging).
|
||||||
|
|
||||||
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
|
#### spacy.ConsoleLogger {#ConsoleLogger tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
|
@ -412,7 +410,7 @@ start decreasing across epochs.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
|
#### spacy.WandbLogger {#WandbLogger tag="registered function"}
|
||||||
|
|
||||||
> #### Installation
|
> #### Installation
|
||||||
>
|
>
|
||||||
|
@ -468,7 +466,7 @@ Instead of using one of the built-in batchers listed here, you can also
|
||||||
[implement your own](/usage/training#custom-code-readers-batchers), which may or
|
[implement your own](/usage/training#custom-code-readers-batchers), which may or
|
||||||
may not use a custom schedule.
|
may not use a custom schedule.
|
||||||
|
|
||||||
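
For instance, a minimal sketch of a custom batcher registered in the `batchers` registry so it can be referenced from the config (the registry name `"my_simple_batcher.v1"` and the batching logic are illustrative):

```python
from typing import Any, Callable, Iterable, Iterator, List

import spacy


@spacy.registry.batchers("my_simple_batcher.v1")
def configure_simple_batcher(size: int) -> Callable[[Iterable[Any]], Iterator[List[Any]]]:
    # Return the actual batching function, as the built-in batchers do
    def simple_batcher(items: Iterable[Any]) -> Iterator[List[Any]]:
        batch: List[Any] = []
        for item in items:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    return simple_batcher
```

In the config, it could then be referenced as `@batchers = "my_simple_batcher.v1"` with a `size` setting.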
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
|
#### batch_by_words {#batch_by_words tag="registered function"}
|
||||||
|
|
||||||
Create minibatches of roughly a given number of words. If any examples are
|
Create minibatches of roughly a given number of words. If any examples are
|
||||||
longer than the specified batch length, they will appear in a batch by
|
longer than the specified batch length, they will appear in a batch by
|
||||||
|
@ -480,7 +478,7 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.batcher]
|
> [training.batcher]
|
||||||
> @batchers = "batch_by_words.v1"
|
> @batchers = "spacy.batch_by_words.v1"
|
||||||
> size = 100
|
> size = 100
|
||||||
> tolerance = 0.2
|
> tolerance = 0.2
|
||||||
> discard_oversize = false
|
> discard_oversize = false
|
||||||
|
@ -495,13 +493,13 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
||||||
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
|
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
|
||||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||||
|
|
||||||
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
|
#### batch_by_sequence {#batch_by_sequence tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.batcher]
|
> [training.batcher]
|
||||||
> @batchers = "batch_by_sequence.v1"
|
> @batchers = "spacy.batch_by_sequence.v1"
|
||||||
> size = 32
|
> size = 32
|
||||||
> get_length = null
|
> get_length = null
|
||||||
> ```
|
> ```
|
||||||
|
@ -513,13 +511,13 @@ Create a batcher that creates batches of the specified size.
|
||||||
| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
||||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||||
|
|
||||||
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
|
#### batch_by_padded {#batch_by_padded tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.batcher]
|
> [training.batcher]
|
||||||
> @batchers = "batch_by_padded.v1"
|
> @batchers = "spacy.batch_by_padded.v1"
|
||||||
> size = 100
|
> size = 100
|
||||||
> buffer = 256
|
> buffer = 256
|
||||||
> discard_oversize = false
|
> discard_oversize = false
|
||||||
|
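As a rough sketch (the name `my_batcher.v1` is made up, and this assumes the batchers registry is available as `spacy.registry.batchers`), a custom batcher could be registered like this and then referenced from `[training.batcher]`:

```python
from typing import Any, Iterable, Iterator, List
import spacy

@spacy.registry.batchers("my_batcher.v1")
def configure_my_batcher(size: int):
    def batcher(items: Iterable[Any]) -> Iterator[List[Any]]:
        # Group items into fixed-size batches, yielding the last partial batch too
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch
    return batcher
```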
@@ -666,8 +664,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.

### util.load_model {#util.load_model tag="function" new="2"}

Load a model from a package or data path. If called with a package name, spaCy
Load a pipeline from a package or data path. If called with a string name, spaCy
will assume the model is a Python package and import and call its `load()`
will assume the pipeline is a Python package and import and call its `load()`
method. If called with a path, spaCy will assume it's a data directory, read the
language and pipeline settings from the [`config.cfg`](/api/data-formats#config)
and create a `Language` object. The model data will then be loaded in via

@@ -683,16 +681,16 @@ and create a `Language` object. The model data will then be loaded in via

| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | Package name or model path. ~~str~~ |
| `name` | Package name or path. ~~str~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |

### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}

A helper function to use in the `load()` method of a model package's
A helper function to use in the `load()` method of a pipeline package's
[`__init__.py`](https://github.com/explosion/spacy-models/tree/master/template/model/xx_model_name/__init__.py).

> #### Example
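For illustration, a minimal usage sketch of loading a pipeline with some of the arguments listed above (the package name and path are placeholders):

```python
import spacy
from spacy import util

# Load by package name, skipping the parser at runtime
nlp = spacy.load("en_core_web_sm", disable=["parser"])

# util.load_model accepts a package name or a data directory path
nlp_from_path = util.load_model("./my_pipeline")
```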
@@ -706,70 +704,72 @@ A helper function to use in the `load()` method of a model package's

| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |

### util.load_config {#util.load_config tag="function" new="3"}

Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The
Load a pipeline's [`config.cfg`](/api/data-formats#config) from a file path. The
config typically includes details about the model pipeline and how its
config typically includes details about the components and how they're created,
components are created, as well as all training settings and hyperparameters.
as well as all training settings and hyperparameters.

> #### Example
>
> ```python
> config = util.load_config("/path/to/model/config.cfg")
> config = util.load_config("/path/to/config.cfg")
> print(config.to_str())
> ```

| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
| `path` | Path to the pipeline's `config.cfg`. ~~Union[str, Path]~~ |
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
| **RETURNS** | The model's config. ~~Config~~ |
| **RETURNS** | The pipeline's config. ~~Config~~ |

### util.load_meta {#util.load_meta tag="function" new="3"}

Get a model's [`meta.json`](/api/data-formats#meta) from a file path and
Get a pipeline's [`meta.json`](/api/data-formats#meta) from a file path and
validate its contents.
validate its contents. The meta typically includes details about author,
licensing, data sources and version.

> #### Example
>
> ```python
> meta = util.load_meta("/path/to/model/meta.json")
> meta = util.load_meta("/path/to/meta.json")
> ```

| Name | Description |
| ----------- | -------------------------------------------------------- |
| `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ |
| `path` | Path to the pipeline's `meta.json`. ~~Union[str, Path]~~ |
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
| **RETURNS** | The pipeline's meta data. ~~Dict[str, Any]~~ |

### util.get_installed_models {#util.get_installed_models tag="function" new="3"}

List all model packages installed in the current environment. This will include
List all pipeline packages installed in the current environment. This will
any spaCy model that was packaged with [`spacy package`](/api/cli#package).
include any spaCy pipeline that was packaged with
Under the hood, model packages expose a Python entry point that spaCy can check,
[`spacy package`](/api/cli#package). Under the hood, pipeline packages expose a
without having to load the model.
Python entry point that spaCy can check, without having to load the `nlp`
object.

> #### Example
>
> ```python
> model_names = util.get_installed_models()
> names = util.get_installed_models()
> ```

| Name | Description |
| ----------- | ------------------------------------------------------------------------------------- |
| **RETURNS** | The string names of the models installed in the current environment. ~~List[str]~~ |
| **RETURNS** | The string names of the pipelines installed in the current environment. ~~List[str]~~ |

### util.is_package {#util.is_package tag="function"}

Check if string maps to a package installed via pip. Mainly used to validate
[model packages](/usage/models).
[pipeline packages](/usage/models).

> #### Example
>
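A small usage sketch for the two helpers above; the paths and the config key are placeholders:

```python
from spacy import util

config = util.load_config("./config.cfg", overrides={"training.seed": 0}, interpolate=False)
meta = util.load_meta("./meta.json")
print(config.to_str())
print(meta.get("version"))
```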
@@ -786,7 +786,8 @@ Check if string maps to a package installed via pip. Mainly used to validate

### util.get_package_path {#util.get_package_path tag="function" new="2"}

Get path to an installed package. Mainly used to resolve the location of
[model packages](/usage/models). Currently imports the package to find its path.
[pipeline packages](/usage/models). Currently imports the package to find its
path.

> #### Example
>

@@ -796,9 +797,9 @@ Get path to an installed package. Mainly used to resolve the location of

> ```

| Name | Description |
| -------------- | -------------------------------------------- |
| `package_name` | Name of installed package. ~~str~~ |
| **RETURNS** | Path to model package directory. ~~Path~~ |
| **RETURNS** | Path to pipeline package directory. ~~Path~~ |

### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
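For illustration, the helpers around installed packages can be combined like this (a minimal sketch):

```python
from spacy import util

# List installed pipeline packages and resolve their installation paths
for name in util.get_installed_models():
    if util.is_package(name):
        print(name, util.get_package_path(name))
```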
@@ -453,7 +453,7 @@ using the `@spacy.registry.span_getters` decorator.

> #### Example
>
> ```python
> @spacy.registry.span_getters("sent_spans.v1")
> @spacy.registry.span_getters("custom_sent_spans")
> def configure_get_sent_spans() -> Callable:
>     def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
>         return [list(doc.sents) for doc in docs]

@@ -472,7 +472,7 @@ using the `@spacy.registry.span_getters` decorator.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "doc_spans.v1"
> @span_getters = "spacy-transformers.doc_spans.v1"
> ```

Create a span getter that uses the whole document as its spans. This is the best

@@ -485,7 +485,7 @@ texts.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "sent_spans.v1"
> @span_getters = "spacy-transformers.sent_spans.v1"
> ```

Create a span getter that uses sentence boundary markers to extract the spans.

@@ -500,7 +500,7 @@ more meaningful windows to attend over.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "strided_spans.v1"
> @span_getters = "spacy-transformers.strided_spans.v1"
> window = 128
> stride = 96
> ```
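For illustration, a custom span getter along the lines described above could be registered with the `@spacy.registry.span_getters` decorator – a minimal sketch with a made-up name:

```python
from typing import Callable, Iterable, List
import spacy
from spacy.tokens import Doc, Span

@spacy.registry.span_getters("custom_strided_spans.v1")
def configure_custom_strided_spans(window: int, stride: int) -> Callable:
    def get_strided_spans(docs: Iterable[Doc]) -> List[List[Span]]:
        # Slice each doc into overlapping windows of `window` tokens
        spans = []
        for doc in docs:
            doc_spans = []
            start = 0
            while start < len(doc):
                doc_spans.append(doc[start : start + window])
                start += stride
            spans.append(doc_spans)
        return spans
    return get_strided_spans
```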
@@ -1,6 +1,6 @@

---
title: Models
title: Trained Models & Pipelines
teaser: Downloadable pretrained models for spaCy
teaser: Downloadable trained pipelines and weights for spaCy
menu:
  - ['Quickstart', 'quickstart']
  - ['Conventions', 'conventions']

@@ -8,15 +8,15 @@ menu:

<!-- Update page, refer to new /api/architectures and training docs -->

The models directory includes two types of pretrained models:
This directory includes two types of packages:

1. **Core models:** General-purpose pretrained models to predict named entities,
1. **Trained pipelines:** General-purpose spaCy pipelines to predict named
   part-of-speech tags and syntactic dependencies. Can be used out-of-the-box
   entities, part-of-speech tags and syntactic dependencies. Can be used
   and fine-tuned on more specific data.
   out-of-the-box and fine-tuned on more specific data.
2. **Starter models:** Transfer learning starter packs with pretrained weights
2. **Starters:** Transfer learning starter packs with pretrained weights you can
   you can initialize your models with to achieve better accuracy. They can
   initialize your pipeline models with to achieve better accuracy. They can
   include word vectors (which will be used as features during training) or
   other pretrained representations like BERT. These models don't include
   other pretrained representations like BERT. These packages don't include
   components for specific tasks like NER or text classification and are
   intended to be used as base models when training your own models.
@@ -28,43 +28,42 @@ import QuickstartModels from 'widgets/quickstart-models.js'

<Infobox title="Installation and usage" emoji="📖">

For more details on how to use models with spaCy, see the
For more details on how to use trained pipelines with spaCy, see the
[usage guide on models](/usage/models).
[usage guide](/usage/models).

</Infobox>

## Model naming conventions {#conventions}
## Package naming conventions {#conventions}

In general, spaCy expects all model packages to follow the naming convention of
In general, spaCy expects all pipeline packages to follow the naming convention
`[lang]_[name]`. For spaCy's models, we also chose to divide the name into
of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name
three components:
into three components:

1. **Type:** Model capabilities (e.g. `core` for general-purpose model with
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
   vocabulary, syntax, entities and word vectors, or `depent` for only vocab,
   syntax and entities).
2. **Genre:** Type of text the model is trained on, e.g. `web` or `news`.
2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`.
3. **Size:** Model size indicator, `sm`, `md` or `lg`.
3. **Size:** Package size indicator, `sm`, `md` or `lg`.

For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English
model trained on written web text (blogs, news, comments), that includes
pipeline trained on written web text (blogs, news, comments), that includes
vocabulary, vectors, syntax and entities.

### Model versioning {#model-versioning}
### Package versioning {#model-versioning}

Additionally, the model versioning reflects both the compatibility with spaCy,
Additionally, the pipeline package versioning reflects both the compatibility
as well as the major and minor model version. A model version `a.b.c` translates
with spaCy, as well as the major and minor version. A package version `a.b.c`
to:
translates to:

- `a`: **spaCy major version**. For example, `2` for spaCy v2.x.
- `b`: **Model major version**. Models with a different major version can't be
- `b`: **Package major version**. Pipelines with a different major version can't
  loaded by the same code. For example, changing the width of the model, adding
  be loaded by the same code. For example, changing the width of the model,
  hidden layers or changing the activation changes the model major version.
  adding hidden layers or changing the activation changes the major version.
- `c`: **Model minor version**. Same model structure, but different parameter
- `c`: **Package minor version**. Same pipeline structure, but different
  values, e.g. from being trained on different data, for different numbers of
  parameter values, e.g. from being trained on different data, for different
  iterations, etc.
  numbers of iterations, etc.

For a detailed compatibility overview, see the
[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json)
[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
in the models repository. This is also the source of spaCy's internal
This is also the source of spaCy's internal compatibility check, performed when
compatibility check, performed when you run the [`download`](/api/cli#download)
you run the [`download`](/api/cli#download) command.
command.
@@ -1,9 +1,9 @@

When you call `nlp` on a text, spaCy first tokenizes the text to produce a `Doc`
object. The `Doc` is then processed in several different steps – this is also
referred to as the **processing pipeline**. The pipeline used by the
[default models](/models) typically include a tagger, a lemmatizer, a parser and
[trained pipelines](/models) typically include a tagger, a lemmatizer, a parser
an entity recognizer. Each pipeline component returns the processed `Doc`, which
and an entity recognizer. Each pipeline component returns the processed `Doc`,
is then passed on to the next component.
which is then passed on to the next component.

![The processing pipeline](../../images/pipeline.svg)

@@ -23,14 +23,15 @@ is then passed on to the next component.

| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
| **custom** | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |

The processing pipeline always **depends on the statistical model** and its
The capabilities of a processing pipeline always depend on the components, their
capabilities. For example, a pipeline can only include an entity recognizer
models and how they were trained. For example, a pipeline for named entity
component if the model includes data to make predictions of entity labels. This
recognition needs to include a trained named entity recognizer component with a
is why each model will specify the pipeline to use in its meta data and
statistical model and weights that enable it to **make predictions** of entity
[config](/usage/training#config), as a simple list containing the component
labels. This is why each pipeline specifies its components and their settings in
names:
the [config](/usage/training#config):

```ini
[nlp]
pipeline = ["tagger", "parser", "ner"]
```
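A minimal sketch of inspecting which components a loaded pipeline defines (the package name is just an example):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)  # the component names defined in the pipeline's config
doc = nlp("Apple is looking at buying a U.K. startup.")
```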
@@ -1,9 +1,9 @@

After tokenization, spaCy can **parse** and **tag** a given `Doc`. This is where
the statistical model comes in, which enables spaCy to **make a prediction** of
the trained pipeline and its statistical models come in, which enable spaCy to
which tag or label most likely applies in this context. A model consists of
**make predictions** of which tag or label most likely applies in this context.
binary data and is produced by showing a system enough examples for it to make
A trained component includes binary data that is produced by showing a system
predictions that generalize across the language – for example, a word following
enough examples for it to make predictions that generalize across the language –
"the" in English is most likely a noun.
for example, a word following "the" in English is most likely a noun.

Linguistic annotations are available as
[`Token` attributes](/api/token#attributes). Like many NLP libraries, spaCy

@@ -25,7 +25,8 @@ for token in doc:

> - **Text:** The original word text.
> - **Lemma:** The base form of the word.
> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) part-of-speech tag.
> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
>   part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens.
> - **Shape:** The word shape – capitalization, punctuation, digits.
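A minimal sketch printing the attributes listed above for each token (the package name is just an example):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")
for token in doc:
    # Text, lemma, coarse and fine-grained POS, dependency relation and shape
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_)
```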
@@ -1,9 +1,9 @@

If you've been modifying the pipeline, vocabulary, vectors and entities, or made
updates to the model, you'll eventually want to **save your progress** – for
updates to the component models, you'll eventually want to **save your
example, everything that's in your `nlp` object. This means you'll have to
progress** – for example, everything that's in your `nlp` object. This means
translate its contents and structure into a format that can be saved, like a
you'll have to translate its contents and structure into a format that can be
file or a byte string. This process is called serialization. spaCy comes with
saved, like a file or a byte string. This process is called serialization. spaCy
**built-in serialization methods** and supports the
comes with **built-in serialization methods** and supports the
[Pickle protocol](https://www.diveinto.org/python3/serializing.html#dump).

> #### What's pickle?
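A minimal sketch of both serialization styles (the output path is a placeholder):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Serialize me")

nlp.to_disk("./my_pipeline")   # save the whole nlp object to a directory
doc_bytes = doc.to_bytes()     # or turn a single Doc into a byte string

nlp2 = spacy.load("./my_pipeline")  # load it back from the saved directory
```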
@@ -1,25 +1,25 @@

spaCy's tagger, parser, text categorizer and many other components are powered
by **statistical models**. Every "decision" these components make – for example,
which part-of-speech tag to assign, or whether a word is a named entity – is a
**prediction** based on the model's current **weight values**. The weight
**prediction** based on the model's current **weight values**. The weight values
values are estimated based on examples the model has seen
are estimated based on examples the model has seen during **training**. To train
during **training**. To train a model, you first need training data – examples
a model, you first need training data – examples of text, and the labels you
of text, and the labels you want the model to predict. This could be a
want the model to predict. This could be a part-of-speech tag, a named entity or
part-of-speech tag, a named entity or any other information.
any other information.

Training is an iterative process in which the model's predictions are compared
against the reference annotations in order to estimate the **gradient of the
loss**. The gradient of the loss is then used to calculate the gradient of the
weights through [backpropagation](https://thinc.ai/backprop101). The gradients
indicate how the weight values should be changed so that the model's
indicate how the weight values should be changed so that the model's predictions
predictions become more similar to the reference labels over time.
become more similar to the reference labels over time.

> - **Training data:** Examples and their annotations.
> - **Text:** The input text the model should predict a label for.
> - **Label:** The label the model should predict.
> - **Gradient:** The direction and rate of change for a numeric value.
>   Minimising the gradient of the weights should result in predictions that
>   Minimising the gradient of the weights should result in predictions that are
>   are closer to the reference labels on the training data.
>   closer to the reference labels on the training data.

![The training process](../../images/training.svg)
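As a rough sketch of the update step described above, assuming spaCy v3's `Example` and `nlp.update` API; the label and character offsets are made up for illustration:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("FRUIT")

optimizer = nlp.initialize()
doc = nlp.make_doc("I like apples")
example = Example.from_dict(doc, {"entities": [(7, 13, "FRUIT")]})
losses = nlp.update([example], sgd=optimizer)  # compare predictions to annotations
print(losses)
```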
@@ -24,12 +24,12 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

<Infobox title="Important note" variant="warning">

To make them compact and fast, spaCy's small [models](/models) (all packages
To make them compact and fast, spaCy's small [pipeline packages](/models) (all
that end in `sm`) **don't ship with word vectors**, and only include
packages that end in `sm`) **don't ship with word vectors**, and only include
context-sensitive **tensors**. This means you can still use the `similarity()`
methods to compare documents, spans and tokens – but the result won't be as
good, and individual tokens won't have any vectors assigned. So in order to use
_real_ word vectors, you need to download a larger model:
_real_ word vectors, you need to download a larger pipeline package:

```diff
- python -m spacy download en_core_web_sm

@@ -38,11 +38,11 @@ _real_ word vectors, you need to download a larger model:

</Infobox>

Models that come with built-in word vectors make them available as the
Pipeline packages that come with built-in word vectors make them available as
[`Token.vector`](/api/token#vector) attribute. [`Doc.vector`](/api/doc#vector)
the [`Token.vector`](/api/token#vector) attribute.
and [`Span.vector`](/api/span#vector) will default to an average of their token
[`Doc.vector`](/api/doc#vector) and [`Span.vector`](/api/span#vector) will
vectors. You can also check if a token has a vector assigned, and get the L2
default to an average of their token vectors. You can also check if a token has
norm, which can be used to normalize vectors.
a vector assigned, and get the L2 norm, which can be used to normalize vectors.

```python
### {executable="true"}

@@ -62,12 +62,12 @@ for token in tokens:

> - **OOV**: Out-of-vocabulary

The words "dog", "cat" and "banana" are all pretty common in English, so they're
part of the model's vocabulary, and come with a vector. The word "afskfsd" on
part of the pipeline's vocabulary, and come with a vector. The word "afskfsd" on
the other hand is a lot less common and out-of-vocabulary – so its vector
representation consists of 300 dimensions of `0`, which means it's practically
nonexistent. If your application will benefit from a **large vocabulary** with
more vectors, you should consider using one of the larger models or loading in a
more vectors, you should consider using one of the larger pipeline packages or
full vector package, for example,
loading in a full vector package, for example,
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes
over **1 million unique vectors**.
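A minimal sketch of checking vectors on individual tokens, assuming a package with vectors such as `en_core_web_md` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    # has_vector, the L2 norm and the out-of-vocabulary flag
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
```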
@@ -82,7 +82,7 @@ Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and

method that lets you compare it with another object, and determine the
similarity. Of course similarity is always subjective – whether two words, spans
or documents are similar really depends on how you're looking at it. spaCy's
similarity model usually assumes a pretty general-purpose definition of
similarity implementation usually assumes a pretty general-purpose definition of
similarity.

> #### 📝 Things to try

@@ -99,7 +99,7 @@ similarity.

### {executable="true"}
import spacy

nlp = spacy.load("en_core_web_md")  # make sure to use larger model!
nlp = spacy.load("en_core_web_md")  # make sure to use larger package!
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
|
||||||
detailed word vectors. It combines noun phrases like "fast food" or "fair game"
|
detailed word vectors. It combines noun phrases like "fast food" or "fair game"
|
||||||
and includes the part-of-speech tags and entity labels. The library also
|
and includes the part-of-speech tags and entity labels. The library also
|
||||||
includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy)
|
includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy)
|
||||||
that let you evaluate vector models and create terminology lists. For more
|
that let you evaluate vectors and create terminology lists. For more details,
|
||||||
details, check out
|
check out [our blog post](https://explosion.ai/blog/sense2vec-reloaded). To
|
||||||
[our blog post](https://explosion.ai/blog/sense2vec-reloaded). To explore the
|
explore the semantic similarities across all Reddit comments of 2015 and 2019,
|
||||||
semantic similarities across all Reddit comments of 2015 and 2019, see the
|
see the [interactive demo](https://explosion.ai/demos/sense2vec).
|
||||||
[interactive demo](https://explosion.ai/demos/sense2vec).
|
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
|
@@ -331,7 +331,7 @@ name = "bert-base-cased"

tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "doc_spans.v1"
@span_getters = "spacy-transformers.doc_spans.v1"

[components.transformer.annotation_setter]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"

@@ -369,8 +369,9 @@ all defaults.

To change any of the settings, you can edit the `config.cfg` and re-run the
training. To change any of the functions, like the span getter, you can replace
the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to
process sentences. You can also register your own functions using the
the name of the referenced function – e.g.
`@span_getters = "spacy-transformers.sent_spans.v1"` to process sentences. You
can also register your own functions using the
[`span_getters` registry](/api/top-level#registry). For instance, the following
custom function returns [`Span`](/api/span) objects following sentence
boundaries, unless a sentence succeeds a certain amount of tokens, in which case
@@ -35,10 +35,10 @@ Using pip, spaCy releases are available as source packages and binary wheels.

$ pip install -U spacy
```

> #### Download models
> #### Download pipelines
>
> After installation you need to download a language model. For more info and
> After installation you typically want to download a trained pipeline. For more
> available models, see the [docs on models](/models).
> info and available packages, see the [models directory](/models).
>
> ```cli
> $ python -m spacy download en_core_web_sm

@@ -54,7 +54,7 @@ To install additional data tables for lemmatization you can run

[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
separately. The lookups package is needed to provide normalization and
lemmatization data for new models and to lemmatize in languages that don't yet
come with pretrained models and aren't powered by third-party libraries.
come with trained pipelines and aren't powered by third-party libraries.

</Infobox>
|
@ -88,23 +88,21 @@ and pull requests to the recipe and setup are always appreciated.
|
||||||
> spaCy v2.x to v3.x may still require some changes to your code base. For
|
> spaCy v2.x to v3.x may still require some changes to your code base. For
|
||||||
> details see the sections on [backwards incompatibilities](/usage/v3#incompat)
|
> details see the sections on [backwards incompatibilities](/usage/v3#incompat)
|
||||||
> and [migrating](/usage/v3#migrating). Also remember to download the new
|
> and [migrating](/usage/v3#migrating). Also remember to download the new
|
||||||
> models, and retrain your own models.
|
> trained pipelines, and retrain your own pipelines.
|
||||||
|
|
||||||
When updating to a newer version of spaCy, it's generally recommended to start
|
When updating to a newer version of spaCy, it's generally recommended to start
|
||||||
with a clean virtual environment. If you're upgrading to a new major version,
|
with a clean virtual environment. If you're upgrading to a new major version,
|
||||||
make sure you have the latest **compatible models** installed, and that there
|
make sure you have the latest **compatible trained pipelines** installed, and
|
||||||
are no old and incompatible model packages left over in your environment, as
|
that there are no old and incompatible packages left over in your environment,
|
||||||
this can often lead to unexpected results and errors. If you've trained your own
|
as this can often lead to unexpected results and errors. If you've trained your
|
||||||
models, keep in mind that your train and runtime inputs must match. This means
|
own models, keep in mind that your train and runtime inputs must match. This
|
||||||
you'll have to **retrain your models** with the new version.
|
means you'll have to **retrain your pipelines** with the new version.
|
||||||
|
|
||||||
spaCy also provides a [`validate`](/api/cli#validate) command, which lets you
|
spaCy also provides a [`validate`](/api/cli#validate) command, which lets you
|
||||||
verify that all installed models are compatible with your spaCy version. If
|
verify that all installed pipeline packages are compatible with your spaCy
|
||||||
incompatible models are found, tips and installation instructions are printed.
|
version. If incompatible packages are found, tips and installation instructions
|
||||||
The command is also useful to detect out-of-sync model links resulting from
|
are printed. It's recommended to run the command with `python -m` to make sure
|
||||||
links created in different virtual environments. It's recommended to run the
|
you're executing the correct version of spaCy.
|
||||||
command with `python -m` to make sure you're executing the correct version of
|
|
||||||
spaCy.
|
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ pip install -U spacy
|
$ pip install -U spacy
|
||||||
|
@@ -132,8 +130,8 @@ $ pip install -U spacy[cuda92]

Once you have a GPU-enabled installation, the best way to activate it is to call
[`spacy.prefer_gpu`](/api/top-level#spacy.prefer_gpu) or
[`spacy.require_gpu()`](/api/top-level#spacy.require_gpu) somewhere in your
script before any models have been loaded. `require_gpu` will raise an error if
script before any pipelines have been loaded. `require_gpu` will raise an error
no GPU is available.
if no GPU is available.

```python
import spacy
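A minimal sketch of activating the GPU before loading a pipeline (the package name is just an example):

```python
import spacy

spacy.prefer_gpu()  # or spacy.require_gpu() to fail if no GPU is found
nlp = spacy.load("en_core_web_sm")
```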
@@ -238,16 +236,16 @@ installing, loading and using spaCy, as well as their solutions.

<Accordion title="No compatible model found" id="compatible-model">

```
No compatible model found for [lang] (spaCy vX.X.X).
No compatible package found for [lang] (spaCy vX.X.X).
```

This usually means that the model you're trying to download does not exist, or
This usually means that the trained pipeline you're trying to download does not
isn't available for your version of spaCy. Check the
exist, or isn't available for your version of spaCy. Check the
[compatibility table](https://github.com/explosion/spacy-models/tree/master/compatibility.json)
to see which models are available for your spaCy version. If you're using an old
to see which packages are available for your spaCy version. If you're using an
version, consider upgrading to the latest release. Note that while spaCy
old version, consider upgrading to the latest release. Note that while spaCy
supports tokenization for [a variety of languages](/usage/models#languages), not
all of them come with statistical models. To only use the tokenizer, import the
all of them come with trained pipelines. To only use the tokenizer, import the
language's `Language` class instead, for example
`from spacy.lang.fr import French`.
@@ -259,7 +257,7 @@ language's `Language` class instead, for example

no such option: --no-cache-dir
```

The `download` command uses pip to install the models and sets the
The `download` command uses pip to install the pipeline packages and sets the
`--no-cache-dir` flag to prevent it from requiring too much memory.
[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
@@ -323,19 +321,19 @@ also run `which python` to find out where your Python executable is located.

</Accordion>

<Accordion title="Import error: No module named [model]" id="import-error-models">
<Accordion title="Import error: No module named [name]" id="import-error-models">

```
ImportError: No module named 'en_core_web_sm'
```

As of spaCy v1.7, all models can be installed as Python packages. This means
As of spaCy v1.7, all trained pipelines can be installed as Python packages.
that they'll become importable modules of your application. If this fails, it's
This means that they'll become importable modules of your application. If this
usually a sign that the package is not installed in the current environment. Run
fails, it's usually a sign that the package is not installed in the current
`pip list` or `pip freeze` to check which model packages you have installed, and
environment. Run `pip list` or `pip freeze` to check which pipeline packages you
install the [correct models](/models) if necessary. If you're importing a model
have installed, and install the [correct package](/models) if necessary. If
manually at the top of a file, make sure to use the name of the package, not the
you're importing a package manually at the top of a file, make sure to use the
shortcut link you've created.
full name of the package.

</Accordion>
@@ -3,57 +3,79 @@ title: Layers and Model Architectures

teaser: Power spaCy components with custom neural networks
menu:
  - ['Type Signatures', 'type-sigs']
  - ['Defining Sublayers', 'sublayers']
  - ['Swapping Architectures', 'swap-architectures']
  - ['PyTorch & TensorFlow', 'frameworks']
  - ['Thinc Models', 'thinc']
  - ['Trainable Components', 'components']
next: /usage/projects
---

A **model architecture** is a function that wires up a
[Thinc `Model`](https://thinc.ai/docs/api-model) instance, which you can then
use in a component or as a layer of a larger network. You can use Thinc as a
thin wrapper around frameworks such as PyTorch, TensorFlow or MXNet, or you can
implement your logic in Thinc directly. spaCy's built-in components will never
construct their `Model` instances themselves, so you won't have to subclass the
component to change its model architecture. You can just **update the config**
so that it refers to a different registered function. Once the component has
been created, its model instance has already been assigned, so you cannot change
its model architecture. The architecture is like a recipe for the network, and
you can't change the recipe once the dish has already been prepared. You have to
make a new one.

![Diagram of a pipeline component with its model](../images/layers-architectures.svg)

> #### Example
>
> ```python
> from thinc.api import Model, chain
>
> @spacy.registry.architectures.register("model.v1")
> def build_model(width: int, classes: int) -> Model:
>     tok2vec = build_tok2vec(width)
>     output_layer = build_output_layer(width, classes)
>     model = chain(tok2vec, output_layer)
>     return model
> ```

A **model architecture** is a function that wires up a
[Thinc `Model`](https://thinc.ai/docs/api-model) instance. It describes the
neural network that is run internally as part of a component in a spaCy
pipeline. To define the actual architecture, you can implement your logic in
Thinc directly, or you can use Thinc as a thin wrapper around frameworks such as
PyTorch, TensorFlow and MXNet. Each Model can also be used as a sublayer of a
larger network, allowing you to freely combine implementations from different
frameworks into one `Thinc` Model.

spaCy's built-in components require a `Model` instance to be passed to them via
the config system. To change the model architecture of an existing component,
you just need to [**update the config**](#swap-architectures) so that it refers
to a different registered function. Once the component has been created from
this config, you won't be able to change it anymore. The architecture is like a
recipe for the network, and you can't change the recipe once the dish has
already been prepared. You have to make a new one.

```ini
### config.cfg (excerpt)
[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "model.v1"
width = 512
classes = 16
```
## Type signatures {#type-sigs}
|
## Type signatures {#type-sigs}
|
||||||
|
|
||||||
<!-- TODO: update example, maybe simplify definition? -->
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> @spacy.registry.architectures.register("spacy.Tagger.v1")
|
> from typing import List
|
||||||
> def build_tagger_model(
|
> from thinc.api import Model, chain
|
||||||
> tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
|
> from thinc.types import Floats2d
> from spacy.tokens import Doc
|
||||||
> ) -> Model[List[Doc], List[Floats2d]]:
|
> def chain_model(
|
||||||
> t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
> tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
> output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
> layer1: Model[List[Floats2d], Floats2d],
|
||||||
> softmax = with_array(output_layer)
|
> layer2: Model[Floats2d, Floats2d]
|
||||||
> model = chain(tok2vec, softmax)
|
> ) -> Model[List[Doc], Floats2d]:
|
||||||
> model.set_ref("tok2vec", tok2vec)
|
> model = chain(tok2vec, layer1, layer2)
|
||||||
> model.set_ref("softmax", output_layer)
|
|
||||||
> model.set_ref("output_layer", output_layer)
|
|
||||||
> return model
|
> return model
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The Thinc `Model` class is a **generic type** that can specify its input and
|
The Thinc `Model` class is a **generic type** that can specify its input and
|
||||||
output types. Python uses a square-bracket notation for this, so the type
|
output types. Python uses a square-bracket notation for this, so the type
|
||||||
~~Model[List, Dict]~~ says that each batch of inputs to the model will be a
|
~~Model[List, Dict]~~ says that each batch of inputs to the model will be a
|
||||||
list, and the outputs will be a dictionary. Both `typing.List` and `typing.Dict`
|
list, and the outputs will be a dictionary. You can be even more specific and
|
||||||
are also generics, allowing you to be more specific about the data. For
|
write for instance ~~Model[List[Doc], Dict[str, float]]~~ to specify that the
|
||||||
instance, you can write ~~Model[List[Doc], Dict[str, float]]~~ to specify that
|
model expects a list of [`Doc`](/api/doc) objects as input, and returns a
|
||||||
the model expects a list of [`Doc`](/api/doc) objects as input, and returns a
|
dictionary mapping of strings to floats. Some of the most common types you'll
|
||||||
dictionary mapping strings to floats. Some of the most common types you'll see
|
see are:
|
||||||
are:
|
|
||||||
|
|
||||||
| Type | Description |
|
| Type | Description |
|
||||||
| ------------------ | ---------------------------------------------------------------------------------------------------- |
|
| ------------------ | ---------------------------------------------------------------------------------------------------- |
|
||||||
|
@ -62,7 +84,7 @@ are:
|
||||||
| ~~Ints2d~~ | A two-dimensional `numpy` or `cupy` array of integers. Common dtypes include uint64, int32 and int8. |
|
| ~~Ints2d~~ | A two-dimensional `numpy` or `cupy` array of integers. Common dtypes include uint64, int32 and int8. |
|
||||||
| ~~List[Floats2d]~~ | A list of two-dimensional arrays, generally with one array per `Doc` and one row per token. |
|
| ~~List[Floats2d]~~ | A list of two-dimensional arrays, generally with one array per `Doc` and one row per token. |
|
||||||
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
|
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
|
||||||
| ~~Padded~~ | A container to handle variable-length sequence data in a passed contiguous array. |
|
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |
|
||||||
|
|
||||||
The model type signatures help you figure out which model architectures and
|
The model type signatures help you figure out which model architectures and
|
||||||
components can **fit together**. For instance, the
|
components can **fit together**. For instance, the
|
||||||
|
@ -78,10 +100,10 @@ interchangeably. There are many other ways they could be incompatible. However,
|
||||||
if the types don't match, they almost surely _won't_ be compatible. This little
|
if the types don't match, they almost surely _won't_ be compatible. This little
|
||||||
bit of validation goes a long way, especially if you
|
bit of validation goes a long way, especially if you
|
||||||
[configure your editor](https://thinc.ai/docs/usage-type-checking) or other
|
[configure your editor](https://thinc.ai/docs/usage-type-checking) or other
|
||||||
tools to highlight these errors early. Thinc will also verify that your types
|
tools to highlight these errors early. The config file is also validated at the
|
||||||
match correctly when your config file is processed at the beginning of training.
|
beginning of training, to verify that all the types match correctly.
|
||||||
|
|
||||||
<Infobox title="Tip: Static type checking in your editor" emoji="💡">
|
<Accordion title="Tip: Static type checking in your editor">
|
||||||
|
|
||||||
If you're using a modern editor like Visual Studio Code, you can
|
If you're using a modern editor like Visual Studio Code, you can
|
||||||
[set up `mypy`](https://thinc.ai/docs/usage-type-checking#install) with the
|
[set up `mypy`](https://thinc.ai/docs/usage-type-checking#install) with the
|
||||||
|
@ -90,86 +112,144 @@ code.
|
||||||
|
|
||||||
[![](../images/thinc_mypy.jpg)](https://thinc.ai/docs/usage-type-checking#linting)
|
[![](../images/thinc_mypy.jpg)](https://thinc.ai/docs/usage-type-checking#linting)
|
||||||
|
|
||||||
</Infobox>
|
</Accordion>
|
||||||
|
|
||||||
## Defining sublayers {#sublayers}
|
## Swapping model architectures {#swap-architectures}
|
||||||
|
|
||||||
Model architecture functions often accept **sublayers as arguments**, so that
|
If no model is specified for the [`TextCategorizer`](/api/textcategorizer), the
|
||||||
|
[TextCatEnsemble](/api/architectures#TextCatEnsemble) architecture is used by
|
||||||
|
default. This architecture combines a simple bag-of-words model with a neural
|
||||||
|
network, usually resulting in the most accurate results, but at the cost of
|
||||||
|
speed. The config file for this model would look something like this:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[components.textcat]
|
||||||
|
factory = "textcat"
|
||||||
|
labels = []
|
||||||
|
|
||||||
|
[components.textcat.model]
|
||||||
|
@architectures = "spacy.TextCatEnsemble.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 64
|
||||||
|
conv_depth = 2
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
ngram_size = 1
|
||||||
|
dropout = 0
|
||||||
|
nO = null
|
||||||
|
```
|
||||||
|
|
||||||
|
spaCy has two additional built-in `textcat` architectures, and you can easily
|
||||||
|
use those by swapping out the definition of the textcat's model. For instance,
|
||||||
|
to use the simple and fast bag-of-words model
|
||||||
|
[TextCatBOW](/api/architectures#TextCatBOW), you can change the config to:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt) {highlight="6-10"}
|
||||||
|
[components.textcat]
|
||||||
|
factory = "textcat"
|
||||||
|
labels = []
|
||||||
|
|
||||||
|
[components.textcat.model]
|
||||||
|
@architectures = "spacy.TextCatBOW.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
ngram_size = 1
|
||||||
|
no_output_layer = false
|
||||||
|
nO = null
|
||||||
|
```
|
||||||
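The same swap can also be made in Python when assembling a pipeline manually,
by passing a model block to `nlp.add_pipe`. The snippet below is only a sketch
mirroring the config excerpt above; any settings left out fall back to the
component defaults.

```python
# Sketch: overriding the textcat model when adding the component in code.
# The architecture name and settings mirror the TextCatBOW config above.
import spacy

nlp = spacy.blank("en")
model_config = {
    "model": {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": False,
        "ngram_size": 1,
        "no_output_layer": False,
    }
}
textcat = nlp.add_pipe("textcat", config=model_config)
```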
|
|
||||||
|
For details on all pre-defined architectures shipped with spaCy and how to
|
||||||
|
configure them, check out the [model architectures](/api/architectures)
|
||||||
|
documentation.
|
||||||
|
|
||||||
|
### Defining sublayers {#sublayers}
|
||||||
|
|
||||||
|
Model architecture functions often accept **sublayers as arguments**, so that
|
||||||
you can try **substituting a different layer** into the network. Depending on
|
you can try **substituting a different layer** into the network. Depending on
|
||||||
how the architecture function is structured, you might be able to define your
|
how the architecture function is structured, you might be able to define your
|
||||||
network structure entirely through the [config system](/usage/training#config),
|
network structure entirely through the [config system](/usage/training#config),
|
||||||
using layers that have already been defined. The
|
using layers that have already been defined.
|
||||||
[transformers documentation](/usage/embeddings-transformers#transformers)
|
|
||||||
section shows a common example of swapping in a different sublayer.
|
|
||||||
|
|
||||||
In most neural network models for NLP, the most important parts of the network
|
In most neural network models for NLP, the most important parts of the network
|
||||||
are what we refer to as the
|
are what we refer to as the
|
||||||
[embed and encode](https://explosion.ai/blog/embed-encode-attend-predict) steps.
|
[embed and encode](https://explosion.ai/blog/deep-learning-formula-nlp) steps.
|
||||||
These steps together compute dense, context-sensitive representations of the
|
These steps together compute dense, context-sensitive representations of the
|
||||||
tokens. Most of spaCy's default architectures accept a
|
tokens, and their combination forms a typical
|
||||||
[`tok2vec` embedding layer](/api/architectures#tok2vec-arch) as an argument, so
|
[`Tok2Vec`](/api/architectures#Tok2Vec) layer:
|
||||||
you can control this important part of the network separately. This makes it
|
|
||||||
easy to **switch between** transformer, CNN, BiLSTM or other feature extraction
|
|
||||||
approaches. And if you want to define your own solution, all you need to do is
|
|
||||||
register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
|
|
||||||
you'll be able to try it out in any of spaCy components.
|
|
||||||
|
|
||||||
<!-- TODO: example of switching sublayers -->
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
### Registering new architectures
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
- Recap concept, link to config docs.
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
# ...
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
By defining these sublayers specifically, it becomes straightforward to swap out
|
||||||
|
a sublayer for another one, for instance changing the first sublayer to a
|
||||||
|
character embedding with the [CharacterEmbed](/api/architectures#CharacterEmbed)
|
||||||
|
architecture:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.CharacterEmbed.v1"
|
||||||
|
# ...
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Most of spaCy's default architectures accept a `tok2vec` layer as a sublayer
|
||||||
|
within the larger task-specific neural network. This makes it easy to **switch
|
||||||
|
between** transformer, CNN, BiLSTM or other feature extraction approaches. The
|
||||||
|
[transformers documentation](/usage/embeddings-transformers#training-custom-model)
|
||||||
|
section shows an example of swapping out a model's standard `tok2vec` layer with
|
||||||
|
a transformer. And if you want to define your own solution, all you need to do
|
||||||
|
is register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
|
||||||
|
you'll be able to try it out in any of the spaCy components.
|
||||||
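As a rough illustration (not an official spaCy architecture), the sketch below
registers a deliberately trivial, non-trainable function with that signature:
it returns one zero vector per token, which is useless in practice, but it
shows the registration boilerplate and the expected input/output types. The
registry name `"dummy_tok2vec.v1"` is made up for this example.

```python
# Sketch: a toy tok2vec-style architecture with the expected type signature.
from typing import List

import spacy
from spacy.tokens import Doc
from thinc.api import Model
from thinc.types import Floats2d


@spacy.registry.architectures.register("dummy_tok2vec.v1")
def build_dummy_tok2vec(width: int) -> Model[List[Doc], List[Floats2d]]:
    def forward(model: Model, docs: List[Doc], is_train: bool):
        # One zero vector of size `width` per token in each Doc
        vectors = [model.ops.alloc2f(len(doc), width) for doc in docs]

        def backprop(d_vectors: List[Floats2d]) -> List[Doc]:
            # Nothing is learned here, so the gradient is discarded
            return docs

        return vectors, backprop

    return Model("dummy_tok2vec", forward)
```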
|
|
||||||
## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
|
## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
|
||||||
|
|
||||||
<!-- TODO: this is copied over from the Thinc docs and we probably want to shorten it and make it more spaCy-specific -->
|
Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks)
|
||||||
|
written in other machine learning frameworks like PyTorch, TensorFlow and MXNet
|
||||||
|
using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as
|
||||||
|
**wrapping whole models**, Thinc lets you call into an external framework for
|
||||||
|
just **part of your model**: you can have a model where you use PyTorch just for
|
||||||
|
the transformer layers, using "native" Thinc layers to do fiddly input and
|
||||||
|
output transformations and add on task-specific "heads", as efficiency is less
|
||||||
|
of a consideration for those parts of the network.
|
||||||
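As a brief sketch of what this looks like in practice (the module and layer
sizes below are arbitrary examples, not values used by any spaCy component):

```python
# Sketch: wrap a small PyTorch block as a Thinc Model and chain it with a
# "native" Thinc output layer acting as a task-specific head.
import torch.nn
from thinc.api import PyTorchWrapper, Softmax, chain

torch_block = torch.nn.Sequential(
    torch.nn.Linear(128, 128),
    torch.nn.ReLU(),
)
wrapped = PyTorchWrapper(torch_block)           # Model[Floats2d, Floats2d]
model = chain(wrapped, Softmax(nO=10, nI=128))  # add an output "head"
```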
|
|
||||||
Thinc allows you to wrap models written in other machine learning frameworks
|
<!-- TODO: custom tagger implemented in PyTorch, wrapped as Thinc model, link off to project (with notebook?) -->
|
||||||
like PyTorch, TensorFlow and MXNet using a unified
|
|
||||||
[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole
|
|
||||||
models**, Thinc lets you call into an external framework for just **part of your
|
|
||||||
model**: you can have a model where you use PyTorch just for the transformer
|
|
||||||
layers, using "native" Thinc layers to do fiddly input and output
|
|
||||||
transformations and add on task-specific "heads", as efficiency is less of a
|
|
||||||
consideration for those parts of the network.
|
|
||||||
|
|
||||||
Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to
|
## Implementing models in Thinc {#thinc}
|
||||||
hold references to external objects. This allows each wrapper space to define a
|
|
||||||
custom type, with whatever attributes and methods are helpful, to assist in
|
|
||||||
managing the communication between Thinc and the external library. The
|
|
||||||
[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in
|
|
||||||
a separate list, and communicates with the shims about updates, serialization,
|
|
||||||
changes of device, etc.
|
|
||||||
|
|
||||||
The wrapper will receive each batch of inputs, convert them into a suitable form
|
<!-- TODO: use same example as above, custom tagger, but implemented in Thinc, link off to Thinc docs where appropriate -->
|
||||||
for the underlying model instance, and pass them over to the shim, which will
|
|
||||||
**manage the actual communication** with the model. The output is then passed
|
|
||||||
back into the wrapper, and converted for use in the rest of the network. The
|
|
||||||
equivalent procedure happens during backpropagation. Array conversion is handled
|
|
||||||
via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so
|
|
||||||
that data can be passed between the frameworks **without copying the data back**
|
|
||||||
to the host device unnecessarily.
|
|
||||||
|
|
||||||
| Framework | Wrapper layer | Shim | DLPack |
|
|
||||||
| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- |
|
|
||||||
| **PyTorch** | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper) | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims) | ✅ |
|
|
||||||
| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ <sup>1</sup> |
|
|
||||||
| **MXNet** | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper) | [`MXNetShim`](https://thinc.ai/docs/api-model#shims) | ✅ |
|
|
||||||
|
|
||||||
1. DLPack support in TensorFlow is now
|
|
||||||
[available](https://github.com/tensorflow/tensorflow/issues/24453) but
|
|
||||||
still experimental.
|
|
||||||
|
|
||||||
<!-- TODO:
|
|
||||||
- Explain concept
|
|
||||||
- Link off to notebook
|
|
||||||
-->
|
|
||||||
|
|
||||||
## Models for trainable components {#components}
|
## Models for trainable components {#components}
|
||||||
|
|
||||||
|
<!-- TODO:
|
||||||
|
|
||||||
- Interaction with `predict`, `get_loss` and `set_annotations`
|
- Interaction with `predict`, `get_loss` and `set_annotations`
|
||||||
- Initialization life-cycle with `begin_training`.
|
- Initialization life-cycle with `begin_training`.
|
||||||
- Link to relation extraction notebook.
|
|
||||||
|
Example: relation extraction component (implemented as project template)
|
||||||
|
|
||||||
|
-->
|
||||||
|
|
||||||
|
![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def update(self, examples):
|
def update(self, examples):
|
||||||
|
|
|
@ -132,7 +132,7 @@ language can extend the `Lemmatizer` as part of its
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
# English models include a rule-based lemmatizer
|
# English pipelines include a rule-based lemmatizer
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
lemmatizer = nlp.get_pipe("lemmatizer")
|
lemmatizer = nlp.get_pipe("lemmatizer")
|
||||||
print(lemmatizer.mode) # 'rule'
|
print(lemmatizer.mode) # 'rule'
|
||||||
|
@ -156,14 +156,14 @@ component.
|
||||||
|
|
||||||
The data for spaCy's lemmatizers is distributed in the package
|
The data for spaCy's lemmatizers is distributed in the package
|
||||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
|
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
|
||||||
provided models already include all the required tables, but if you are creating
|
provided trained pipelines already include all the required tables, but if you
|
||||||
new models, you'll probably want to install `spacy-lookups-data` to provide the
|
are creating new pipelines, you'll probably want to install `spacy-lookups-data`
|
||||||
data when the lemmatizer is initialized.
|
to provide the data when the lemmatizer is initialized.
|
||||||
|
|
||||||
### Lookup lemmatizer {#lemmatizer-lookup}
|
### Lookup lemmatizer {#lemmatizer-lookup}
|
||||||
|
|
||||||
For models without a tagger or morphologizer, a lookup lemmatizer can be added
|
For pipelines without a tagger or morphologizer, a lookup lemmatizer can be
|
||||||
to the pipeline as long as a lookup table is provided, typically through
|
added to the pipeline as long as a lookup table is provided, typically through
|
||||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
|
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
|
||||||
lookup lemmatizer looks up the token surface form in the lookup table without
|
lookup lemmatizer looks up the token surface form in the lookup table without
|
||||||
reference to the token's part-of-speech or context.
|
reference to the token's part-of-speech or context.
|
||||||
|
@ -178,9 +178,9 @@ nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||||
|
|
||||||
### Rule-based lemmatizer {#lemmatizer-rule}
|
### Rule-based lemmatizer {#lemmatizer-rule}
|
||||||
|
|
||||||
When training models that include a component that assigns POS (a morphologizer
|
When training pipelines that include a component that assigns part-of-speech
|
||||||
or a tagger with a [POS mapping](#mappings-exceptions)), a rule-based lemmatizer
|
tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
|
||||||
can be added using rule tables from
|
rule-based lemmatizer can be added using rule tables from
|
||||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data):
|
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -366,10 +366,10 @@ sequence of tokens. You can walk up the tree with the
|
||||||
|
|
||||||
> #### Projective vs. non-projective
|
> #### Projective vs. non-projective
|
||||||
>
|
>
|
||||||
> For the [default English model](/models/en), the parse tree is **projective**,
|
> For the [default English pipelines](/models/en), the parse tree is
|
||||||
> which means that there are no crossing brackets. The tokens returned by
|
> **projective**, which means that there are no crossing brackets. The tokens
|
||||||
> `.subtree` are therefore guaranteed to be contiguous. This is not true for the
|
> returned by `.subtree` are therefore guaranteed to be contiguous. This is not
|
||||||
> German model, which has many
|
> true for the German pipelines, which have many
|
||||||
> [non-projective dependencies](https://explosion.ai/blog/german-model#word-order).
|
> [non-projective dependencies](https://explosion.ai/blog/german-model#word-order).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -497,26 +497,27 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
|
||||||
|
|
||||||
### Disabling the parser {#disabling}
|
### Disabling the parser {#disabling}
|
||||||
|
|
||||||
In the [default models](/models), the parser is loaded and enabled as part of
|
In the [trained pipelines](/models) provided by spaCy, the parser is loaded and
|
||||||
the [standard processing pipeline](/usage/processing-pipelines). If you don't
|
enabled by default as part of the
|
||||||
need any of the syntactic information, you should disable the parser. Disabling
|
[standard processing pipeline](/usage/processing-pipelines). If you don't need
|
||||||
the parser will make spaCy load and run much faster. If you want to load the
|
any of the syntactic information, you should disable the parser. Disabling the
|
||||||
parser, but need to disable it for specific documents, you can also control its
|
parser will make spaCy load and run much faster. If you want to load the parser,
|
||||||
use on the `nlp` object.
|
but need to disable it for specific documents, you can also control its use on
|
||||||
|
the `nlp` object. For more details, see the usage guide on
|
||||||
|
[disabling pipeline components](/usage/processing-pipelines/#disabling).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
nlp = spacy.load("en_core_web_sm", disable=["parser"])
|
nlp = spacy.load("en_core_web_sm", disable=["parser"])
|
||||||
nlp = English().from_disk("/model", disable=["parser"])
|
|
||||||
doc = nlp("I don't want parsed", disable=["parser"])
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Named Entity Recognition {#named-entities}
|
## Named Entity Recognition {#named-entities}
|
||||||
|
|
||||||
spaCy features an extremely fast statistical entity recognition system that
|
spaCy features an extremely fast statistical entity recognition system that
|
||||||
assigns labels to contiguous spans of tokens. The default model identifies a
|
assigns labels to contiguous spans of tokens. The default
|
||||||
variety of named and numeric entities, including companies, locations,
|
[trained pipelines](/models) can identify a variety of named and numeric
|
||||||
organizations and products. You can add arbitrary classes to the entity
|
entities, including companies, locations, organizations and products. You can
|
||||||
recognition system, and update the model with new examples.
|
add arbitrary classes to the entity recognition system, and update the model
|
||||||
|
with new examples.
|
||||||
|
|
||||||
### Named Entity Recognition 101 {#named-entities-101}
|
### Named Entity Recognition 101 {#named-entities-101}
|
||||||
|
|
||||||
|
@ -669,7 +670,7 @@ responsibility for ensuring that the data is left in a consistent state.
|
||||||
|
|
||||||
<Infobox title="Annotation scheme">
|
<Infobox title="Annotation scheme">
|
||||||
|
|
||||||
For details on the entity types available in spaCy's pretrained models, see the
|
For details on the entity types available in spaCy's trained pipelines, see the
|
||||||
"label scheme" sections of the individual models in the
|
"label scheme" sections of the individual models in the
|
||||||
[models directory](/models).
|
[models directory](/models).
|
||||||
|
|
||||||
|
@ -710,9 +711,8 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
|
||||||
To ground the named entities into the "real world", spaCy provides functionality
|
To ground the named entities into the "real world", spaCy provides functionality
|
||||||
to perform entity linking, which resolves a textual entity to a unique
|
to perform entity linking, which resolves a textual entity to a unique
|
||||||
identifier from a knowledge base (KB). You can create your own
|
identifier from a knowledge base (KB). You can create your own
|
||||||
[`KnowledgeBase`](/api/kb) and
|
[`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
|
||||||
[train a new Entity Linking model](/usage/training#entity-linker) using that
|
[`EntityLinker`](/api/entitylinker) using that custom knowledge base.
|
||||||
custom-made KB.
|
|
||||||
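For a rough idea of what that involves, here is a minimal sketch of an
in-memory knowledge base; the entity ID, frequency and vector values are
placeholders made up for this example:

```python
# Sketch: a tiny knowledge base with one entity and one alias.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q7259", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Ada Lovelace", entities=["Q7259"], probabilities=[1.0])
```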
|
|
||||||
### Accessing entity identifiers {#entity-linking-accessing model="entity linking"}
|
### Accessing entity identifiers {#entity-linking-accessing model="entity linking"}
|
||||||
|
|
||||||
|
@ -724,7 +724,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
|
||||||
```python
|
```python
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
nlp = spacy.load("my_custom_el_model")
|
nlp = spacy.load("my_custom_el_pipeline")
|
||||||
doc = nlp("Ada Lovelace was born in London")
|
doc = nlp("Ada Lovelace was born in London")
|
||||||
|
|
||||||
# Document level
|
# Document level
|
||||||
|
@ -1042,13 +1042,15 @@ function that behaves the same way.
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
If you're using a statistical model, writing to the
|
If you've loaded a trained pipeline, writing to the
|
||||||
[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
|
[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
|
||||||
work, since the regular expressions are read from the model and will be compiled
|
work, since the regular expressions are read from the pipeline data and will be
|
||||||
when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
|
compiled when you load it. If you modify `nlp.Defaults`, you'll only see the
|
||||||
you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
|
effect if you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to
|
||||||
tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
|
modify the tokenizer loaded from a trained pipeline, you should modify
|
||||||
directly.
|
`nlp.tokenizer` directly. If you're training your own pipeline, you can register
|
||||||
|
[callbacks](/usage/training/#custom-code-nlp-callbacks) to modify the `nlp`
|
||||||
|
object before training.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -1218,11 +1220,11 @@ print(doc.text, [token.text for token in doc])
|
||||||
|
|
||||||
<Infobox title="Important note on tokenization and models" variant="warning">
|
<Infobox title="Important note on tokenization and models" variant="warning">
|
||||||
|
|
||||||
Keep in mind that your model's result may be less accurate if the tokenization
|
Keep in mind that your models' results may be less accurate if the tokenization
|
||||||
during training differs from the tokenization at runtime. So if you modify a
|
during training differs from the tokenization at runtime. So if you modify a
|
||||||
pretrained model's tokenization afterwards, it may produce very different
|
trained pipeline's tokenization afterwards, it may produce very different
|
||||||
predictions. You should therefore train your model with the **same tokenizer**
|
predictions. You should therefore train your pipeline with the **same
|
||||||
it will be using at runtime. See the docs on
|
tokenizer** it will be using at runtime. See the docs on
|
||||||
[training with custom tokenization](#custom-tokenizer-training) for details.
|
[training with custom tokenization](#custom-tokenizer-training) for details.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@ -1231,7 +1233,7 @@ it will be using at runtime. See the docs on
|
||||||
|
|
||||||
spaCy's [training config](/usage/training#config) describes the settings,
|
spaCy's [training config](/usage/training#config) describes the settings,
|
||||||
hyperparameters, pipeline and tokenizer used for constructing and training the
|
hyperparameters, pipeline and tokenizer used for constructing and training the
|
||||||
model. The `[nlp.tokenizer]` block refers to a **registered function** that
|
pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
|
||||||
takes the `nlp` object and returns a tokenizer. Here, we're registering a
|
takes the `nlp` object and returns a tokenizer. Here, we're registering a
|
||||||
function called `whitespace_tokenizer` in the
|
function called `whitespace_tokenizer` in the
|
||||||
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
|
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
|
||||||
|
@ -1626,11 +1628,11 @@ spaCy provides four alternatives for sentence segmentation:
|
||||||
|
|
||||||
Unlike other libraries, spaCy uses the dependency parse to determine sentence
|
Unlike other libraries, spaCy uses the dependency parse to determine sentence
|
||||||
boundaries. This is usually the most accurate approach, but it requires a
|
boundaries. This is usually the most accurate approach, but it requires a
|
||||||
**statistical model** that provides accurate predictions. If your texts are
|
**trained pipeline** that provides accurate predictions. If your texts are
|
||||||
closer to general-purpose news or web text, this should work well out-of-the-box
|
closer to general-purpose news or web text, this should work well out-of-the-box
|
||||||
with spaCy's provided models. For social media or conversational text that
|
with spaCy's provided trained pipelines. For social media or conversational text
|
||||||
doesn't follow the same rules, your application may benefit from a custom model
|
that doesn't follow the same rules, your application may benefit from a custom
|
||||||
or rule-based component.
|
trained or rule-based component.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
@ -1652,8 +1654,8 @@ parses consistent with the sentence boundaries.
|
||||||
The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
|
The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
|
||||||
component that only provides sentence boundaries. Along with being faster and
|
component that only provides sentence boundaries. Along with being faster and
|
||||||
smaller than the parser, its primary advantage is that it's easier to train
|
smaller than the parser, its primary advantage is that it's easier to train
|
||||||
custom models because it only requires annotated sentence boundaries rather than
|
because it only requires annotated sentence boundaries rather than full
|
||||||
full dependency parses.
|
dependency parses.
|
||||||
|
|
||||||
<!-- TODO: update/confirm usage once we have final models trained -->
|
<!-- TODO: update/confirm usage once we have final models trained -->
|
||||||
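As a rough usage sketch, assuming the component keeps its factory name
`"senter"`:

```python
# Sketch: adding the SentenceRecognizer to a blank pipeline. It still needs
# to be trained on annotated sentence boundaries before it's useful.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")
print(nlp.pipe_names)  # ['senter']
```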
|
|
||||||
|
@ -1685,7 +1687,7 @@ need sentence boundaries without dependency parses.
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
nlp = English() # just the language with no model
|
nlp = English() # just the language with no pipeline
|
||||||
nlp.add_pipe("sentencizer")
|
nlp.add_pipe("sentencizer")
|
||||||
doc = nlp("This is a sentence. This is another sentence.")
|
doc = nlp("This is a sentence. This is another sentence.")
|
||||||
for sent in doc.sents:
|
for sent in doc.sents:
|
||||||
|
@ -1827,11 +1829,11 @@ or Tomas Mikolov's original
|
||||||
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
||||||
word vector libraries output an easy-to-read text-based format, where each line
|
word vector libraries output an easy-to-read text-based format, where each line
|
||||||
consists of the word followed by its vector. For everyday use, we want to
|
consists of the word followed by its vector. For everyday use, we want to
|
||||||
convert the vectors model into a binary format that loads faster and takes up
|
convert the vectors into a binary format that loads faster and takes up less
|
||||||
less space on disk. The easiest way to do this is the
|
space on disk. The easiest way to do this is the
|
||||||
[`init model`](/api/cli#init-model) command-line utility. This will output a
|
[`init vocab`](/api/cli#init-vocab) command-line utility. This will output a
|
||||||
spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to
|
blank spaCy pipeline in the directory `/tmp/la_vectors_wiki_lg`, giving you
|
||||||
some nice Latin vectors. You can then pass the directory path to
|
access to some nice Latin vectors. You can then pass the directory path to
|
||||||
[`spacy.load`](/api/top-level#spacy.load).
|
[`spacy.load`](/api/top-level#spacy.load).
|
||||||
|
|
||||||
> #### Usage example
|
> #### Usage example
|
||||||
|
@ -1845,7 +1847,7 @@ some nice Latin vectors. You can then pass the directory path to
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
$ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
||||||
$ python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
$ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
||||||
```
|
```
|
||||||
|
|
||||||
<Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
|
<Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
|
||||||
|
@ -1853,13 +1855,13 @@ $ python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.
|
||||||
To help you strike a good balance between coverage and memory usage, spaCy's
|
To help you strike a good balance between coverage and memory usage, spaCy's
|
||||||
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
||||||
row** of the table. If you're using the
|
row** of the table. If you're using the
|
||||||
[`spacy init model`](/api/cli#init-model) command to create a vocabulary,
|
[`spacy init vocab`](/api/cli#init-vocab) command to create a vocabulary,
|
||||||
pruning the vectors will be taken care of automatically if you set the
|
pruning the vectors will be taken care of automatically if you set the
|
||||||
`--prune-vectors` flag. You can also do it manually in the following steps:
|
`--prune-vectors` flag. You can also do it manually in the following steps:
|
||||||
|
|
||||||
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
1. Start with a **word vectors package** that covers a huge vocabulary. For
|
||||||
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
||||||
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
starter provides 300-dimensional GloVe vectors for over 1 million terms of
|
||||||
English.
|
English.
|
||||||
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
||||||
lexemes will be sorted by descending probability to determine which vectors
|
lexemes will be sorted by descending probability to determine which vectors
|
||||||
|
@ -1900,17 +1902,17 @@ the two words.
|
||||||
In the example above, the vector for "Shore" was removed and remapped to the
|
In the example above, the vector for "Shore" was removed and remapped to the
|
||||||
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
||||||
the vector of "leaving", which is identical. If you're using the
|
the vector of "leaving", which is identical. If you're using the
|
||||||
[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors`
|
[`init vocab`](/api/cli#init-vocab) command, you can set the `--prune-vectors`
|
||||||
option to easily reduce the size of the vectors as you add them to a spaCy
|
option to easily reduce the size of the vectors as you add them to a spaCy
|
||||||
model:
|
pipeline:
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy init model en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
$ python -m spacy init vocab en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
||||||
```
|
```
|
||||||
|
|
||||||
This will create a spaCy model with vectors for the first 10,000 words in the
|
This will create a blank spaCy pipeline with vectors for the first 10,000 words
|
||||||
vectors model. All other words in the vectors model are mapped to the closest
|
in the vectors. All other words in the vectors are mapped to the closest vector
|
||||||
vector among those retained.
|
among those retained.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
@ -1925,8 +1927,8 @@ possible. You can modify the vectors via the [`Vocab`](/api/vocab) or
|
||||||
if you have vectors in an arbitrary format, as you can read in the vectors with
|
if you have vectors in an arbitrary format, as you can read in the vectors with
|
||||||
your own logic, and just set them with a simple loop. This method is likely to
|
your own logic, and just set them with a simple loop. This method is likely to
|
||||||
be slower than approaches that work with the whole vectors table at once, but
|
be slower than approaches that work with the whole vectors table at once, but
|
||||||
it's a great approach for once-off conversions before you save out your model to
|
it's a great approach for once-off conversions before you save out your `nlp`
|
||||||
disk.
|
object to disk.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Adding vectors
|
### Adding vectors
|
||||||
|
@ -1978,14 +1980,14 @@ print(nlp2.lang, [token.is_stop for token in nlp2("custom stop")])
|
||||||
The [`@spacy.registry.languages`](/api/top-level#registry) decorator lets you
|
The [`@spacy.registry.languages`](/api/top-level#registry) decorator lets you
|
||||||
register a custom language class and assign it a string name. This means that
|
register a custom language class and assign it a string name. This means that
|
||||||
you can call [`spacy.blank`](/api/top-level#spacy.blank) with your custom
|
you can call [`spacy.blank`](/api/top-level#spacy.blank) with your custom
|
||||||
language name, and even train models with it and refer to it in your
|
language name, and even train pipelines with it and refer to it in your
|
||||||
[training config](/usage/training#config).
|
[training config](/usage/training#config).
|
||||||
|
|
||||||
> #### Config usage
|
> #### Config usage
|
||||||
>
|
>
|
||||||
> After registering your custom language class using the `languages` registry,
|
> After registering your custom language class using the `languages` registry,
|
||||||
> you can refer to it in your [training config](/usage/training#config). This
|
> you can refer to it in your [training config](/usage/training#config). This
|
||||||
> means spaCy will train your model using the custom subclass.
|
> means spaCy will train your pipeline using the custom subclass.
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [nlp]
|
> [nlp]
|
||||||
|
|
|
@ -8,25 +8,24 @@ menu:
|
||||||
- ['Production Use', 'production']
|
- ['Production Use', 'production']
|
||||||
---
|
---
|
||||||
|
|
||||||
spaCy's models can be installed as **Python packages**. This means that they're
|
spaCy's trained pipelines can be installed as **Python packages**. This means
|
||||||
a component of your application, just like any other module. They're versioned
|
that they're a component of your application, just like any other module.
|
||||||
and can be defined as a dependency in your `requirements.txt`. Models can be
|
They're versioned and can be defined as a dependency in your `requirements.txt`.
|
||||||
installed from a download URL or a local directory, manually or via
|
Trained pipelines can be installed from a download URL or a local directory,
|
||||||
[pip](https://pypi.python.org/pypi/pip). Their data can be located anywhere on
|
manually or via [pip](https://pypi.python.org/pypi/pip). Their data can be
|
||||||
your file system.
|
located anywhere on your file system.
|
||||||
|
|
||||||
> #### Important note
|
> #### Important note
|
||||||
>
|
>
|
||||||
> If you're upgrading to spaCy v3.x, you need to **download the new models**. If
|
> If you're upgrading to spaCy v3.x, you need to **download the new pipeline
|
||||||
> you've trained statistical models that use spaCy's annotations, you should
|
> packages**. If you've trained your own pipelines, you need to **retrain** them
|
||||||
> **retrain your models** after updating spaCy. If you don't retrain, you may
|
> after updating spaCy.
|
||||||
> suffer train/test skew, which might decrease your accuracy.
|
|
||||||
|
|
||||||
## Quickstart {hidden="true"}
|
## Quickstart {hidden="true"}
|
||||||
|
|
||||||
import QuickstartModels from 'widgets/quickstart-models.js'
|
import QuickstartModels from 'widgets/quickstart-models.js'
|
||||||
|
|
||||||
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below." />
|
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default trained pipeline package, get the code to load it from within spaCy and an example to test it. For more options, see the section on available packages below." />
|
||||||
|
|
||||||
## Language support {#languages}
|
## Language support {#languages}
|
||||||
|
|
||||||
|
@ -34,14 +33,14 @@ spaCy currently provides support for the following languages. You can help by
|
||||||
[improving the existing language data](/usage/adding-languages#language-data)
|
[improving the existing language data](/usage/adding-languages#language-data)
|
||||||
and extending the tokenization patterns.
|
and extending the tokenization patterns.
|
||||||
[See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
|
[See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
|
||||||
contribute to model development.
|
contribute to development.
|
||||||
|
|
||||||
> #### Usage note
|
> #### Usage note
|
||||||
>
|
>
|
||||||
> If a model is available for a language, you can download it using the
|
> If a trained pipeline is available for a language, you can download it using
|
||||||
> [`spacy download`](/api/cli#download) command. In order to use languages that
|
> the [`spacy download`](/api/cli#download) command. In order to use languages
|
||||||
> don't yet come with a model, you have to import them directly, or use
|
> that don't yet come with a trained pipeline, you have to import them directly,
|
||||||
> [`spacy.blank`](/api/top-level#spacy.blank):
|
> or use [`spacy.blank`](/api/top-level#spacy.blank):
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lang.fi import Finnish
|
> from spacy.lang.fi import Finnish
|
||||||
|
@ -73,13 +72,13 @@ import Languages from 'widgets/languages.js'
|
||||||
> nlp = spacy.blank("xx")
|
> nlp = spacy.blank("xx")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
spaCy also supports models trained on more than one language. This is especially
|
spaCy also supports pipelines trained on more than one language. This is
|
||||||
useful for named entity recognition. The language ID used for multi-language or
|
especially useful for named entity recognition. The language ID used for
|
||||||
language-neutral models is `xx`. The language class, a generic subclass
|
multi-language or language-neutral pipelines is `xx`. The language class, a
|
||||||
containing only the base language data, can be found in
|
generic subclass containing only the base language data, can be found in
|
||||||
[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx).
|
[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx).
|
||||||
|
|
||||||
To train a model using the neutral multi-language class, you can set
|
To train a pipeline using the neutral multi-language class, you can set
|
||||||
`lang = "xx"` in your [training config](/usage/training#config). You can also
|
`lang = "xx"` in your [training config](/usage/training#config). You can also
|
||||||
import the `MultiLanguage` class directly, or call
|
import the `MultiLanguage` class directly, or call
|
||||||
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
|
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
|
||||||
|
@ -111,7 +110,7 @@ The Chinese language class supports three word segmentation options:
|
||||||
3. **PKUSeg**: As of spaCy v2.3.0, support for
|
3. **PKUSeg**: As of spaCy v2.3.0, support for
|
||||||
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
|
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
|
||||||
better segmentation for Chinese OntoNotes and the provided
|
better segmentation for Chinese OntoNotes and the provided
|
||||||
[Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
|
[Chinese pipelines](/models/zh). Enable PKUSeg with the tokenizer option
|
||||||
`{"segmenter": "pkuseg"}`.
|
`{"segmenter": "pkuseg"}`.
|
||||||
|
|
||||||
<Infobox variant="warning">
|
<Infobox variant="warning">
|
||||||
|
@ -169,9 +168,9 @@ nlp.tokenizer.pkuseg_update_user_dict([], reset=True)
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
<Accordion title="Details on pretrained and custom Chinese models" spaced>
|
<Accordion title="Details on trained and custom Chinese pipelines" spaced>
|
||||||
|
|
||||||
The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg`
|
The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
|
||||||
model trained only on
|
model trained only on
|
||||||
[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
|
[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
|
||||||
models provided by `pkuseg` include data restricted to research use. For
|
models provided by `pkuseg` include data restricted to research use. For
|
||||||
|
@ -208,29 +207,29 @@ nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_mo
|
||||||
The Japanese language class uses
|
The Japanese language class uses
|
||||||
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
|
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
|
||||||
segmentation and part-of-speech tagging. The default Japanese language class and
|
segmentation and part-of-speech tagging. The default Japanese language class and
|
||||||
the provided Japanese models use SudachiPy split mode `A`. The `meta` argument
|
the provided Japanese pipelines use SudachiPy split mode `A`. The `meta`
|
||||||
of the `Japanese` language class can be used to configure the split mode to `A`,
|
argument of the `Japanese` language class can be used to configure the split
|
||||||
`B` or `C`.
|
mode to `A`, `B` or `C`.
|
||||||
|
|
||||||
<Infobox variant="warning">
|
<Infobox variant="warning">
|
||||||
|
|
||||||
If you run into errors related to `sudachipy`, which is currently under active
|
If you run into errors related to `sudachipy`, which is currently under active
|
||||||
development, we suggest downgrading to `sudachipy==0.4.5`, which is the version
|
development, we suggest downgrading to `sudachipy==0.4.5`, which is the version
|
||||||
used for training the current [Japanese models](/models/ja).
|
used for training the current [Japanese pipelines](/models/ja).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
## Installing and using models {#download}
|
## Installing and using trained pipelines {#download}
|
||||||
|
|
||||||
The easiest way to download a model is via spaCy's
|
The easiest way to download a trained pipeline is via spaCy's
|
||||||
[`download`](/api/cli#download) command. It takes care of finding the
|
[`download`](/api/cli#download) command. It takes care of finding the
|
||||||
best-matching model compatible with your spaCy installation.
|
best-matching package compatible with your spaCy installation.
|
||||||
|
|
||||||
> #### Important note for v3.0
|
> #### Important note for v3.0
|
||||||
>
|
>
|
||||||
> Note that as of spaCy v3.0, model shortcut links that create (potentially
|
> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
|
||||||
> brittle) symlinks in your spaCy installation are **deprecated**. To download
|
> brittle) symlinks in your spaCy installation are **deprecated**. To download
|
||||||
> and load an installed model, use its full name:
|
> and load an installed pipeline package, use its full name:
|
||||||
>
|
>
|
||||||
> ```diff
|
> ```diff
|
||||||
> - python -m spacy download en
|
> - python -m spacy download en
|
||||||
|
@ -243,14 +242,14 @@ best-matching model compatible with your spaCy installation.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
# Download best-matching version of a model for your spaCy installation
|
# Download best-matching version of a package for your spaCy installation
|
||||||
$ python -m spacy download en_core_web_sm
|
$ python -m spacy download en_core_web_sm
|
||||||
|
|
||||||
# Download exact model version
|
# Download exact package version
|
||||||
$ python -m spacy download en_core_web_sm-3.0.0 --direct
|
$ python -m spacy download en_core_web_sm-3.0.0 --direct
|
||||||
```
|
```
|
||||||
|
|
||||||
The download command will [install the model](/usage/models#download-pip) via
|
The download command will [install the package](/usage/models#download-pip) via
|
||||||
pip and place the package in your `site-packages` directory.
|
pip and place the package in your `site-packages` directory.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
|
@ -266,11 +265,11 @@ doc = nlp("This is a sentence.")
|
||||||
|
|
||||||
### Installation via pip {#download-pip}
|
### Installation via pip {#download-pip}
|
||||||
|
|
||||||
To download a model directly using [pip](https://pypi.python.org/pypi/pip),
|
To download a trained pipeline directly using
|
||||||
point `pip install` to the URL or local path of the archive file. To find the
|
[pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local
|
||||||
direct link to a model, head over to the
|
path of the archive file. To find the direct link to a package, head over to the
|
||||||
[model releases](https://github.com/explosion/spacy-models/releases), right
|
[releases](https://github.com/explosion/spacy-models/releases), right click on
|
||||||
click on the archive link and copy it to your clipboard.
|
the archive link and copy it to your clipboard.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# With external URL
|
# With external URL
|
||||||
|
@ -280,60 +279,61 @@ $ pip install https://github.com/explosion/spacy-models/releases/download/en_cor
|
||||||
$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
|
$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
|
||||||
```
|
```
|
||||||
|
|
||||||
By default, this will install the model into your `site-packages` directory. You
|
By default, this will install the pipeline package into your `site-packages`
|
||||||
can then use `spacy.load()` to load it via its package name or
|
directory. You can then use `spacy.load` to load it via its package name or
|
||||||
[import it](#usage-import) explicitly as a module. If you need to download
|
[import it](#usage-import) explicitly as a module. If you need to download
|
||||||
models as part of an automated process, we recommend using pip with a direct
|
pipeline packages as part of an automated process, we recommend using pip with a
|
||||||
link, instead of relying on spaCy's [`download`](/api/cli#download) command.
|
direct link, instead of relying on spaCy's [`download`](/api/cli#download)
|
||||||
|
command.
|
||||||
|
|
||||||
You can also add the direct download link to your application's
|
You can also add the direct download link to your application's
|
||||||
`requirements.txt`. For more details, see the section on
|
`requirements.txt`. For more details, see the section on
|
||||||
[working with models in production](#production).
|
[working with pipeline packages in production](#production).
|
||||||
|
|
||||||
### Manual download and installation {#download-manual}
|
### Manual download and installation {#download-manual}
|
||||||
|
|
||||||
In some cases, you might prefer downloading the data manually, for example to
|
In some cases, you might prefer downloading the data manually, for example to
|
||||||
place it into a custom directory. You can download the model via your browser
|
place it into a custom directory. You can download the package via your browser
|
||||||
from the [latest releases](https://github.com/explosion/spacy-models/releases),
|
from the [latest releases](https://github.com/explosion/spacy-models/releases),
|
||||||
or configure your own download script using the URL of the archive file. The
|
or configure your own download script using the URL of the archive file. The
|
||||||
archive consists of a model directory that contains another directory with the
|
archive consists of a package directory that contains another directory with the
|
||||||
model data.
|
pipeline data.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
### Directory structure {highlight="6"}
|
### Directory structure {highlight="6"}
|
||||||
└── en_core_web_md-3.0.0.tar.gz # downloaded archive
|
└── en_core_web_md-3.0.0.tar.gz # downloaded archive
|
||||||
├── setup.py # setup file for pip installation
|
├── setup.py # setup file for pip installation
|
||||||
├── meta.json # copy of model meta
|
├── meta.json # copy of pipeline meta
|
||||||
└── en_core_web_md # 📦 model package
|
└── en_core_web_md # 📦 pipeline package
|
||||||
├── __init__.py # init for pip installation
|
├── __init__.py # init for pip installation
|
||||||
└── en_core_web_md-3.0.0 # model data
|
└── en_core_web_md-3.0.0 # pipeline data
|
||||||
├── config.cfg # model config
|
├── config.cfg # pipeline config
|
||||||
├── meta.json # model meta
|
├── meta.json # pipeline meta
|
||||||
└── ... # directories with component data
|
└── ... # directories with component data
|
||||||
```
|
```
|
||||||
|
|
||||||
You can place the **model package directory** anywhere on your local file
|
You can place the **pipeline package directory** anywhere on your local file
|
||||||
system.
|
system.
|
||||||
|
|
||||||
### Using models with spaCy {#usage}
|
### Using trained pipelines with spaCy {#usage}
|
||||||
|
|
||||||
To load a model, use [`spacy.load`](/api/top-level#spacy.load) with the model's
|
To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with
|
||||||
package name or a path to the data directory:
|
the package name or a path to the data directory:
|
||||||
|
|
||||||
> #### Important note for v3.0
|
> #### Important note for v3.0
|
||||||
>
|
>
|
||||||
> Note that as of spaCy v3.0, model shortcut links that create (potentially
|
> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
|
||||||
> brittle) symlinks in your spaCy installation are **deprecated**. To load an
|
> brittle) symlinks in your spaCy installation are **deprecated**. To download
|
||||||
> installed model, use its full name:
|
> and load an installed pipeline package, use its full name:
|
||||||
>
|
>
|
||||||
> ```diff
|
> ```diff
|
||||||
> - nlp = spacy.load("en")
|
> - python -m spacy download en
|
||||||
> + nlp = spacy.load("en_core_web_sm")
|
> + python -m spacy download en_core_web_sm
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import spacy
|
import spacy
|
||||||
nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_sm"
|
nlp = spacy.load("en_core_web_sm") # load package "en_core_web_sm"
|
||||||
nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
|
nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
|
||||||
|
|
||||||
doc = nlp("This is a sentence.")
|
doc = nlp("This is a sentence.")
|
||||||
|
@ -342,17 +342,18 @@ doc = nlp("This is a sentence.")
|
||||||
<Infobox title="Tip: Preview model info" emoji="💡">

You can use the [`info`](/api/cli#info) command or
-[`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data
-before loading it. Each `Language` object with a loaded model also exposes the
-model's meta data as the attribute `meta`. For example, `nlp.meta['version']`
-will return the model's version.
+[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline
+package's meta data before loading it. Each `Language` object with a loaded
+pipeline also exposes the pipeline's meta data as the attribute `meta`. For
+example, `nlp.meta['version']` will return the package version.

</Infobox>
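As a small illustration of the tip above (assuming `en_core_web_sm` is installed), the meta data can be inspected both before and after loading:

```python
# Sketch: preview meta data before loading, then read it from the nlp object.
# Assumes the en_core_web_sm package is installed; spacy.info() can also be
# called without arguments for general installation info.
import spacy

meta = spacy.info("en_core_web_sm")  # meta data of the installed package
nlp = spacy.load("en_core_web_sm")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
```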
-### Importing models as modules {#usage-import}
+### Importing pipeline packages as modules {#usage-import}

-If you've installed a model via spaCy's downloader, or directly via pip, you can
-also `import` it and then call its `load()` method with no arguments:
+If you've installed a trained pipeline via [`spacy download`](/api/cli#download)
+or directly via pip, you can also `import` it and then call its `load()` method
+with no arguments:

```python
### {executable="true"}
@@ -362,51 +363,38 @@ nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
```

-How you choose to load your models ultimately depends on personal preference.
-However, **for larger code bases**, we usually recommend native imports, as this
-will make it easier to integrate models with your existing build process,
-continuous integration workflow and testing framework. It'll also prevent you
-from ever trying to load a model that is not installed, as your code will raise
-an `ImportError` immediately, instead of failing somewhere down the line when
-calling `spacy.load()`.
+How you choose to load your trained pipelines ultimately depends on personal
+preference. However, **for larger code bases**, we usually recommend native
+imports, as this will make it easier to integrate pipeline packages with your
+existing build process, continuous integration workflow and testing framework.
+It'll also prevent you from ever trying to load a package that is not installed,
+as your code will raise an `ImportError` immediately, instead of failing
+somewhere down the line when calling `spacy.load()`. For more details, see the
+section on [working with pipeline packages in production](#production).
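To make the fail-fast behavior concrete, a guarded native import along these lines (the package name is just an example) surfaces a missing dependency at startup rather than at the first `spacy.load()` call:

```python
# Sketch: a missing package raises ImportError at import time, so a broken
# environment fails at startup instead of somewhere down the line.
try:
    import en_core_web_sm
except ImportError as err:
    raise SystemExit(
        "en_core_web_sm is not installed; add it to your project's requirements"
    ) from err

nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
```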
-For more details, see the section on
-[working with models in production](#production).
-
-### Using your own models {#own-models}
-
-If you've trained your own model, for example for
-[additional languages](/usage/adding-languages) or
-[custom named entities](/usage/training#ner), you can save its state using the
-[`Language.to_disk()`](/api/language#to_disk) method. To make the model more
-convenient to deploy, we recommend wrapping it as a Python package.
-
-For more information and a detailed guide on how to package your model, see the
-documentation on [saving and loading models](/usage/saving-loading#models).
-
-## Using models in production {#production}
-
-If your application depends on one or more models, you'll usually want to
-integrate them into your continuous integration workflow and build process.
-While spaCy provides a range of useful helpers for downloading, linking and
-loading models, the underlying functionality is entirely based on native Python
-packages. This allows your application to handle a model like any other package
-dependency.
+## Using trained pipelines in production {#production}
+
+If your application depends on one or more trained pipeline packages, you'll
+usually want to integrate them into your continuous integration workflow and
+build process. While spaCy provides a range of useful helpers for downloading
+and loading pipeline packages, the underlying functionality is entirely based on
+native Python packaging. This allows your application to handle a spaCy pipeline
+like any other package dependency.
<!-- TODO: reference relevant spaCy project -->

-### Downloading and requiring model dependencies {#models-download}
+### Downloading and requiring package dependencies {#models-download}

spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a
convenient, interactive wrapper. It performs compatibility checks and prints
-detailed error messages and warnings. However, if you're downloading models as
-part of an automated build process, this only adds an unnecessary layer of
-complexity. If you know which models your application needs, you should be
-specifying them directly.
+detailed error messages and warnings. However, if you're downloading pipeline
+packages as part of an automated build process, this only adds an unnecessary
+layer of complexity. If you know which packages your application needs, you
+should be specifying them directly.

-Because all models are valid Python packages, you can add them to your
+Because pipeline packages are valid Python packages, you can add them to your
application's `requirements.txt`. If you're running your own internal PyPi
-installation, you can upload the models there. pip's
+installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format)
supports both package names to download via a PyPi server, as well as direct
URLs.

@@ -422,17 +410,17 @@ the download URL. This way, the package won't be re-downloaded and overwritten
if it's already installed - just like when you're downloading a package from
PyPi.
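One way to treat pipeline packages like any other dependency is to verify the installed versions at build or startup time. A minimal sketch, with example package names and versions:

```python
# Sketch: check that the pinned pipeline packages are installed in the
# expected versions. Names and versions here are examples only.
from importlib.metadata import PackageNotFoundError, version

PINNED = {"en_core_web_sm": "3.0.0"}

for package, expected in PINNED.items():
    try:
        installed = version(package)
    except PackageNotFoundError as err:
        raise SystemExit(f"{package} is missing; install it via requirements.txt") from err
    if installed != expected:
        raise SystemExit(f"{package}=={installed} installed, but {expected} is required")
```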
-All models are versioned and specify their spaCy dependency. This ensures
-cross-compatibility and lets you specify exact version requirements for each
-model. If you've trained your own model, you can use the
-[`package`](/api/cli#package) command to generate the required meta data and
-turn it into a loadable package.
+All pipeline packages are versioned and specify their spaCy dependency. This
+ensures cross-compatibility and lets you specify exact version requirements for
+each pipeline. If you've [trained](/usage/training) your own pipeline, you can
+use the [`spacy package`](/api/cli#package) command to generate the required
+meta data and turn it into a loadable package.

-### Loading and testing models {#models-loading}
+### Loading and testing pipeline packages {#models-loading}

-Models are regular Python packages, so you can also import them as a package
-using Python's native `import` syntax, and then call the `load` method to load
-the model data and return an `nlp` object:
+Pipeline packages are regular Python packages, so you can also import them as a
+package using Python's native `import` syntax, and then call the `load` method
+to load the data and return an `nlp` object:

```python
import en_core_web_sm
@@ -440,16 +428,17 @@ nlp = en_core_web_sm.load()
```
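Packaging a trained pipeline can also be scripted around the documented [`spacy package`](/api/cli#package) command, for example from a build script. The input and output paths below are placeholders:

```python
# Sketch: call the spacy package CLI from a build script. The paths are
# placeholders for your own training output and package directory.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "spacy", "package", "./training/model-best", "./packages"],
    check=True,
)
```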
In general, this approach is recommended for larger code bases, as it's more
-"native", and doesn't depend on symlinks or rely on spaCy's loader to resolve
-string names to model packages. If a model can't be imported, Python will raise
-an `ImportError` immediately. And if a model is imported but not used, any
-linter will catch that.
+"native", and doesn't rely on spaCy's loader to resolve string names to
+packages. If a package can't be imported, Python will raise an `ImportError`
+immediately. And if a package is imported but not used, any linter will catch
+that.

Similarly, it'll give you more flexibility when writing tests that require
-loading models. For example, instead of writing your own `try` and `except`
+loading pipelines. For example, instead of writing your own `try` and `except`
logic around spaCy's loader, you can use
[pytest](http://pytest.readthedocs.io/en/latest/)'s
[`importorskip()`](https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip)
-method to only run a test if a specific model or model version is installed.
-Each model package exposes a `__version__` attribute which you can also use to
-perform your own version compatibility checks before loading a model.
+method to only run a test if a specific pipeline package or version is
+installed. Each pipeline package exposes a `__version__` attribute which you can
+also use to perform your own version compatibility checks before loading it.
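A test-suite sketch of that pattern, with an example package name and minimum version:

```python
# test_pipeline.py - sketch: skip the whole module if the package is missing,
# or if its __version__ is below the minimum version the tests target.
import pytest

en_core_web_sm = pytest.importorskip("en_core_web_sm", minversion="3.0.0")


def test_pipeline_loads():
    nlp = en_core_web_sm.load()
    doc = nlp("This is a sentence.")
    assert len(doc) > 0
```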