mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-22 14:30:32 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
12c1be9438
35
Makefile
35
Makefile
|
@ -1,11 +1,15 @@
|
|||
SHELL := /bin/bash
|
||||
PYVER := 3.6
|
||||
VENV := ./env$(PYVER)
|
||||
|
||||
ifndef SPACY_EXTRAS
|
||||
override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
||||
endif
|
||||
|
||||
ifndef PYVER
|
||||
override PYVER = 3.6
|
||||
endif
|
||||
|
||||
VENV := ./env$(PYVER)
|
||||
|
||||
version := $(shell "bin/get-version.sh")
|
||||
package := $(shell "bin/get-package.sh")
|
||||
|
||||
|
@ -13,9 +17,15 @@ ifndef SPACY_BIN
|
|||
override SPACY_BIN = $(package)-$(version).pex
|
||||
endif
|
||||
|
||||
dist/$(SPACY_BIN) : wheelhouse/spacy-$(version).stamp
|
||||
ifndef WHEELHOUSE
|
||||
override WHEELHOUSE = "./wheelhouse"
|
||||
endif
|
||||
|
||||
|
||||
|
||||
dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
|
||||
$(VENV)/bin/pex \
|
||||
-f ./wheelhouse \
|
||||
-f $(WHEELHOUSE) \
|
||||
--no-index \
|
||||
--disable-cache \
|
||||
-m spacy \
|
||||
|
@ -25,22 +35,23 @@ dist/$(SPACY_BIN) : wheelhouse/spacy-$(version).stamp
|
|||
chmod a+rx $@
|
||||
cp $@ dist/spacy.pex
|
||||
|
||||
dist/pytest.pex : wheelhouse/pytest-*.whl
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock
|
||||
dist/pytest.pex : $(WHEELHOUSE)/pytest-*.whl
|
||||
$(VENV)/bin/pex -f $(WHEELHOUSE) --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock
|
||||
chmod a+rx $@
|
||||
|
||||
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||
$(VENV)/bin/pip wheel . -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel $(SPACY_EXTRAS) -w ./wheelhouse
|
||||
$(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||
$(VENV)/bin/pip wheel . -w $(WHEELHOUSE)
|
||||
$(VENV)/bin/pip wheel $(SPACY_EXTRAS) -w $(WHEELHOUSE)
|
||||
|
||||
touch $@
|
||||
|
||||
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
|
||||
$(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse
|
||||
$(WHEELHOUSE)/pytest-%.whl : $(VENV)/bin/pex
|
||||
$(VENV)/bin/pip wheel pytest pytest-timeout mock -w $(WHEELHOUSE)
|
||||
|
||||
$(VENV)/bin/pex :
|
||||
python$(PYVER) -m venv $(VENV)
|
||||
$(VENV)/bin/pip install -U pip setuptools pex wheel
|
||||
$(VENV)/bin/pip install numpy
|
||||
|
||||
.PHONY : clean test
|
||||
|
||||
|
@ -50,6 +61,6 @@ test : dist/spacy-$(version).pex dist/pytest.pex
|
|||
|
||||
clean : setup.py
|
||||
rm -rf dist/*
|
||||
rm -rf ./wheelhouse
|
||||
rm -rf $(WHEELHOUSE)/*
|
||||
rm -rf $(VENV)
|
||||
python setup.py clean --all
|
||||
|
|
|
@ -36,7 +36,7 @@ max_length = 0
|
|||
limit = 0
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_words.v1"
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ max_length = 0
|
|||
limit = 0
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_words.v1"
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ redirects = [
|
|||
{from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
|
||||
{from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||
{from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
|
||||
{from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
|
||||
{from = "/docs/usage/training-ner", to = "/usage/training", force = true},
|
||||
{from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
|
||||
{from = "/docs/usage/data-model", to = "/api", force = true},
|
||||
{from = "/docs/usage/cli", to = "/api/cli", force = true},
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a12"
|
||||
__version__ = "3.0.0a13"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -29,9 +29,9 @@ from .project.document import project_document # noqa: F401
|
|||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
def link(*args, **kwargs):
|
||||
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
|
||||
using their full names or from a directory path."""
|
||||
"""As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
|
||||
pipeline packages using their full names or from a directory path."""
|
||||
msg.warn(
|
||||
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
|
||||
"using their full names or from a directory path."
|
||||
"As of spaCy v3.0, model symlinks are deprecated. You can load trained "
|
||||
"pipeline packages using their full names or from a directory path."
|
||||
)
|
||||
|
|
|
@ -25,7 +25,7 @@ COMMAND = "python -m spacy"
|
|||
NAME = "spacy"
|
||||
HELP = """spaCy Command-line Interface
|
||||
|
||||
DOCS: https://spacy.io/api/cli
|
||||
DOCS: https://nightly.spacy.io/api/cli
|
||||
"""
|
||||
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
|
||||
You'd typically start by cloning a project template to a local directory and
|
||||
|
@ -36,7 +36,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
|||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
INIT_HELP = """Commands for initializing configs and models."""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
|
@ -297,9 +297,7 @@ def ensure_pathy(path):
|
|||
return Pathy(path)
|
||||
|
||||
|
||||
def git_sparse_checkout(
|
||||
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
|
||||
):
|
||||
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
|
||||
if dest.exists():
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
|
@ -323,21 +321,30 @@ def git_sparse_checkout(
|
|||
# This is the "clone, but don't download anything" part.
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
"--filter=blob:none" # <-- The key bit
|
||||
f"--filter=blob:none " # <-- The key bit
|
||||
f"-b {branch}"
|
||||
)
|
||||
if branch is not None:
|
||||
cmd = f"{cmd} -b {branch}"
|
||||
run_command(cmd, capture=True)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
repo = _from_http_to_git(repo)
|
||||
# Now pass those missings into another bit of git internals
|
||||
run_command(
|
||||
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||
)
|
||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
|
||||
run_command(cmd, capture=True)
|
||||
# And finally, we can checkout our subpath
|
||||
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
run_command(cmd)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
|
||||
|
||||
def _from_http_to_git(repo):
|
||||
if repo.startswith("http://"):
|
||||
repo = repo.replace(r"http://", r"https://")
|
||||
if repo.startswith(r"https://"):
|
||||
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
||||
repo = f"{repo}.git"
|
||||
return repo
|
||||
|
|
|
@ -44,7 +44,7 @@ def convert_cli(
|
|||
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
||||
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
||||
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
||||
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
|
||||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
|
@ -61,6 +61,8 @@ def convert_cli(
|
|||
If no output_dir is specified and the output format is JSON, the data
|
||||
is written to stdout, so you can pipe them forward to a JSON file:
|
||||
$ spacy convert some_file.conllu --file-type json > some_file.json
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#convert
|
||||
"""
|
||||
if isinstance(file_type, FileTypes):
|
||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||
|
@ -261,6 +263,6 @@ def _get_converter(msg, converter, input_path):
|
|||
msg.warn(
|
||||
"Can't automatically detect NER format. "
|
||||
"Conversion may not succeed. "
|
||||
"See https://spacy.io/api/cli#convert"
|
||||
"See https://nightly.spacy.io/api/cli#convert"
|
||||
)
|
||||
return converter
|
||||
|
|
|
@ -31,6 +31,8 @@ def debug_config_cli(
|
|||
Similar as with the 'train' command, you can override settings from the config
|
||||
as command line options. For instance, --training.batch_size 128 overrides
|
||||
the value of "batch_size" in the block "[training]".
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#debug-config
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
|||
NEW_LABEL_THRESHOLD = 50
|
||||
# Minimum number of expected occurrences of dependency labels
|
||||
DEP_LABEL_THRESHOLD = 20
|
||||
# Minimum number of expected examples to train a blank model
|
||||
# Minimum number of expected examples to train a new pipeline
|
||||
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||
BLANK_MODEL_THRESHOLD = 2000
|
||||
|
||||
|
@ -47,6 +47,8 @@ def debug_data_cli(
|
|||
Analyze, debug and validate your training and development data. Outputs
|
||||
useful stats, and can help you find problems like invalid entity annotations,
|
||||
cyclic dependencies, low data labels and more.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#debug-data
|
||||
"""
|
||||
if ctx.command.name == "debug-data":
|
||||
msg.warn(
|
||||
|
@ -148,7 +150,7 @@ def debug_data(
|
|||
msg.text(f"Language: {config['nlp']['lang']}")
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
if resume_components:
|
||||
msg.text(f"Components from other models: {', '.join(resume_components)}")
|
||||
msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
|
||||
if frozen_components:
|
||||
msg.text(f"Frozen components: {', '.join(frozen_components)}")
|
||||
msg.text(f"{len(train_dataset)} training docs")
|
||||
|
@ -164,9 +166,7 @@ def debug_data(
|
|||
# TODO: make this feedback more fine-grained and report on updated
|
||||
# components vs. blank components
|
||||
if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
||||
text = (
|
||||
f"Low number of examples to train from a blank model ({len(train_dataset)})"
|
||||
)
|
||||
text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
|
||||
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
msg.fail(text)
|
||||
else:
|
||||
|
@ -214,7 +214,7 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.info("No word vectors present in the model")
|
||||
msg.info("No word vectors present in the package")
|
||||
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
|
|
|
@ -30,6 +30,8 @@ def debug_model_cli(
|
|||
"""
|
||||
Analyze a Thinc model implementation. Includes checks for internal structure
|
||||
and activations during training.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#debug-model
|
||||
"""
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
|
|
|
@ -17,16 +17,19 @@ from ..errors import OLD_MODEL_SHORTCUTS
|
|||
def download_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Name of model to download"),
|
||||
model: str = Arg(..., help="Name of pipeline package to download"),
|
||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Download compatible model from default download path using pip. If --direct
|
||||
flag is set, the command expects the full model name with version.
|
||||
For direct downloads, the compatibility check will be skipped. All
|
||||
Download compatible trained pipeline from the default download path using
|
||||
pip. If --direct flag is set, the command expects the full package name with
|
||||
version. For direct downloads, the compatibility check will be skipped. All
|
||||
additional arguments provided to this command will be passed to `pip install`
|
||||
on model installation.
|
||||
on package installation.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#download
|
||||
AVAILABLE PACKAGES: https://spacy.io/models
|
||||
"""
|
||||
download(model, direct, *ctx.args)
|
||||
|
||||
|
@ -34,11 +37,11 @@ def download_cli(
|
|||
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||
msg.warn(
|
||||
"Skipping model package dependencies and setting `--no-deps`. "
|
||||
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
||||
"You don't seem to have the spaCy package itself installed "
|
||||
"(maybe because you've built from source?), so installing the "
|
||||
"model dependencies would cause spaCy to be downloaded, which "
|
||||
"probably isn't what you want. If the model package has other "
|
||||
"package dependencies would cause spaCy to be downloaded, which "
|
||||
"probably isn't what you want. If the pipeline package has other "
|
||||
"dependencies, you'll have to install them manually."
|
||||
)
|
||||
pip_args = pip_args + ("--no-deps",)
|
||||
|
@ -53,7 +56,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
|||
if model in OLD_MODEL_SHORTCUTS:
|
||||
msg.warn(
|
||||
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
|
||||
f"use the full model name '{OLD_MODEL_SHORTCUTS[model]}' instead."
|
||||
f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
|
||||
)
|
||||
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
|
@ -61,7 +64,7 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
|||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
f"You can now load the model via spacy.load('{model_name}')",
|
||||
f"You can now load the package via spacy.load('{model_name}')",
|
||||
)
|
||||
|
||||
|
||||
|
@ -71,16 +74,16 @@ def get_compatibility() -> dict:
|
|||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
f"Server error ({r.status_code})",
|
||||
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||
f"Couldn't fetch compatibility table. Please find a package for your spaCy "
|
||||
f"installation (v{about.__version__}), and download it manually. "
|
||||
f"For more details, see the documentation: "
|
||||
f"https://spacy.io/usage/models",
|
||||
f"https://nightly.spacy.io/usage/models",
|
||||
exits=1,
|
||||
)
|
||||
comp_table = r.json()
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||
msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
|
@ -88,7 +91,7 @@ def get_version(model: str, comp: dict) -> str:
|
|||
model = get_base_version(model)
|
||||
if model not in comp:
|
||||
msg.fail(
|
||||
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
|
||||
f"No compatible package found for '{model}' (spaCy v{about.__version__})",
|
||||
exits=1,
|
||||
)
|
||||
return comp[model][0]
|
||||
|
|
|
@ -26,13 +26,16 @@ def evaluate_cli(
|
|||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
||||
binary .spacy format. The --gold-preproc option sets up the evaluation
|
||||
examples with gold-standard sentences and tokens for the predictions. Gold
|
||||
preprocessing helps the annotations align to the tokenization, and may
|
||||
result in sequences of more consistent length. However, it may reduce
|
||||
runtime accuracy due to train/test skew. To render a sample of dependency
|
||||
parses in a HTML file, set as output directory as the displacy_path argument.
|
||||
Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
|
||||
data in the binary .spacy format. The --gold-preproc option sets up the
|
||||
evaluation examples with gold-standard sentences and tokens for the
|
||||
predictions. Gold preprocessing helps the annotations align to the
|
||||
tokenization, and may result in sequences of more consistent length. However,
|
||||
it may reduce runtime accuracy due to train/test skew. To render a sample of
|
||||
dependency parses in a HTML file, set as output directory as the
|
||||
displacy_path argument.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
||||
"""
|
||||
evaluate(
|
||||
model,
|
||||
|
|
|
@ -12,15 +12,17 @@ from .. import about
|
|||
@app.command("info")
|
||||
def info_cli(
|
||||
# fmt: off
|
||||
model: Optional[str] = Arg(None, help="Optional model name"),
|
||||
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Print info about spaCy installation. If a model is speficied as an argument,
|
||||
print model information. Flag --markdown prints details in Markdown for easy
|
||||
Print info about spaCy installation. If a pipeline is speficied as an argument,
|
||||
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#info
|
||||
"""
|
||||
info(model, markdown=markdown, silent=silent)
|
||||
|
||||
|
@ -30,14 +32,16 @@ def info(
|
|||
) -> Union[str, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if model:
|
||||
title = f"Info about model '{model}'"
|
||||
title = f"Info about pipeline '{model}'"
|
||||
data = info_model(model, silent=silent)
|
||||
else:
|
||||
title = "Info about spaCy"
|
||||
data = info_spacy()
|
||||
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
|
||||
if "Models" in data and isinstance(data["Models"], dict):
|
||||
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
|
||||
if "Pipelines" in data and isinstance(data["Pipelines"], dict):
|
||||
data["Pipelines"] = ", ".join(
|
||||
f"{n} ({v})" for n, v in data["Pipelines"].items()
|
||||
)
|
||||
markdown_data = get_markdown(data, title=title)
|
||||
if markdown:
|
||||
if not silent:
|
||||
|
@ -63,7 +67,7 @@ def info_spacy() -> Dict[str, any]:
|
|||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": all_models,
|
||||
"Pipelines": all_models,
|
||||
}
|
||||
|
||||
|
||||
|
@ -81,7 +85,7 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
|||
model_path = model
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["source"] = str(model_path.resolve())
|
||||
|
|
|
@ -27,7 +27,7 @@ def init_config_cli(
|
|||
# fmt: off
|
||||
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
|
||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
# fmt: on
|
||||
|
@ -37,6 +37,8 @@ def init_config_cli(
|
|||
specified via the CLI arguments, this command generates a config with the
|
||||
optimal settings for you use case. This includes the choice of architecture,
|
||||
pretrained weights and related hyperparameters.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#init-config
|
||||
"""
|
||||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||
optimize = optimize.value
|
||||
|
@ -59,15 +61,23 @@ def init_fill_config_cli(
|
|||
functions for their default values and update the base config. This command
|
||||
can be used with a config generated via the training quickstart widget:
|
||||
https://nightly.spacy.io/usage/training#quickstart
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#init-fill-config
|
||||
"""
|
||||
fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
|
||||
|
||||
|
||||
def fill_config(
|
||||
output_file: Path, base_path: Path, *, pretraining: bool = False, diff: bool = False
|
||||
output_file: Path,
|
||||
base_path: Path,
|
||||
*,
|
||||
pretraining: bool = False,
|
||||
diff: bool = False,
|
||||
silent: bool = False,
|
||||
) -> Tuple[Config, Config]:
|
||||
is_stdout = str(output_file) == "-"
|
||||
msg = Printer(no_print=is_stdout)
|
||||
no_print = is_stdout or silent
|
||||
msg = Printer(no_print=no_print)
|
||||
with show_validation_error(hint_fill=False):
|
||||
config = util.load_config(base_path)
|
||||
nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)
|
||||
|
@ -85,7 +95,7 @@ def fill_config(
|
|||
msg.warn("Nothing to auto-fill: base config is already complete")
|
||||
else:
|
||||
msg.good("Auto-filled config with all values")
|
||||
if diff and not is_stdout:
|
||||
if diff and not no_print:
|
||||
if before == after:
|
||||
msg.warn("No diff to show: nothing was auto-filled")
|
||||
else:
|
||||
|
@ -94,7 +104,8 @@ def fill_config(
|
|||
print(diff_strings(before, after))
|
||||
msg.divider("END CONFIG DIFF")
|
||||
print("")
|
||||
save_config(filled, output_file, is_stdout=is_stdout)
|
||||
save_config(filled, output_file, is_stdout=is_stdout, silent=silent)
|
||||
return config, filled
|
||||
|
||||
|
||||
def init_config(
|
||||
|
@ -149,8 +160,11 @@ def init_config(
|
|||
save_config(nlp.config, output_file, is_stdout=is_stdout)
|
||||
|
||||
|
||||
def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:
|
||||
msg = Printer(no_print=is_stdout)
|
||||
def save_config(
|
||||
config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False
|
||||
) -> None:
|
||||
no_print = is_stdout or silent
|
||||
msg = Printer(no_print=no_print)
|
||||
if is_stdout:
|
||||
print(config.to_str())
|
||||
else:
|
||||
|
@ -158,9 +172,10 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N
|
|||
output_file.parent.mkdir(parents=True)
|
||||
config.to_disk(output_file, interpolate=False)
|
||||
msg.good("Saved config", output_file)
|
||||
msg.text("You can now add your data and train your model:")
|
||||
msg.text("You can now add your data and train your pipeline:")
|
||||
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
||||
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
||||
if not no_print:
|
||||
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
||||
|
||||
|
||||
def has_spacy_transformers() -> bool:
|
||||
|
|
|
@ -28,7 +28,7 @@ except ImportError:
|
|||
DEFAULT_OOV_PROB = -20
|
||||
|
||||
|
||||
@init_cli.command("model")
|
||||
@init_cli.command("vocab")
|
||||
@app.command(
|
||||
"init-model",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
|
@ -37,8 +37,8 @@ DEFAULT_OOV_PROB = -20
|
|||
def init_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
lang: str = Arg(..., help="Model language"),
|
||||
output_dir: Path = Arg(..., help="Model output directory"),
|
||||
lang: str = Arg(..., help="Pipeline language"),
|
||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||
|
@ -46,19 +46,22 @@ def init_model_cli(
|
|||
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
|
||||
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
|
||||
model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
|
||||
base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
||||
they can be either a .txt or zipped as a .zip or .tar.gz.
|
||||
Create a new blank pipeline directory with vocab and vectors from raw data.
|
||||
If vectors are provided in Word2Vec format, they can be either a .txt or
|
||||
zipped as a .zip or .tar.gz.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#init-vocab
|
||||
"""
|
||||
if ctx.command.name == "init-model":
|
||||
msg.warn(
|
||||
"The init-model command is now available via the 'init model' "
|
||||
"subcommand (without the hyphen). You can run python -m spacy init "
|
||||
"--help for an overview of the other available initialization commands."
|
||||
"The init-model command is now called 'init vocab'. You can run "
|
||||
"'python -m spacy init --help' for an overview of the other "
|
||||
"available initialization commands."
|
||||
)
|
||||
init_model(
|
||||
lang,
|
||||
|
@ -115,10 +118,10 @@ def init_model(
|
|||
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
||||
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
||||
|
||||
with msg.loading("Creating model..."):
|
||||
with msg.loading("Creating blank pipeline..."):
|
||||
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||
|
||||
msg.good("Successfully created model")
|
||||
msg.good("Successfully created blank pipeline")
|
||||
if vectors_loc is not None:
|
||||
add_vectors(
|
||||
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
||||
|
@ -242,7 +245,8 @@ def add_vectors(
|
|||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
if name is None:
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
|
||||
# TODO: Is this correct? Does this matter?
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
|
|
|
@ -14,23 +14,25 @@ from .. import about
|
|||
@app.command("package")
|
||||
def package_cli(
|
||||
# fmt: off
|
||||
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
|
||||
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||
no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
|
||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Generate an installable Python package for a model. Includes model data,
|
||||
Generate an installable Python package for a pipeline. Includes binary data,
|
||||
meta and required installation files. A new directory will be created in the
|
||||
specified output directory, and model data will be copied over. If
|
||||
specified output directory, and the data will be copied over. If
|
||||
--create-meta is set and a meta.json already exists in the output directory,
|
||||
the existing values will be used as the defaults in the command-line prompt.
|
||||
After packaging, "python setup.py sdist" is run in the package directory,
|
||||
which will create a .tar.gz archive that can be installed via "pip install".
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#package
|
||||
"""
|
||||
package(
|
||||
input_dir,
|
||||
|
@ -59,14 +61,14 @@ def package(
|
|||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not input_path or not input_path.exists():
|
||||
msg.fail("Can't locate model data", input_path, exits=1)
|
||||
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
msg.fail("Output directory not found", output_path, exits=1)
|
||||
if meta_path and not meta_path.exists():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||
meta_path = meta_path or input_dir / "meta.json"
|
||||
if not meta_path.exists() or not meta_path.is_file():
|
||||
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
||||
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
meta = get_meta(input_dir, meta)
|
||||
if version is not None:
|
||||
|
@ -77,7 +79,7 @@ def package(
|
|||
meta = generate_meta(meta, msg)
|
||||
errors = validate(ModelMetaSchema, meta)
|
||||
if errors:
|
||||
msg.fail("Invalid model meta.json")
|
||||
msg.fail("Invalid pipeline meta.json")
|
||||
print("\n".join(errors))
|
||||
sys.exit(1)
|
||||
model_name = meta["lang"] + "_" + meta["name"]
|
||||
|
@ -118,7 +120,7 @@ def get_meta(
|
|||
) -> Dict[str, Any]:
|
||||
meta = {
|
||||
"lang": "en",
|
||||
"name": "model",
|
||||
"name": "pipeline",
|
||||
"version": "0.0.0",
|
||||
"description": "",
|
||||
"author": "",
|
||||
|
@ -143,10 +145,10 @@ def get_meta(
|
|||
def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
|
||||
meta = existing_meta or {}
|
||||
settings = [
|
||||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("description", "Model description", meta.get("description", None)),
|
||||
("lang", "Pipeline language", meta.get("lang", "en")),
|
||||
("name", "Pipeline name", meta.get("name", "pipeline")),
|
||||
("version", "Package version", meta.get("version", "0.0.0")),
|
||||
("description", "Package description", meta.get("description", None)),
|
||||
("author", "Author", meta.get("author", None)),
|
||||
("email", "Author email", meta.get("email", None)),
|
||||
("url", "Author website", meta.get("url", None)),
|
||||
|
@ -154,8 +156,8 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
|
|||
]
|
||||
msg.divider("Generating meta.json")
|
||||
msg.text(
|
||||
"Enter the package settings for your model. The following information "
|
||||
"will be read from your model data: pipeline, vectors."
|
||||
"Enter the package settings for your pipeline. The following information "
|
||||
"will be read from your pipeline data: pipeline, vectors."
|
||||
)
|
||||
for setting, desc, default in settings:
|
||||
response = get_raw_input(desc, default)
|
||||
|
|
|
@ -31,7 +31,7 @@ def pretrain_cli(
|
|||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
||||
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
|
||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
|
@ -57,6 +57,8 @@ def pretrain_cli(
|
|||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. Ideally,
|
||||
this is done by using the same config file for both commands.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#pretrain
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
|
@ -376,10 +378,9 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
|
|||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
if resume_path:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"If you're resuming a run from a previous model in this directory, "
|
||||
"the old models for the consecutive epochs will be overwritten "
|
||||
"with the new ones.",
|
||||
"Output directory is not empty.",
|
||||
"If you're resuming a run in this directory, the old weights "
|
||||
"for the consecutive epochs will be overwritten with the new ones.",
|
||||
)
|
||||
else:
|
||||
msg.warn(
|
||||
|
|
|
@ -19,7 +19,7 @@ from ..util import load_model
|
|||
def profile_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read current calling context
|
||||
model: str = Arg(..., help="Model to load"),
|
||||
model: str = Arg(..., help="Trained pipeline to load"),
|
||||
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
|
||||
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
|
||||
# fmt: on
|
||||
|
@ -29,6 +29,8 @@ def profile_cli(
|
|||
Input should be formatted as one JSON object per line with a key "text".
|
||||
It can either be provided as a JSONL file, or be read from sys.sytdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#debug-profile
|
||||
"""
|
||||
if ctx.parent.command.name == NAME: # called as top-level command
|
||||
msg.warn(
|
||||
|
@ -60,9 +62,9 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
|
|||
inputs, _ = zip(*imdb_train)
|
||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading(f"Loading model '{model}'..."):
|
||||
with msg.loading(f"Loading pipeline '{model}'..."):
|
||||
nlp = load_model(model)
|
||||
msg.good(f"Loaded model '{model}'")
|
||||
msg.good(f"Loaded pipeline '{model}'")
|
||||
texts = list(itertools.islice(inputs, n_texts))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
|
|
|
@ -20,6 +20,8 @@ def project_assets_cli(
|
|||
defined in the "assets" section of the project.yml. If a checksum is
|
||||
provided in the project.yml, the file is only downloaded if no local file
|
||||
with the same checksum exists.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-assets
|
||||
"""
|
||||
project_assets(project_dir)
|
||||
|
||||
|
|
|
@ -22,6 +22,8 @@ def project_clone_cli(
|
|||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo).
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-clone
|
||||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / name
|
||||
|
@ -43,7 +45,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
|
|||
git_sparse_checkout(repo, name, dest)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
msg.fail(err)
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
|
@ -78,6 +80,7 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
|||
if not dest.parent.exists():
|
||||
# We're not creating parents, parent dir should exist
|
||||
msg.fail(
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
|
||||
f"Create the necessary folder(s) first before continuing.",
|
||||
exits=1,
|
||||
)
|
||||
|
|
|
@ -6,6 +6,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
|||
|
||||
|
||||
DOCS_URL = "https://nightly.spacy.io"
|
||||
INTRO = f"""> ⚠️ This project template uses the new [**spaCy v3.0**]({DOCS_URL}), which
|
||||
> is currently available as a nightly pre-release. You can install it from pip as `spacy-nightly`:
|
||||
> `pip install spacy-nightly`. Make sure to use a fresh virtual environment."""
|
||||
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
|
||||
project, as well as the available commands and workflows. For details, see the
|
||||
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
|
||||
|
@ -21,8 +24,10 @@ be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets
|
|||
in the project directory."""
|
||||
# These markers are added to the Markdown and can be used to update the file in
|
||||
# place if it already exists. Only the auto-generated part will be replaced.
|
||||
MARKER_START = "<!-- AUTO-GENERATED DOCS START (do not remove) -->"
|
||||
MARKER_END = "<!-- AUTO-GENERATED DOCS END (do not remove) -->"
|
||||
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
|
||||
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
|
||||
# If this marker is used in an existing README, it's ignored and not replaced
|
||||
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
|
||||
|
||||
|
||||
@project_cli.command("document")
|
||||
|
@ -38,6 +43,8 @@ def project_document_cli(
|
|||
hidden markers are added so you can add custom content before or after the
|
||||
auto-generated section and only the auto-generated docs will be replaced
|
||||
when you re-run the command.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-document
|
||||
"""
|
||||
project_document(project_dir, output_file, no_emoji=no_emoji)
|
||||
|
||||
|
@ -52,6 +59,7 @@ def project_document(
|
|||
title = config.get("title")
|
||||
description = config.get("description")
|
||||
md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
|
||||
md.add(INTRO)
|
||||
if description:
|
||||
md.add(description)
|
||||
md.add(md.title(2, PROJECT_FILE, "📋"))
|
||||
|
@ -96,13 +104,16 @@ def project_document(
|
|||
if output_file.exists():
|
||||
with output_file.open("r", encoding="utf8") as f:
|
||||
existing = f.read()
|
||||
if MARKER_IGNORE in existing:
|
||||
msg.warn("Found ignore marker in existing file: skipping", output_file)
|
||||
return
|
||||
if MARKER_START in existing and MARKER_END in existing:
|
||||
msg.info("Found existing file: only replacing auto-generated docs")
|
||||
before = existing.split(MARKER_START)[0]
|
||||
after = existing.split(MARKER_END)[1]
|
||||
content = f"{before}{content}{after}"
|
||||
else:
|
||||
msg.info("Replacing existing file")
|
||||
msg.warn("Replacing existing file")
|
||||
with output_file.open("w") as f:
|
||||
f.write(content)
|
||||
msg.good("Saved project documentation", output_file)
|
||||
|
|
|
@ -31,7 +31,10 @@ def project_update_dvc_cli(
|
|||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. If no workflow is specified, the first defined
|
||||
workflow is used. The DVC config will only be updated if the project.yml changed.
|
||||
workflow is used. The DVC config will only be updated if the project.yml
|
||||
changed.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-dvc
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||
|
||||
|
|
|
@ -17,7 +17,9 @@ def project_pull_cli(
|
|||
"""Retrieve available precomputed outputs from a remote storage.
|
||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||
A storage can be anything that the smart-open library can upload to, e.g.
|
||||
gcs, aws, ssh, local directories etc
|
||||
AWS, Google Cloud Storage, SSH, local directories etc.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-pull
|
||||
"""
|
||||
for url, output_path in project_pull(project_dir, remote):
|
||||
if url is not None:
|
||||
|
@ -38,5 +40,5 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
yield url, output_path
|
||||
|
||||
if cmd.get("outptus") and all(loc.exists() for loc in cmd["outputs"]):
|
||||
if cmd.get("outputs") and all(loc.exists() for loc in cmd["outputs"]):
|
||||
update_lockfile(project_dir, cmd)
|
||||
|
|
|
@ -13,9 +13,12 @@ def project_push_cli(
|
|||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
|
||||
by mapping them to storage paths. A storage can be anything that the smart-open
|
||||
library can upload to, e.g. gcs, aws, ssh, local directories etc
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your
|
||||
project.yml by mapping them to storage paths. A storage can be anything that
|
||||
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
|
||||
local directories etc.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-push
|
||||
"""
|
||||
for output_path, url in project_push(project_dir, remote):
|
||||
if url is None:
|
||||
|
|
|
@ -24,6 +24,8 @@ def project_run_cli(
|
|||
name is specified, all commands in the workflow are run, in order. If
|
||||
commands define dependencies and/or outputs, they will only be re-run if
|
||||
state has changed.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#project-run
|
||||
"""
|
||||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
|
|
|
@ -29,7 +29,7 @@ name = "{{ transformer["name"] }}"
|
|||
tokenizer_config = {"use_fast": true}
|
||||
|
||||
[components.transformer.model.get_spans]
|
||||
@span_getters = "strided_spans.v1"
|
||||
@span_getters = "spacy-transformers.strided_spans.v1"
|
||||
window = 128
|
||||
stride = 96
|
||||
|
||||
|
@ -42,7 +42,7 @@ factory = "tagger"
|
|||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.tagger.model.tok2vec.pooling]
|
||||
|
@ -62,7 +62,7 @@ use_upper = false
|
|||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.parser.model.tok2vec.pooling]
|
||||
|
@ -82,7 +82,7 @@ use_upper = false
|
|||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.ner.model.tok2vec.pooling]
|
||||
|
@ -204,13 +204,13 @@ max_length = 0
|
|||
|
||||
{% if use_transformer %}
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_padded.v1"
|
||||
@batchers = "spacy.batch_by_padded.v1"
|
||||
discard_oversize = true
|
||||
size = 2000
|
||||
buffer = 256
|
||||
{%- else %}
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_words.v1"
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ def train_cli(
|
|||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
|
||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
|
@ -34,7 +34,7 @@ def train_cli(
|
|||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Train or update a spaCy model. Requires data in spaCy's binary format. To
|
||||
Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
|
||||
convert data from other formats, use the `spacy convert` command. The
|
||||
config file includes all settings and hyperparameters used during traing.
|
||||
To override settings in the config, e.g. settings that point to local
|
||||
|
@ -44,6 +44,8 @@ def train_cli(
|
|||
lets you pass in a Python file that's imported before training. It can be
|
||||
used to register custom functions and architectures that can then be
|
||||
referenced in the config.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#train
|
||||
"""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
||||
verify_cli_args(config_path, output_path)
|
||||
|
@ -77,6 +79,9 @@ def train(
|
|||
)
|
||||
if config.get("training", {}).get("seed") is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
|
||||
# It feels kind of weird to not have a default for this.
|
||||
use_pytorch_for_gpu_memory()
|
||||
# Use original config here before it's resolved to functions
|
||||
sourced_components = get_sourced_components(config)
|
||||
with show_validation_error(config_path):
|
||||
|
@ -85,9 +90,6 @@ def train(
|
|||
util.load_vectors_into_model(nlp, config["training"]["vectors"])
|
||||
verify_config(nlp)
|
||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
||||
if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
|
||||
# It feels kind of weird to not have a default for this.
|
||||
use_pytorch_for_gpu_memory()
|
||||
T_cfg = config["training"]
|
||||
optimizer = T_cfg["optimizer"]
|
||||
train_corpus = T_cfg["train_corpus"]
|
||||
|
@ -113,12 +115,12 @@ def train(
|
|||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
|
||||
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
||||
if weights_data is not None:
|
||||
tok2vec_path = config["pretraining"].get("tok2vec_model", None)
|
||||
if tok2vec_path is None:
|
||||
msg.fail(
|
||||
f"To use a pretrained tok2vec model, the config needs to specify which "
|
||||
f"To pretrained tok2vec weights, the config needs to specify which "
|
||||
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -159,7 +161,8 @@ def train(
|
|||
print_row(info)
|
||||
if is_best_checkpoint and output_path is not None:
|
||||
update_meta(T_cfg, nlp, info)
|
||||
nlp.to_disk(output_path / "model-best")
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk(output_path / "model-best")
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
progress.set_description(f"Epoch {info['epoch']}")
|
||||
except Exception as e:
|
||||
|
@ -182,22 +185,16 @@ def train(
|
|||
nlp.to_disk(final_model_path)
|
||||
else:
|
||||
nlp.to_disk(final_model_path)
|
||||
msg.good(f"Saved model to output directory {final_model_path}")
|
||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||
|
||||
|
||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
||||
epoch = 1
|
||||
examples = []
|
||||
# Stream the first epoch, so we start training faster and support
|
||||
# infinite streams.
|
||||
for batch in batcher(iterator):
|
||||
yield epoch, batch
|
||||
if max_epochs != 1:
|
||||
examples.extend(batch)
|
||||
epoch = 0
|
||||
examples = list(iterator)
|
||||
if not examples:
|
||||
# Raise error if no data
|
||||
raise ValueError(Errors.E986)
|
||||
while epoch != max_epochs:
|
||||
while max_epochs < 1 or epoch != max_epochs:
|
||||
random.shuffle(examples)
|
||||
for batch in batcher(examples):
|
||||
yield epoch, batch
|
||||
|
@ -270,9 +267,9 @@ def train_while_improving(
|
|||
|
||||
epoch (int): How many passes over the data have been completed.
|
||||
step (int): How many steps have been completed.
|
||||
score (float): The main score form the last evaluation.
|
||||
score (float): The main score from the last evaluation.
|
||||
other_scores: : The other scores from the last evaluation.
|
||||
loss: The accumulated losses throughout training.
|
||||
losses: The accumulated losses throughout training.
|
||||
checkpoints: A list of previous results, where each result is a
|
||||
(score, step, epoch) tuple.
|
||||
"""
|
||||
|
|
|
@ -13,9 +13,11 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
|
|||
@app.command("validate")
|
||||
def validate_cli():
|
||||
"""
|
||||
Validate the currently installed models and spaCy version. Checks if the
|
||||
installed models are compatible and shows upgrade instructions if available.
|
||||
Should be run after `pip install -U spacy`.
|
||||
Validate the currently installed pipeline packages and spaCy version. Checks
|
||||
if the installed packages are compatible and shows upgrade instructions if
|
||||
available. Should be run after `pip install -U spacy`.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#validate
|
||||
"""
|
||||
validate()
|
||||
|
||||
|
@ -25,13 +27,13 @@ def validate() -> None:
|
|||
spacy_version = get_base_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
if not current_compat:
|
||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
||||
msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
|
||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
spacy_dir = Path(__file__).parent.parent
|
||||
|
||||
msg.divider(f"Installed models (spaCy v{about.__version__})")
|
||||
msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
|
||||
msg.info(f"spaCy installation: {spacy_dir}")
|
||||
|
||||
if model_pkgs:
|
||||
|
@ -47,15 +49,15 @@ def validate() -> None:
|
|||
rows.append((data["name"], data["spacy"], version, comp))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
msg.text("No models found in your current environment.", exits=0)
|
||||
msg.text("No pipeline packages found in your current environment.", exits=0)
|
||||
if update_models:
|
||||
msg.divider("Install updates")
|
||||
msg.text("Use the following commands to update the model packages:")
|
||||
msg.text("Use the following commands to update the packages:")
|
||||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
msg.info(
|
||||
f"The following models are custom spaCy models or not "
|
||||
f"The following packages are custom spaCy pipelines or not "
|
||||
f"available for spaCy v{about.__version__}:",
|
||||
", ".join(na_models),
|
||||
)
|
||||
|
|
|
@ -69,7 +69,7 @@ max_length = 2000
|
|||
limit = 0
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_words.v1"
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
"""
|
||||
spaCy's built in visualization suite for dependencies and named entities.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
DOCS: https://nightly.spacy.io/api/top-level#displacy
|
||||
USAGE: https://nightly.spacy.io/usage/visualizers
|
||||
"""
|
||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
||||
import warnings
|
||||
|
@@ -37,8 +37,8 @@ def render(
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.

DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
DOCS: https://nightly.spacy.io/api/top-level#displacy.render
USAGE: https://nightly.spacy.io/usage/visualizers
"""
factories = {
"dep": (DependencyRenderer, parse_deps),

@@ -88,8 +88,8 @@ def serve(
port (int): Port to serve visualisation.
host (str): Host to serve visualisation.

DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
DOCS: https://nightly.spacy.io/api/top-level#displacy.serve
USAGE: https://nightly.spacy.io/usage/visualizers
"""
from wsgiref import simple_server
@@ -249,6 +249,12 @@ class EntityRenderer:
colors = dict(DEFAULT_LABEL_COLORS)
user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
if callable(user_color):
# Since this comes from the function registry, we want to make
# sure we support functions that *return* a dict of colors
user_color = user_color()
if not isinstance(user_color, dict):
raise ValueError(Errors.E925.format(obj=type(user_color)))
colors.update(user_color)
colors.update(options.get("colors", {}))
self.default_color = DEFAULT_ENTITY_COLOR
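The block above is the new color-resolution logic in displacy's EntityRenderer: user-supplied colors come from the registry.displacy_colors table and may be either a dict or a callable returning a dict, with E925 raised otherwise. A minimal sketch of feeding it a custom color table, assuming direct registration ("my_colors" is only an illustrative name; packages would normally expose this through the spacy_displacy_colors entry point):

from spacy.util import registry

def my_colors():
    # A callable is allowed because the renderer calls it and expects a dict back
    return {"FRUIT": "#85c2ff"}

registry.displacy_colors.register("my_colors", func=my_colors)
# Subsequent displacy.render(doc, style="ent") calls merge these colors
# with DEFAULT_LABEL_COLORS before rendering.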
@@ -22,7 +22,7 @@ class Warnings:
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
"not just a language class like `English()`. For more info, see "
"the docs:\nhttps://spacy.io/usage/models")
"the docs:\nhttps://nightly.spacy.io/usage/models")
W006 = ("No entities to visualize found in Doc object. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports named entity recognition, and check the `doc.ents` "

@@ -112,6 +112,9 @@ class Warnings:
"word segmenters: {supported}. Defaulting to {default}.")
W104 = ("Skipping modifications for '{target}' segmenter. The current "
"segmenter is '{current}'.")
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
"need to match on a stream of documents, you can use nlp.pipe and "
"call the {matcher} on each Doc object.")


@add_codes
@ -144,7 +147,7 @@ class Errors:
|
|||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||
"a model installed or loaded, or because your model doesn't "
|
||||
"include word vectors. For more info, see the docs:\n"
|
||||
"https://spacy.io/usage/models")
|
||||
"https://nightly.spacy.io/usage/models")
|
||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E014 = ("Unknown tag ID: {tag}")
|
||||
|
@ -178,7 +181,7 @@ class Errors:
|
|||
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
||||
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
||||
"statistical model to be installed and loaded. For more info, see "
|
||||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
"the documentation:\nhttps://nightly.spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe('sentencizer'). "
|
||||
|
@ -291,7 +294,7 @@ class Errors:
|
|||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
||||
"tokens to merge. If you want to find the longest non-overlapping "
|
||||
"spans, you can use the util.filter_spans helper:\n"
|
||||
"https://spacy.io/api/top-level#util.filter_spans")
|
||||
"https://nightly.spacy.io/api/top-level#util.filter_spans")
|
||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||
"token can only be part of one entity, so make sure the entities "
|
||||
"you're setting don't overlap.")
|
||||
|
@ -361,10 +364,10 @@ class Errors:
|
|||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||
"to provide a valid JSON object as input with either the `text` "
|
||||
"or `tokens` key. For more info, see the docs:\n"
|
||||
"https://spacy.io/api/cli#pretrain-jsonl")
|
||||
"https://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||
"includes either the `text` or `tokens` key. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
||||
"the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl")
|
||||
E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
|
||||
"kb.add_entity and kb.add_alias to add entries.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
|
@@ -473,6 +476,8 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

# TODO: fix numbering after merging develop into master
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
"mapping label names to colors but got: {obj}")
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
"doesn't work because it's an immutable computed property. If you "
"need to modify the pipeline, use the built-in methods like "

@@ -645,6 +650,10 @@ class Errors:
"Required tables '{tables}', found '{found}'. If you are not "
"providing custom lookups, make sure you have the package "
"spacy-lookups-data installed.")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
"`ORTH` and `NORM`.")
E1006 = ("Unable to initialize {name} model with 0 labels.")


@add_codes
@@ -11,7 +11,7 @@ ItemT = TypeVar("ItemT")
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]


@registry.batchers("batch_by_padded.v1")
@registry.batchers("spacy.batch_by_padded.v1")
def configure_minibatch_by_padded_size(
*,
size: Sizing,

@@ -46,7 +46,7 @@ def configure_minibatch_by_padded_size(
)


@registry.batchers("batch_by_words.v1")
@registry.batchers("spacy.batch_by_words.v1")
def configure_minibatch_by_words(
*,
size: Sizing,

@@ -70,7 +70,7 @@ def configure_minibatch_by_words(
)


@registry.batchers("batch_by_sequence.v1")
@registry.batchers("spacy.batch_by_sequence.v1")
def configure_minibatch(
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
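These hunks give the built-in batchers "spacy."-prefixed registry names, which is why the [training.batcher] blocks elsewhere in this commit now read "spacy.batch_by_words.v1". A sketch of registering a custom batcher alongside them under the same registry; "my.batch_by_sequence.v1" is a made-up name and the body simply wraps spacy.util.minibatch:

from spacy.util import registry, minibatch

@registry.batchers("my.batch_by_sequence.v1")
def configure_my_batcher(size: int):
    def batcher(items):
        # Yield fixed-size lists of items, like the built-in sequence batcher
        yield from minibatch(items, size=size)
    return batcher

A config would then refer to it by its registered name:

[training.batcher]
@batchers = "my.batch_by_sequence.v1"
size = 128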
@ -106,7 +106,7 @@ def conll_ner2docs(
|
|||
raise ValueError(
|
||||
"The token-per-line NER file is not formatted correctly. "
|
||||
"Try checking whitespace and delimiters. See "
|
||||
"https://spacy.io/api/cli#convert"
|
||||
"https://nightly.spacy.io/api/cli#convert"
|
||||
)
|
||||
length = len(cols[0])
|
||||
words.extend(cols[0])
|
||||
|
|
|
@ -44,7 +44,7 @@ def read_iob(raw_sents, vocab, n_sents):
|
|||
sent_tags = ["-"] * len(sent_words)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert"
|
||||
)
|
||||
words.extend(sent_words)
|
||||
tags.extend(sent_tags)
|
||||
|
|
|
@ -38,7 +38,7 @@ class Corpus:
|
|||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||
Defaults to 0, which indicates no limit.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus
|
||||
DOCS: https://nightly.spacy.io/api/corpus
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -83,7 +83,7 @@ class Corpus:
|
|||
nlp (Language): The current nlp object.
|
||||
YIELDS (Example): The examples.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus#call
|
||||
DOCS: https://nightly.spacy.io/api/corpus#call
|
||||
"""
|
||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
||||
if self.gold_preproc:
|
||||
|
|
|
@ -21,7 +21,7 @@ cdef class Candidate:
|
|||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
||||
DOCS: https://nightly.spacy.io/api/kb/#candidate_init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
|
@ -79,7 +79,7 @@ cdef class KnowledgeBase:
|
|||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
DOCS: https://nightly.spacy.io/api/kb
|
||||
"""
|
||||
|
||||
def __init__(self, Vocab vocab, entity_vector_length):
|
||||
|
|
|
@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
|||
|
||||
_currency = r"\$¢£€¥฿"
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
_units = UNITS.replace("%", "")
|
||||
|
||||
_prefixes = (
|
||||
LIST_PUNCT
|
||||
|
@ -26,7 +27,7 @@ _suffixes = (
|
|||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||
),
|
||||
|
|
|
@@ -1,5 +1,5 @@
from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
from typing import Tuple, Iterator
from typing import Tuple, Iterator, Optional
from dataclasses import dataclass
import random
import itertools
@ -95,7 +95,7 @@ class Language:
|
|||
object and processing pipeline.
|
||||
lang (str): Two-letter language ID, i.e. ISO code.
|
||||
|
||||
DOCS: https://spacy.io/api/language
|
||||
DOCS: https://nightly.spacy.io/api/language
|
||||
"""
|
||||
|
||||
Defaults = BaseDefaults
|
||||
|
@ -130,7 +130,7 @@ class Language:
|
|||
create_tokenizer (Callable): Function that takes the nlp object and
|
||||
returns a tokenizer.
|
||||
|
||||
DOCS: https://spacy.io/api/language#init
|
||||
DOCS: https://nightly.spacy.io/api/language#init
|
||||
"""
|
||||
# We're only calling this to import all factories provided via entry
|
||||
# points. The factory decorator applied to these functions takes care
|
||||
|
@ -185,14 +185,14 @@ class Language:
|
|||
|
||||
RETURNS (Dict[str, Any]): The meta.
|
||||
|
||||
DOCS: https://spacy.io/api/language#meta
|
||||
DOCS: https://nightly.spacy.io/api/language#meta
|
||||
"""
|
||||
spacy_version = util.get_model_version_range(about.__version__)
|
||||
if self.vocab.lang:
|
||||
self._meta.setdefault("lang", self.vocab.lang)
|
||||
else:
|
||||
self._meta.setdefault("lang", self.lang)
|
||||
self._meta.setdefault("name", "model")
|
||||
self._meta.setdefault("name", "pipeline")
|
||||
self._meta.setdefault("version", "0.0.0")
|
||||
self._meta.setdefault("spacy_version", spacy_version)
|
||||
self._meta.setdefault("description", "")
|
||||
|
@ -225,7 +225,7 @@ class Language:
|
|||
|
||||
RETURNS (thinc.api.Config): The config.
|
||||
|
||||
DOCS: https://spacy.io/api/language#config
|
||||
DOCS: https://nightly.spacy.io/api/language#config
|
||||
"""
|
||||
self._config.setdefault("nlp", {})
|
||||
self._config.setdefault("training", {})
|
||||
|
@ -433,7 +433,7 @@ class Language:
|
|||
will be combined and normalized for the whole pipeline.
|
||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||
|
||||
DOCS: https://spacy.io/api/language#factory
|
||||
DOCS: https://nightly.spacy.io/api/language#factory
|
||||
"""
|
||||
if not isinstance(name, str):
|
||||
raise ValueError(Errors.E963.format(decorator="factory"))
|
||||
|
@ -513,7 +513,7 @@ class Language:
|
|||
Used for pipeline analysis.
|
||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||
|
||||
DOCS: https://spacy.io/api/language#component
|
||||
DOCS: https://nightly.spacy.io/api/language#component
|
||||
"""
|
||||
if name is not None and not isinstance(name, str):
|
||||
raise ValueError(Errors.E963.format(decorator="component"))
|
||||
|
@ -579,7 +579,7 @@ class Language:
|
|||
name (str): Name of pipeline component to get.
|
||||
RETURNS (callable): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#get_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#get_pipe
|
||||
"""
|
||||
for pipe_name, component in self._components:
|
||||
if pipe_name == name:
|
||||
|
@ -608,7 +608,7 @@ class Language:
|
|||
arguments and types expected by the factory.
|
||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#create_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#create_pipe
|
||||
"""
|
||||
name = name if name is not None else factory_name
|
||||
if not isinstance(config, dict):
|
||||
|
@ -722,7 +722,7 @@ class Language:
|
|||
arguments and types expected by the factory.
|
||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#add_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#add_pipe
|
||||
"""
|
||||
if not isinstance(factory_name, str):
|
||||
bad_val = repr(factory_name)
|
||||
|
@ -820,7 +820,7 @@ class Language:
|
|||
name (str): Name of the component.
|
||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/language#has_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#has_pipe
|
||||
"""
|
||||
return name in self.pipe_names
|
||||
|
||||
|
@ -841,7 +841,7 @@ class Language:
|
|||
validate (bool): Whether to validate the component config against the
|
||||
arguments and types expected by the factory.
|
||||
|
||||
DOCS: https://spacy.io/api/language#replace_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#replace_pipe
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
|
@ -870,7 +870,7 @@ class Language:
|
|||
old_name (str): Name of the component to rename.
|
||||
new_name (str): New name of the component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#rename_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#rename_pipe
|
||||
"""
|
||||
if old_name not in self.component_names:
|
||||
raise ValueError(
|
||||
|
@ -891,7 +891,7 @@ class Language:
|
|||
name (str): Name of the component to remove.
|
||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#remove_pipe
|
||||
DOCS: https://nightly.spacy.io/api/language#remove_pipe
|
||||
"""
|
||||
if name not in self.component_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||
|
@ -944,7 +944,7 @@ class Language:
|
|||
keyword arguments for specific components.
|
||||
RETURNS (Doc): A container for accessing the annotations.
|
||||
|
||||
DOCS: https://spacy.io/api/language#call
|
||||
DOCS: https://nightly.spacy.io/api/language#call
|
||||
"""
|
||||
if len(text) > self.max_length:
|
||||
raise ValueError(
|
||||
|
@ -993,7 +993,7 @@ class Language:
|
|||
disable (str or iterable): The name(s) of the pipes to disable
|
||||
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
||||
|
||||
DOCS: https://spacy.io/api/language#select_pipes
|
||||
DOCS: https://nightly.spacy.io/api/language#select_pipes
|
||||
"""
|
||||
if enable is None and disable is None:
|
||||
raise ValueError(Errors.E991)
|
||||
|
@ -1044,7 +1044,7 @@ class Language:
|
|||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||
|
||||
DOCS: https://spacy.io/api/language#update
|
||||
DOCS: https://nightly.spacy.io/api/language#update
|
||||
"""
|
||||
if _ is not None:
|
||||
raise ValueError(Errors.E989)
|
||||
|
@ -1106,7 +1106,7 @@ class Language:
|
|||
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
||||
>>> nlp.rehearse(raw_batch)
|
||||
|
||||
DOCS: https://spacy.io/api/language#rehearse
|
||||
DOCS: https://nightly.spacy.io/api/language#rehearse
|
||||
"""
|
||||
if len(examples) == 0:
|
||||
return
|
||||
|
@ -1153,7 +1153,7 @@ class Language:
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/language#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/language#begin_training
|
||||
"""
|
||||
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
||||
if get_examples is None:
|
||||
|
@ -1200,7 +1200,7 @@ class Language:
|
|||
sgd (Optional[Optimizer]): An optimizer.
|
||||
RETURNS (Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/language#resume_training
|
||||
DOCS: https://nightly.spacy.io/api/language#resume_training
|
||||
"""
|
||||
if device >= 0: # TODO: do we need this here?
|
||||
require_gpu(device)
|
||||
|
@ -1236,7 +1236,7 @@ class Language:
|
|||
for the scorer.
|
||||
RETURNS (Scorer): The scorer containing the evaluation results.
|
||||
|
||||
DOCS: https://spacy.io/api/language#evaluate
|
||||
DOCS: https://nightly.spacy.io/api/language#evaluate
|
||||
"""
|
||||
validate_examples(examples, "Language.evaluate")
|
||||
if component_cfg is None:
|
||||
|
@@ -1275,7 +1275,7 @@ class Language:
return results

@contextmanager
def use_params(self, params: dict):
def use_params(self, params: Optional[dict]):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.

@@ -1286,26 +1286,29 @@ class Language:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk("/tmp/checkpoint")

DOCS: https://spacy.io/api/language#use_params
DOCS: https://nightly.spacy.io/api/language#use_params
"""
contexts = [
pipe.use_params(params)
for name, pipe in self.pipeline
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
for context in contexts:
try:
next(context)
except StopIteration:
pass
yield
for context in contexts:
try:
next(context)
except StopIteration:
pass
if not params:
yield
else:
contexts = [
pipe.use_params(params)
for name, pipe in self.pipeline
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
for context in contexts:
try:
next(context)
except StopIteration:
pass
yield
for context in contexts:
try:
next(context)
except StopIteration:
pass

def pipe(
self,
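The rewritten use_params above now treats a missing or empty params dict as a no-op instead of trying to swap weights. A sketch of the calling pattern this supports, assuming nlp and optimizer come from an existing training loop:

averages = optimizer.averages if optimizer is not None else None
with nlp.use_params(averages):
    # Weights are swapped for the averaged values only if averages is non-empty
    nlp.to_disk("/tmp/checkpoint")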
@@ -1314,7 +1317,6 @@ class Language:
as_tuples: bool = False,
batch_size: int = 1000,
disable: Iterable[str] = SimpleFrozenList(),
cleanup: bool = False,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
):

@@ -1326,14 +1328,12 @@ class Language:
(doc, context) tuples. Defaults to False.
batch_size (int): The number of texts to buffer.
disable (List[str]): Names of the pipeline components to disable.
cleanup (bool): If True, unneeded strings are freed to control memory
use. Experimental.
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
YIELDS (Doc): Documents in the order of the original text.

DOCS: https://spacy.io/api/language#pipe
DOCS: https://nightly.spacy.io/api/language#pipe
"""
if n_process == -1:
n_process = mp.cpu_count()
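A minimal sketch of nlp.pipe with the arguments documented above; the blank English pipeline and the example texts are illustrative only:

import spacy

nlp = spacy.blank("en")
texts = ["This is the first text.", "This is the second text."]
# as_tuples=True streams (text, context) pairs and yields (doc, context) pairs
data = [(text, i) for i, text in enumerate(texts)]
for doc, context in nlp.pipe(data, as_tuples=True, batch_size=2, n_process=1):
    print(context, doc[0].text)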
@@ -1378,35 +1378,9 @@ class Language:
for pipe in pipes:
docs = pipe(docs)

# Track weakrefs of "recent" documents, so that we can see when they
# expire from memory. When they do, we know we don't need old strings.
# This way, we avoid maintaining an unbounded growth in string entries
# in the string store.
recent_refs = weakref.WeakSet()
old_refs = weakref.WeakSet()
# Keep track of the original string data, so that if we flush old strings,
# we can recover the original ones. However, we only want to do this if we're
# really adding strings, to save up-front costs.
original_strings_data = None
nr_seen = 0
for doc in docs:
yield doc
if cleanup:
recent_refs.add(doc)
if nr_seen < 10000:
old_refs.add(doc)
nr_seen += 1
elif len(old_refs) == 0:
old_refs, recent_refs = recent_refs, old_refs
if original_strings_data is None:
original_strings_data = list(self.vocab.strings)
else:
keys, strings = self.vocab.strings._cleanup_stale_strings(
original_strings_data
)
self.vocab._reset_cache(keys, strings)
self.tokenizer._reset_cache(keys)
nr_seen = 0

def _multiprocessing_pipe(
self,
@ -1495,7 +1469,7 @@ class Language:
|
|||
the types expected by the factory.
|
||||
RETURNS (Language): The initialized Language class.
|
||||
|
||||
DOCS: https://spacy.io/api/language#from_config
|
||||
DOCS: https://nightly.spacy.io/api/language#from_config
|
||||
"""
|
||||
if auto_fill:
|
||||
config = Config(
|
||||
|
@ -1608,7 +1582,7 @@ class Language:
|
|||
it doesn't exist.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/language#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/language#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
serializers = {}
|
||||
|
@ -1637,7 +1611,7 @@ class Language:
|
|||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
DOCS: https://spacy.io/api/language#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/language#from_disk
|
||||
"""
|
||||
|
||||
def deserialize_meta(path: Path) -> None:
|
||||
|
@ -1685,7 +1659,7 @@ class Language:
|
|||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Language` object.
|
||||
|
||||
DOCS: https://spacy.io/api/language#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/language#to_bytes
|
||||
"""
|
||||
serializers = {}
|
||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||
|
@ -1709,7 +1683,7 @@ class Language:
|
|||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (Language): The `Language` object.
|
||||
|
||||
DOCS: https://spacy.io/api/language#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/language#from_bytes
|
||||
"""
|
||||
|
||||
def deserialize_meta(b):
|
||||
|
|
|
@ -30,7 +30,7 @@ cdef class Lexeme:
|
|||
tag, dependency parse, or lemma (lemmatization depends on the
|
||||
part-of-speech tag).
|
||||
|
||||
DOCS: https://spacy.io/api/lexeme
|
||||
DOCS: https://nightly.spacy.io/api/lexeme
|
||||
"""
|
||||
def __init__(self, Vocab vocab, attr_t orth):
|
||||
"""Create a Lexeme object.
|
||||
|
|
|
@ -57,7 +57,7 @@ class Table(OrderedDict):
|
|||
data (dict): The dictionary.
|
||||
name (str): Optional table name for reference.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#table.from_dict
|
||||
DOCS: https://nightly.spacy.io/api/lookups#table.from_dict
|
||||
"""
|
||||
self = cls(name=name)
|
||||
self.update(data)
|
||||
|
@ -69,7 +69,7 @@ class Table(OrderedDict):
|
|||
name (str): Optional table name for reference.
|
||||
data (dict): Initial data, used to hint Bloom Filter.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#table.init
|
||||
DOCS: https://nightly.spacy.io/api/lookups#table.init
|
||||
"""
|
||||
OrderedDict.__init__(self)
|
||||
self.name = name
|
||||
|
@ -135,7 +135,7 @@ class Table(OrderedDict):
|
|||
|
||||
RETURNS (bytes): The serialized table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#table.to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lookups#table.to_bytes
|
||||
"""
|
||||
data = {
|
||||
"name": self.name,
|
||||
|
@ -150,7 +150,7 @@ class Table(OrderedDict):
|
|||
bytes_data (bytes): The data to load.
|
||||
RETURNS (Table): The loaded table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#table.from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lookups#table.from_bytes
|
||||
"""
|
||||
loaded = srsly.msgpack_loads(bytes_data)
|
||||
data = loaded.get("dict", {})
|
||||
|
@ -172,7 +172,7 @@ class Lookups:
|
|||
def __init__(self) -> None:
|
||||
"""Initialize the Lookups object.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#init
|
||||
DOCS: https://nightly.spacy.io/api/lookups#init
|
||||
"""
|
||||
self._tables = {}
|
||||
|
||||
|
@ -201,7 +201,7 @@ class Lookups:
|
|||
data (dict): Optional data to add to the table.
|
||||
RETURNS (Table): The newly added table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#add_table
|
||||
DOCS: https://nightly.spacy.io/api/lookups#add_table
|
||||
"""
|
||||
if name in self.tables:
|
||||
raise ValueError(Errors.E158.format(name=name))
|
||||
|
@ -215,7 +215,7 @@ class Lookups:
|
|||
name (str): Name of the table to set.
|
||||
table (Table): The Table to set.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#set_table
|
||||
DOCS: https://nightly.spacy.io/api/lookups#set_table
|
||||
"""
|
||||
self._tables[name] = table
|
||||
|
||||
|
@ -227,7 +227,7 @@ class Lookups:
|
|||
default (Any): Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#get_table
|
||||
DOCS: https://nightly.spacy.io/api/lookups#get_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
if default == UNSET:
|
||||
|
@ -241,7 +241,7 @@ class Lookups:
|
|||
name (str): Name of the table to remove.
|
||||
RETURNS (Table): The removed table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#remove_table
|
||||
DOCS: https://nightly.spacy.io/api/lookups#remove_table
|
||||
"""
|
||||
if name not in self._tables:
|
||||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
|
@ -253,7 +253,7 @@ class Lookups:
|
|||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name exists.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#has_table
|
||||
DOCS: https://nightly.spacy.io/api/lookups#has_table
|
||||
"""
|
||||
return name in self._tables
|
||||
|
||||
|
@ -262,7 +262,7 @@ class Lookups:
|
|||
|
||||
RETURNS (bytes): The serialized Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lookups#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps(self._tables)
|
||||
|
||||
|
@ -272,7 +272,7 @@ class Lookups:
|
|||
bytes_data (bytes): The data to load.
|
||||
RETURNS (Lookups): The loaded Lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lookups#from_bytes
|
||||
"""
|
||||
self._tables = {}
|
||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||
|
@ -287,7 +287,7 @@ class Lookups:
|
|||
|
||||
path (str / Path): The file path.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/lookups#to_disk
|
||||
"""
|
||||
if len(self._tables):
|
||||
path = ensure_path(path)
|
||||
|
@ -306,7 +306,7 @@ class Lookups:
|
|||
path (str / Path): The directory path.
|
||||
RETURNS (Lookups): The loaded lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/lookups#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
filepath = path / filename
|
||||
|
|
|
@@ -31,8 +31,8 @@ DEF PADDING = 5
cdef class Matcher:
"""Match sequences of tokens, based on pattern rules.

DOCS: https://spacy.io/api/matcher
USAGE: https://spacy.io/usage/rule-based-matching
DOCS: https://nightly.spacy.io/api/matcher
USAGE: https://nightly.spacy.io/usage/rule-based-matching
"""

def __init__(self, vocab, validate=True):

@@ -176,18 +176,10 @@ cdef class Matcher:
return (self._callbacks[key], self._patterns[key])

def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn.

docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
"""Match a stream of documents, yielding them in turn. Deprecated as of
spaCy v3.0.
"""
warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
if as_tuples:
for doc, context in docs:
matches = self(doc)

@@ -203,13 +195,16 @@ cdef class Matcher:
else:
yield doc

def __call__(self, object doclike):
def __call__(self, object doclike, *, as_spans=False):
"""Find all token sequences matching the supplied pattern.

doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
as_spans (bool): Return Span objects with labels instead of (match_id,
start, end) tuples.
RETURNS (list): A list of `(match_id, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
to True, a list of Span objects is returned.
"""
if isinstance(doclike, Doc):
doc = doclike

@@ -262,7 +257,10 @@ cdef class Matcher:
on_match = self._callbacks.get(key, None)
if on_match is not None:
on_match(self, doc, i, final_matches)
return final_matches
if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in final_matches]
else:
return final_matches

def _normalize_key(self, key):
if isinstance(key, basestring):
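The hunks above deprecate Matcher.pipe and add an as_spans keyword to Matcher.__call__. A small sketch of the new option, with an illustrative pattern and text:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("Hello world, and hello world again.")
# Default behaviour: a list of (match_id, start, end) tuples
match_tuples = matcher(doc)
# New behaviour: labelled Span objects instead of tuples
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)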
@ -7,6 +7,7 @@ import warnings
|
|||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.token cimport Token
|
||||
from ..tokens.span cimport Span
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from ..schemas import TokenPattern
|
||||
|
@ -18,8 +19,8 @@ cdef class PhraseMatcher:
|
|||
sequences based on lists of token descriptions, the `PhraseMatcher` accepts
|
||||
match patterns in the form of `Doc` objects.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher
|
||||
USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher
|
||||
USAGE: https://nightly.spacy.io/usage/rule-based-matching#phrasematcher
|
||||
|
||||
Adapted from FlashText: https://github.com/vi3k6i5/flashtext
|
||||
MIT License (see `LICENSE`)
|
||||
|
@ -33,7 +34,7 @@ cdef class PhraseMatcher:
|
|||
attr (int / str): Token attribute to match on.
|
||||
validate (bool): Perform additional validation when patterns are added.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#init
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self._callbacks = {}
|
||||
|
@ -60,7 +61,7 @@ cdef class PhraseMatcher:
|
|||
|
||||
RETURNS (int): The number of rules.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#len
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher#len
|
||||
"""
|
||||
return len(self._callbacks)
|
||||
|
||||
|
@ -70,7 +71,7 @@ cdef class PhraseMatcher:
|
|||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#contains
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher#contains
|
||||
"""
|
||||
return key in self._callbacks
|
||||
|
||||
|
@ -84,7 +85,7 @@ cdef class PhraseMatcher:
|
|||
|
||||
key (str): The match ID.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#remove
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher#remove
|
||||
"""
|
||||
if key not in self._docs:
|
||||
raise KeyError(key)
|
||||
|
@ -163,7 +164,7 @@ cdef class PhraseMatcher:
|
|||
as variable arguments. Will be ignored if a list of patterns is
|
||||
provided as the second argument.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#add
|
||||
DOCS: https://nightly.spacy.io/api/phrasematcher#add
|
||||
"""
|
||||
if docs is None or hasattr(docs, "__call__"): # old API
|
||||
on_match = docs
|
||||
|
@@ -216,15 +217,18 @@ cdef class PhraseMatcher:
result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

def __call__(self, doc):
def __call__(self, doc, *, as_spans=False):
"""Find all sequences matching the supplied patterns on the `Doc`.

doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
as_spans (bool): Return Span objects with labels instead of (match_id,
start, end) tuples.
RETURNS (list): A list of `(match_id, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
to True, a list of Span objects is returned.

DOCS: https://spacy.io/api/phrasematcher#call
DOCS: https://nightly.spacy.io/api/phrasematcher#call
"""
matches = []
if doc is None or len(doc) == 0:

@@ -239,7 +243,10 @@ cdef class PhraseMatcher:
on_match = self._callbacks.get(self.vocab.strings[ent_id])
if on_match is not None:
on_match(self, doc, i, matches)
return matches
if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in matches]
else:
return matches

cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
cdef MapStruct* current_node = self.c_map
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
idx += 1

def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn.

docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.

DOCS: https://spacy.io/api/phrasematcher#pipe
"""Match a stream of documents, yielding them in turn. Deprecated as of
spaCy v3.0.
"""
warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
if as_tuples:
for doc, context in stream:
matches = self(doc)
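Both Matcher.pipe and PhraseMatcher.pipe now only emit deprecation warning W105. A sketch of the replacement the warning suggests, assuming nlp, matcher and texts already exist in the calling code:

# Stream the texts with nlp.pipe and call the matcher on each Doc
for doc in nlp.pipe(texts):
    matches = matcher(doc)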
@@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
return model


@registry.assets.register("spacy.KBFromFile.v1")
@registry.misc.register("spacy.KBFromFile.v1")
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
def kb_from_file(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)

@@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
return kb_from_file


@registry.assets.register("spacy.EmptyKB.v1")
@registry.misc.register("spacy.EmptyKB.v1")
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
def empty_kb_factory(vocab):
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)

@@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
return empty_kb_factory


@registry.assets.register("spacy.CandidateGenerator.v1")
@registry.misc.register("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
return get_candidates
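These hunks move the knowledge-base helpers from registry.assets to registry.misc, matching the "@misc" references in the entity_linker default config below. A sketch of how a training config would refer to them after the rename (the exact section path depends on your own config layout):

[components.entity_linker.kb_loader]
@misc = "spacy.EmptyKB.v1"
entity_vector_length = 64

[components.entity_linker.get_candidates]
@misc = "spacy.CandidateGenerator.v1"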
@ -38,7 +38,7 @@ class AttributeRuler(Pipe):
|
|||
"""Set token-level attributes for tokens matched by Matcher patterns.
|
||||
Additionally supports importing patterns from tag maps and morph rules.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -59,7 +59,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#init
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#init
|
||||
"""
|
||||
self.name = name
|
||||
self.vocab = vocab
|
||||
|
@ -77,7 +77,7 @@ class AttributeRuler(Pipe):
|
|||
doc (Doc): The document to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#call
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
||||
"""
|
||||
matches = sorted(self.matcher(doc))
|
||||
|
||||
|
@ -121,7 +121,7 @@ class AttributeRuler(Pipe):
|
|||
tag_map (dict): The tag map that maps fine-grained tags to
|
||||
coarse-grained tags and morphological features.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
|
||||
"""
|
||||
for tag, attrs in tag_map.items():
|
||||
pattern = [{"TAG": tag}]
|
||||
|
@ -139,7 +139,7 @@ class AttributeRuler(Pipe):
|
|||
fine-grained tags to coarse-grained tags, lemmas and morphological
|
||||
features.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
|
||||
"""
|
||||
for tag in morph_rules:
|
||||
for word in morph_rules[tag]:
|
||||
|
@ -163,7 +163,7 @@ class AttributeRuler(Pipe):
|
|||
index (int): The index of the token in the matched span to modify. May
|
||||
be negative to index from the end of the span. Defaults to 0.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#add
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#add
|
||||
"""
|
||||
self.matcher.add(len(self.attrs), patterns)
|
||||
self._attrs_unnormed.append(attrs)
|
||||
|
@ -178,7 +178,7 @@ class AttributeRuler(Pipe):
|
|||
as the arguments to AttributeRuler.add (patterns/attrs/index) to
|
||||
add as patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#add_patterns
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
|
||||
"""
|
||||
for p in pattern_dicts:
|
||||
self.add(**p)
|
||||
|
@ -203,7 +203,7 @@ class AttributeRuler(Pipe):
|
|||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
||||
and "lemma" for the target token attributes.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
DOCS: https://nightly.spacy.io/api/tagger#score
|
||||
"""
|
||||
validate_examples(examples, "AttributeRuler.score")
|
||||
results = {}
|
||||
|
@ -227,7 +227,7 @@ class AttributeRuler(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
|
@ -243,7 +243,7 @@ class AttributeRuler(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
returns (AttributeRuler): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/attributeruler#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#from_bytes
|
||||
"""
|
||||
|
||||
def load_patterns(b):
|
||||
|
@ -264,7 +264,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
DOCS: https://spacy.io/api/attributeruler#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
|
@ -279,7 +279,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
DOCS: https://spacy.io/api/attributeruler#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
|
||||
"""
|
||||
|
||||
def load_patterns(p):
|
||||
|
|
|
@ -105,7 +105,7 @@ def make_parser(
|
|||
cdef class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
DOCS: https://nightly.spacy.io/api/dependencyparser
|
||||
"""
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
|
@ -146,7 +146,7 @@ cdef class DependencyParser(Parser):
|
|||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||
and Scorer.score_deps.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
DOCS: https://nightly.spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
validate_examples(examples, "DependencyParser.score")
|
||||
def dep_getter(token, attr):
|
||||
|
|
|
@@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"incl_prior": True,
"incl_context": True,
"get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
},
)
def make_entity_linker(
|
@ -83,7 +83,7 @@ def make_entity_linker(
|
|||
class EntityLinker(Pipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker
|
||||
"""
|
||||
|
||||
NIL = "NIL" # string used to refer to a non-existing link
|
||||
|
@ -111,7 +111,7 @@ class EntityLinker(Pipe):
|
|||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -151,7 +151,7 @@ class EntityLinker(Pipe):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
|
||||
"""
|
||||
self.require_kb()
|
||||
nO = self.kb.entity_vector_length
|
||||
|
@ -182,7 +182,7 @@ class EntityLinker(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#update
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#update
|
||||
"""
|
||||
self.require_kb()
|
||||
if losses is None:
|
||||
|
@ -264,7 +264,7 @@ class EntityLinker(Pipe):
|
|||
doc (Doc): The document to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#call
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#call
|
||||
"""
|
||||
kb_ids = self.predict([doc])
|
||||
self.set_annotations([doc], kb_ids)
|
||||
|
@ -279,7 +279,7 @@ class EntityLinker(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#pipe
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#pipe
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
kb_ids = self.predict(docs)
|
||||
|
@ -294,7 +294,7 @@ class EntityLinker(Pipe):
|
|||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS (List[int]): The models prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#predict
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#predict
|
||||
"""
|
||||
self.require_kb()
|
||||
entity_count = 0
|
||||
|
@ -391,7 +391,7 @@ class EntityLinker(Pipe):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
|
@ -412,7 +412,7 @@ class EntityLinker(Pipe):
|
|||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
|
@ -430,7 +430,7 @@ class EntityLinker(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#from_disk
|
||||
"""
|
||||
|
||||
def load_model(p):
|
||||
|
|
|
@ -53,8 +53,8 @@ class EntityRuler:
|
|||
purely rule-based entity recognition system. After initialization, the
|
||||
component is typically added to the pipeline using `nlp.add_pipe`.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler
|
||||
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
|
||||
DOCS: https://nightly.spacy.io/api/entityruler
|
||||
USAGE: https://nightly.spacy.io/usage/rule-based-matching#entityruler
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -88,7 +88,7 @@ class EntityRuler:
|
|||
added by the model, overwrite them by matches if necessary.
|
||||
ent_id_sep (str): Separator used internally for entity IDs.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#init
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#init
|
||||
"""
|
||||
self.nlp = nlp
|
||||
self.name = name
|
||||
|
@ -127,7 +127,7 @@ class EntityRuler:
|
|||
doc (Doc): The Doc object in the pipeline.
|
||||
RETURNS (Doc): The Doc with added entities, if available.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#call
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#call
|
||||
"""
|
||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||
matches = set(
|
||||
|
@ -165,7 +165,7 @@ class EntityRuler:
|
|||
|
||||
RETURNS (set): The string labels.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#labels
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#labels
|
||||
"""
|
||||
keys = set(self.token_patterns.keys())
|
||||
keys.update(self.phrase_patterns.keys())
|
||||
|
@ -185,7 +185,7 @@ class EntityRuler:
|
|||
|
||||
RETURNS (set): The string entity ids.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#ent_ids
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#ent_ids
|
||||
"""
|
||||
keys = set(self.token_patterns.keys())
|
||||
keys.update(self.phrase_patterns.keys())
|
||||
|
@ -203,7 +203,7 @@ class EntityRuler:
|
|||
|
||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#patterns
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#patterns
|
||||
"""
|
||||
all_patterns = []
|
||||
for label, patterns in self.token_patterns.items():
|
||||
|
@ -230,7 +230,7 @@ class EntityRuler:
|
|||
|
||||
patterns (list): The patterns to add.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#add_patterns
|
||||
"""
|
||||
|
||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||
|
@ -324,7 +324,7 @@ class EntityRuler:
|
|||
patterns_bytes (bytes): The bytestring to load.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#from_bytes
|
||||
"""
|
||||
cfg = srsly.msgpack_loads(patterns_bytes)
|
||||
self.clear()
|
||||
|
@ -346,7 +346,7 @@ class EntityRuler:
|
|||
|
||||
RETURNS (bytes): The serialized patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#to_bytes
|
||||
"""
|
||||
serial = {
|
||||
"overwrite": self.overwrite,
|
||||
|
@ -365,7 +365,7 @@ class EntityRuler:
|
|||
path (str / Path): The JSONL file to load.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
self.clear()
|
||||
|
@ -401,7 +401,7 @@ class EntityRuler:
|
|||
|
||||
path (str / Path): The JSONL file to save.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/entityruler#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
cfg = {
|
||||
|
|
|
@ -15,7 +15,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
|
|||
doc (Doc): The Doc object.
|
||||
RETURNS (Doc): The Doc object with merged noun chunks.
|
||||
|
||||
DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks
|
||||
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
|
||||
"""
|
||||
if not doc.is_parsed:
|
||||
return doc
|
||||
|
@ -37,7 +37,7 @@ def merge_entities(doc: Doc):
|
|||
doc (Doc): The Doc object.
|
||||
RETURNS (Doc): The Doc object with merged entities.
|
||||
|
||||
DOCS: https://spacy.io/api/pipeline-functions#merge_entities
|
||||
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_entities
|
||||
"""
|
||||
with doc.retokenize() as retokenizer:
|
||||
for ent in doc.ents:
|
||||
|
@ -54,7 +54,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
|
|||
label (str): The subtoken dependency label.
|
||||
RETURNS (Doc): The Doc object with merged subtokens.
|
||||
|
||||
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
|
||||
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_subtokens
|
||||
"""
|
||||
# TODO: make stateful component with "label" config
|
||||
merger = Matcher(doc.vocab)
|
||||
|
|
|
@ -43,7 +43,7 @@ class Lemmatizer(Pipe):
|
|||
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
|
||||
lookup tables.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
|
@ -54,7 +54,7 @@ class Lemmatizer(Pipe):
|
|||
mode (str): The lemmatizer mode.
|
||||
RETURNS (dict): The lookups configuration settings for this mode.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
"""
|
||||
if mode == "lookup":
|
||||
return {
|
||||
|
@ -80,7 +80,7 @@ class Lemmatizer(Pipe):
|
|||
lookups should be loaded.
|
||||
RETURNS (Lookups): The Lookups object.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
"""
|
||||
config = cls.get_lookups_config(mode)
|
||||
required_tables = config.get("required_tables", [])
|
||||
|
@ -123,7 +123,7 @@ class Lemmatizer(Pipe):
|
|||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||
`False`.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#init
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -152,7 +152,7 @@ class Lemmatizer(Pipe):
|
|||
doc (Doc): The Doc to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#call
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#call
|
||||
"""
|
||||
for token in doc:
|
||||
if self.overwrite or token.lemma == 0:
|
||||
|
@ -168,7 +168,7 @@ class Lemmatizer(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#pipe
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#pipe
|
||||
"""
|
||||
for doc in stream:
|
||||
doc = self(doc)
|
||||
|
@ -180,7 +180,7 @@ class Lemmatizer(Pipe):
|
|||
token (Token): The token to lemmatize.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#lookup_lemmatize
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
result = lookup_table.get(token.text, token.text)
|
||||
|
@ -194,7 +194,7 @@ class Lemmatizer(Pipe):
|
|||
token (Token): The token to lemmatize.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#rule_lemmatize
|
||||
"""
|
||||
cache_key = (token.orth, token.pos, token.morph)
|
||||
if cache_key in self.cache:
|
||||
|
@ -260,7 +260,7 @@ class Lemmatizer(Pipe):
|
|||
token (Token): The token.
|
||||
RETURNS (bool): Whether the token is a base form.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#is_base_form
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#is_base_form
|
||||
"""
|
||||
return False
|
||||
|
||||
|
@ -270,7 +270,7 @@ class Lemmatizer(Pipe):
|
|||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#score
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#score
|
||||
"""
|
||||
validate_examples(examples, "Lemmatizer.score")
|
||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||
|
@ -282,7 +282,7 @@ class Lemmatizer(Pipe):
|
|||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||
|
@ -297,7 +297,7 @@ class Lemmatizer(Pipe):
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||
|
@ -310,7 +310,7 @@ class Lemmatizer(Pipe):
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
|
@ -324,7 +324,7 @@ class Lemmatizer(Pipe):
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||
|
|
|
@ -79,7 +79,7 @@ class Morphologizer(Tagger):
|
|||
labels_morph (dict): Mapping of morph + POS tags to morph labels.
|
||||
labels_pos (dict): Mapping of morph + POS tags to POS tags.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#init
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -106,7 +106,7 @@ class Morphologizer(Tagger):
|
|||
label (str): The label to add.
|
||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#add_label
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#add_label
|
||||
"""
|
||||
if not isinstance(label, str):
|
||||
raise ValueError(Errors.E187)
|
||||
|
@ -139,7 +139,7 @@ class Morphologizer(Tagger):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
|
||||
"""
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
|
||||
|
@ -169,7 +169,7 @@ class Morphologizer(Tagger):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#set_annotations
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
@ -194,7 +194,7 @@ class Morphologizer(Tagger):
|
|||
scores: Scores representing the model's predictions.
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Morphologizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
|
@ -231,7 +231,7 @@ class Morphologizer(Tagger):
|
|||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#score
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#score
|
||||
"""
|
||||
validate_examples(examples, "Morphologizer.score")
|
||||
results = {}
|
||||
|
@ -247,7 +247,7 @@ class Morphologizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["model"] = self.model.to_bytes
|
||||
|
@ -262,7 +262,7 @@ class Morphologizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Morphologizer): The loaded Morphologizer.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
|
||||
"""
|
||||
def load_model(b):
|
||||
try:
|
||||
|
@ -284,7 +284,7 @@ class Morphologizer(Tagger):
|
|||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
|
@ -300,7 +300,7 @@ class Morphologizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Morphologizer): The modified Morphologizer object.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
|
||||
"""
|
||||
def load_model(p):
|
||||
with p.open("rb") as file_:
|
||||
|
|
|
@ -88,7 +88,7 @@ def make_ner(
|
|||
cdef class EntityRecognizer(Parser):
|
||||
"""Pipeline component for named entity recognition.
|
||||
|
||||
DOCS: https://spacy.io/api/entityrecognizer
|
||||
DOCS: https://nightly.spacy.io/api/entityrecognizer
|
||||
"""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
|
@ -119,7 +119,7 @@ cdef class EntityRecognizer(Parser):
|
|||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
|
||||
DOCS: https://spacy.io/api/entityrecognizer#score
|
||||
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
|
||||
"""
|
||||
validate_examples(examples, "EntityRecognizer.score")
|
||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||
|
|
|
@ -15,7 +15,7 @@ cdef class Pipe:
|
|||
from it and it defines the interface that components should follow to
|
||||
function as trainable components in a spaCy pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe
|
||||
DOCS: https://nightly.spacy.io/api/pipe
|
||||
"""
|
||||
def __init__(self, vocab, model, name, **cfg):
|
||||
"""Initialize a pipeline component.
|
||||
|
@ -25,7 +25,7 @@ cdef class Pipe:
|
|||
name (str): The component instance name.
|
||||
**cfg: Additional settings and config parameters.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#init
|
||||
DOCS: https://nightly.spacy.io/api/pipe#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -37,10 +37,10 @@ cdef class Pipe:
|
|||
and returned. This usually happens under the hood when the nlp object
|
||||
is called on a text and all components are applied to the Doc.
|
||||
|
||||
docs (Doc): The Doc to preocess.
|
||||
docs (Doc): The Doc to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#call
|
||||
DOCS: https://nightly.spacy.io/api/pipe#call
|
||||
"""
|
||||
scores = self.predict([doc])
|
||||
self.set_annotations([doc], scores)
|
||||
|
@ -55,7 +55,7 @@ cdef class Pipe:
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#pipe
|
||||
DOCS: https://nightly.spacy.io/api/pipe#pipe
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
scores = self.predict(docs)
|
||||
|
@ -69,7 +69,7 @@ cdef class Pipe:
|
|||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS: Vector representations for each token in the documents.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#predict
|
||||
DOCS: https://nightly.spacy.io/api/pipe#predict
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
|
||||
|
||||
|
@ -79,7 +79,7 @@ cdef class Pipe:
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
scores: The scores to assign.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/pipe#set_annotations
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
|
||||
|
||||
|
@ -96,7 +96,7 @@ cdef class Pipe:
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#update
|
||||
DOCS: https://nightly.spacy.io/api/pipe#update
|
||||
"""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
|
@ -132,7 +132,7 @@ cdef class Pipe:
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#rehearse
|
||||
DOCS: https://nightly.spacy.io/api/pipe#rehearse
|
||||
"""
|
||||
pass
|
||||
|
||||
|
@ -144,7 +144,7 @@ cdef class Pipe:
|
|||
scores: Scores representing the model's predictions.
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#get_loss
|
||||
DOCS: https://nightly.spacy.io/api/pipe#get_loss
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
|
||||
|
||||
|
@ -156,7 +156,7 @@ cdef class Pipe:
|
|||
label (str): The label to add.
|
||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#add_label
|
||||
DOCS: https://nightly.spacy.io/api/pipe#add_label
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||
|
||||
|
@ -165,7 +165,7 @@ cdef class Pipe:
|
|||
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#create_optimizer
|
||||
DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
|
||||
"""
|
||||
return util.create_default_optimizer()
|
||||
|
||||
|
@ -181,7 +181,7 @@ cdef class Pipe:
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/pipe#begin_training
|
||||
"""
|
||||
self.model.initialize()
|
||||
if sgd is None:
|
||||
|
@ -200,7 +200,7 @@ cdef class Pipe:
|
|||
|
||||
params (dict): The parameter values to use in the model.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#use_params
|
||||
DOCS: https://nightly.spacy.io/api/pipe#use_params
|
||||
"""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
@ -211,7 +211,7 @@ cdef class Pipe:
|
|||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#score
|
||||
DOCS: https://nightly.spacy.io/api/pipe#score
|
||||
"""
|
||||
return {}
|
||||
|
||||
|
@ -221,7 +221,7 @@ cdef class Pipe:
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/pipe#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
|
@ -236,7 +236,7 @@ cdef class Pipe:
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Pipe): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/pipe#from_bytes
|
||||
"""
|
||||
|
||||
def load_model(b):
|
||||
|
@ -259,7 +259,7 @@ cdef class Pipe:
|
|||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/pipe#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
|
@ -274,7 +274,7 @@ cdef class Pipe:
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Pipe): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/pipe#from_disk
|
||||
"""
|
||||
|
||||
def load_model(p):
|
||||
|
|
|
@ -29,7 +29,7 @@ def make_sentencizer(
|
|||
class Sentencizer(Pipe):
|
||||
"""Segment the Doc into sentences using a rule-based strategy.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer
|
||||
"""
|
||||
|
||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||
|
@ -51,7 +51,7 @@ class Sentencizer(Pipe):
|
|||
serialized with the nlp object.
|
||||
RETURNS (Sentencizer): The sentencizer component.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#init
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#init
|
||||
"""
|
||||
self.name = name
|
||||
if punct_chars:
|
||||
|
@ -68,7 +68,7 @@ class Sentencizer(Pipe):
|
|||
doc (Doc): The document to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#call
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#call
|
||||
"""
|
||||
start = 0
|
||||
seen_period = False
|
||||
|
@ -94,7 +94,7 @@ class Sentencizer(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#pipe
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#pipe
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
predictions = self.predict(docs)
|
||||
|
@ -157,7 +157,7 @@ class Sentencizer(Pipe):
|
|||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#score
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#score
|
||||
"""
|
||||
validate_examples(examples, "Sentencizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
|
@ -169,7 +169,7 @@ class Sentencizer(Pipe):
|
|||
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
|
||||
|
||||
|
@ -179,7 +179,7 @@ class Sentencizer(Pipe):
|
|||
bytes_data (bytes): The data to load.
|
||||
RETURNS (Sentencizer): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#from_bytes
|
||||
"""
|
||||
cfg = srsly.msgpack_loads(bytes_data)
|
||||
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
|
||||
|
@ -188,7 +188,7 @@ class Sentencizer(Pipe):
|
|||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Serialize the sentencizer to disk.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
path = path.with_suffix(".json")
|
||||
|
@ -198,7 +198,7 @@ class Sentencizer(Pipe):
|
|||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Load the sentencizer from disk.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#from_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
path = path.with_suffix(".json")
|
||||
|
|
|
@ -44,7 +44,7 @@ def make_senter(nlp: Language, name: str, model: Model):
|
|||
class SentenceRecognizer(Tagger):
|
||||
"""Pipeline component for sentence segmentation.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer
|
||||
"""
|
||||
def __init__(self, vocab, model, name="senter"):
|
||||
"""Initialize a sentence recognizer.
|
||||
|
@ -54,7 +54,7 @@ class SentenceRecognizer(Tagger):
|
|||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#init
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -76,7 +76,7 @@ class SentenceRecognizer(Tagger):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#set_annotations
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
@ -101,7 +101,7 @@ class SentenceRecognizer(Tagger):
|
|||
scores: Scores representing the model's predictions.
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "SentenceRecognizer.get_loss")
|
||||
labels = self.labels
|
||||
|
@ -135,7 +135,7 @@ class SentenceRecognizer(Tagger):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
|
||||
"""
|
||||
self.set_output(len(self.labels))
|
||||
self.model.initialize()
|
||||
|
@ -151,7 +151,7 @@ class SentenceRecognizer(Tagger):
|
|||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#score
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
|
||||
"""
|
||||
validate_examples(examples, "SentenceRecognizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
|
@ -164,7 +164,7 @@ class SentenceRecognizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["model"] = self.model.to_bytes
|
||||
|
@ -179,7 +179,7 @@ class SentenceRecognizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The loaded SentenceRecognizer.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
|
||||
"""
|
||||
def load_model(b):
|
||||
try:
|
||||
|
@ -201,7 +201,7 @@ class SentenceRecognizer(Tagger):
|
|||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
|
@ -217,7 +217,7 @@ class SentenceRecognizer(Tagger):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The modified SentenceRecognizer object.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
|
||||
"""
|
||||
def load_model(p):
|
||||
with p.open("rb") as file_:
|
||||
|
|
|
@ -78,7 +78,7 @@ class SimpleNER(Pipe):
|
|||
def add_label(self, label: str) -> None:
|
||||
"""Add a new label to the pipe.
|
||||
label (str): The label to add.
|
||||
DOCS: https://spacy.io/api/simplener#add_label
|
||||
DOCS: https://nightly.spacy.io/api/simplener#add_label
|
||||
"""
|
||||
if not isinstance(label, str):
|
||||
raise ValueError(Errors.E187)
|
||||
|
|
|
@ -58,7 +58,7 @@ def make_tagger(nlp: Language, name: str, model: Model):
|
|||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger
|
||||
DOCS: https://nightly.spacy.io/api/tagger
|
||||
"""
|
||||
def __init__(self, vocab, model, name="tagger", *, labels=None):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
@ -69,7 +69,7 @@ class Tagger(Pipe):
|
|||
losses during training.
|
||||
labels (List): The set of labels. Defaults to None.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
DOCS: https://nightly.spacy.io/api/tagger#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -86,7 +86,7 @@ class Tagger(Pipe):
|
|||
|
||||
RETURNS (Tuple[str]): The labels.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#labels
|
||||
DOCS: https://nightly.spacy.io/api/tagger#labels
|
||||
"""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
|
@ -96,7 +96,7 @@ class Tagger(Pipe):
|
|||
doc (Doc): The document to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#call
|
||||
DOCS: https://nightly.spacy.io/api/tagger#call
|
||||
"""
|
||||
tags = self.predict([doc])
|
||||
self.set_annotations([doc], tags)
|
||||
|
@ -111,7 +111,7 @@ class Tagger(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#pipe
|
||||
DOCS: https://nightly.spacy.io/api/tagger#pipe
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
tag_ids = self.predict(docs)
|
||||
|
@ -124,7 +124,7 @@ class Tagger(Pipe):
|
|||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS: The model's prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#predict
|
||||
DOCS: https://nightly.spacy.io/api/tagger#predict
|
||||
"""
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
|
@ -153,7 +153,7 @@ class Tagger(Pipe):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/tagger#set_annotations
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
@ -182,7 +182,7 @@ class Tagger(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#update
|
||||
DOCS: https://nightly.spacy.io/api/tagger#update
|
||||
"""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
|
@ -220,7 +220,7 @@ class Tagger(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#rehearse
|
||||
DOCS: https://nightly.spacy.io/api/tagger#rehearse
|
||||
"""
|
||||
validate_examples(examples, "Tagger.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
|
@ -247,7 +247,7 @@ class Tagger(Pipe):
|
|||
scores: Scores representing the model's predictions.
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
DOCS: https://nightly.spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
|
@ -269,7 +269,7 @@ class Tagger(Pipe):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/tagger#begin_training
|
||||
"""
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
|
||||
|
@ -285,9 +285,16 @@ class Tagger(Pipe):
|
|||
doc_sample.append(Doc(self.vocab, words=["hello"]))
|
||||
for tag in sorted(tags):
|
||||
self.add_label(tag)
|
||||
if len(self.labels) == 0:
|
||||
err = Errors.E1006.format(name="Tagger")
|
||||
raise ValueError(err)
|
||||
self.set_output(len(self.labels))
|
||||
if self.labels:
|
||||
self.model.initialize(X=doc_sample)
|
||||
if doc_sample:
|
||||
label_sample = [
|
||||
self.model.ops.alloc2f(len(doc), len(self.labels))
|
||||
for doc in doc_sample
|
||||
]
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
else:
|
||||
self.model.initialize()
|
||||
if sgd is None:
|
||||
|
@ -300,7 +307,7 @@ class Tagger(Pipe):
|
|||
label (str): The label to add.
|
||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#add_label
|
||||
DOCS: https://nightly.spacy.io/api/tagger#add_label
|
||||
"""
|
||||
if not isinstance(label, str):
|
||||
raise ValueError(Errors.E187)
|
||||
|
@ -317,7 +324,7 @@ class Tagger(Pipe):
|
|||
RETURNS (Dict[str, Any]): The scores, produced by
|
||||
Scorer.score_token_attr for the attributes "tag".
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
DOCS: https://nightly.spacy.io/api/tagger#score
|
||||
"""
|
||||
validate_examples(examples, "Tagger.score")
|
||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||
|
@ -328,7 +335,7 @@ class Tagger(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/tagger#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["model"] = self.model.to_bytes
|
||||
|
@ -343,7 +350,7 @@ class Tagger(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The loaded Tagger.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/tagger#from_bytes
|
||||
"""
|
||||
def load_model(b):
|
||||
try:
|
||||
|
@ -365,7 +372,7 @@ class Tagger(Pipe):
|
|||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/tagger#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
|
@ -381,7 +388,7 @@ class Tagger(Pipe):
|
|||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The modified Tagger object.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/tagger#from_disk
|
||||
"""
|
||||
def load_model(p):
|
||||
with p.open("rb") as file_:
|
||||
|
|
|
@ -92,7 +92,7 @@ def make_textcat(
|
|||
class TextCategorizer(Pipe):
|
||||
"""Pipeline component for text classification.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -111,7 +111,7 @@ class TextCategorizer(Pipe):
|
|||
losses during training.
|
||||
labels (Iterable[str]): The labels to use.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -124,7 +124,7 @@ class TextCategorizer(Pipe):
|
|||
def labels(self) -> Tuple[str]:
|
||||
"""RETURNS (Tuple[str]): The labels currently added to the component.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#labels
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#labels
|
||||
"""
|
||||
return tuple(self.cfg.setdefault("labels", []))
|
||||
|
||||
|
@ -146,7 +146,7 @@ class TextCategorizer(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#pipe
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#pipe
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
scores = self.predict(docs)
|
||||
|
@ -159,7 +159,7 @@ class TextCategorizer(Pipe):
|
|||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS: The model's prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#predict
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
|
||||
"""
|
||||
tensors = [doc.tensor for doc in docs]
|
||||
if not any(len(doc) for doc in docs):
|
||||
|
@ -177,7 +177,7 @@ class TextCategorizer(Pipe):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
scores: The scores to set, produced by TextCategorizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#set_annotations
|
||||
"""
|
||||
for i, doc in enumerate(docs):
|
||||
for j, label in enumerate(self.labels):
|
||||
|
@ -204,7 +204,7 @@ class TextCategorizer(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#update
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#update
|
||||
"""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
|
@ -245,7 +245,7 @@ class TextCategorizer(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#rehearse
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#rehearse
|
||||
"""
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -289,7 +289,7 @@ class TextCategorizer(Pipe):
|
|||
scores: Scores representing the model's predictions.
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#get_loss
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "TextCategorizer.get_loss")
|
||||
truths, not_missing = self._examples_to_truth(examples)
|
||||
|
@ -305,7 +305,7 @@ class TextCategorizer(Pipe):
|
|||
label (str): The label to add.
|
||||
RETURNS (int): 0 if label is already present, otherwise 1.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#add_label
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#add_label
|
||||
"""
|
||||
if not isinstance(label, str):
|
||||
raise ValueError(Errors.E187)
|
||||
|
@ -343,7 +343,7 @@ class TextCategorizer(Pipe):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
|
||||
"""
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
|
||||
|
@ -378,7 +378,7 @@ class TextCategorizer(Pipe):
|
|||
positive_label (str): Optional positive label.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#score
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#score
|
||||
"""
|
||||
validate_examples(examples, "TextCategorizer.score")
|
||||
return Scorer.score_cats(
|
||||
|
|
|
@ -56,7 +56,7 @@ class Tok2Vec(Pipe):
|
|||
a list of Doc objects as input, and output a list of 2d float arrays.
|
||||
name (str): The component instance name.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#init
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -88,10 +88,10 @@ class Tok2Vec(Pipe):
|
|||
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
|
||||
them to be used as features by downstream components.
|
||||
|
||||
docs (Doc): The Doc to preocess.
|
||||
docs (Doc): The Doc to process.
|
||||
RETURNS (Doc): The processed Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#call
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#call
|
||||
"""
|
||||
tokvecses = self.predict([doc])
|
||||
self.set_annotations([doc], tokvecses)
|
||||
|
@ -106,7 +106,7 @@ class Tok2Vec(Pipe):
|
|||
batch_size (int): The number of documents to buffer.
|
||||
YIELDS (Doc): Processed documents in order.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#pipe
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#pipe
|
||||
"""
|
||||
for docs in minibatch(stream, batch_size):
|
||||
docs = list(docs)
|
||||
|
@ -121,7 +121,7 @@ class Tok2Vec(Pipe):
|
|||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS: Vector representations for each token in the documents.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#predict
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#predict
|
||||
"""
|
||||
tokvecs = self.model.predict(docs)
|
||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||
|
@ -135,7 +135,7 @@ class Tok2Vec(Pipe):
|
|||
docs (Iterable[Doc]): The documents to modify.
|
||||
tokvecses: The tensors to set, produced by Tok2Vec.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#set_annotations
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#set_annotations
|
||||
"""
|
||||
for doc, tokvecs in zip(docs, tokvecses):
|
||||
assert tokvecs.shape[0] == len(doc)
|
||||
|
@ -162,7 +162,7 @@ class Tok2Vec(Pipe):
|
|||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#update
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#update
|
||||
"""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
|
@ -220,7 +220,7 @@ class Tok2Vec(Pipe):
|
|||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
||||
DOCS: https://spacy.io/api/tok2vec#begin_training
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
|
||||
"""
|
||||
docs = [Doc(self.vocab, words=["hello"])]
|
||||
self.model.initialize(X=docs)
|
||||
|
|
|
@ -6,6 +6,7 @@ from itertools import islice
|
|||
from libcpp.vector cimport vector
|
||||
from libc.string cimport memset
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate
|
||||
|
@ -275,22 +276,22 @@ cdef class Parser(Pipe):
|
|||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
model, backprop_tok2vec = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
||||
# Chop sequences into lengths of this many transitions, to make the
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length.
|
||||
# We used to randomize this, but it's not clear that actually helps?
|
||||
cut_size = self.cfg["update_with_oracle_cut_size"]
|
||||
states, golds, max_steps = self._init_gold_batch(
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=cut_size
|
||||
max_length=max_moves
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
max_steps = max([len(eg.x) for eg in examples])
|
||||
if not states:
|
||||
return losses
|
||||
all_states = list(states)
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
|
@ -303,6 +304,9 @@ cdef class Parser(Pipe):
|
|||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
if max_moves >= 1 and n_moves >= max_moves:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
if sgd not in (None, False):
|
||||
|
@ -498,7 +502,7 @@ cdef class Parser(Pipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_gold_batch(self, examples, min_length=5, max_length=500):
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
|
@ -511,8 +515,7 @@ cdef class Parser(Pipe):
|
|||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||
states = []
|
||||
golds = []
|
||||
kept = []
|
||||
max_length_seen = 0
|
||||
to_cut = []
|
||||
for state, eg in zip(all_states, examples):
|
||||
if self.moves.has_gold(eg) and not state.is_final():
|
||||
gold = self.moves.init_gold(state, eg)
|
||||
|
@ -522,30 +525,22 @@ cdef class Parser(Pipe):
|
|||
else:
|
||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||
state.copy(), gold)
|
||||
kept.append((eg, state, gold, oracle_actions))
|
||||
min_length = min(min_length, len(oracle_actions))
|
||||
max_length_seen = max(max_length, len(oracle_actions))
|
||||
if not kept:
|
||||
to_cut.append((eg, state, gold, oracle_actions))
|
||||
if not to_cut:
|
||||
return states, golds, 0
|
||||
max_length = max(min_length, min(max_length, max_length_seen))
|
||||
cdef int clas
|
||||
max_moves = 0
|
||||
for eg, state, gold, oracle_actions in kept:
|
||||
for eg, state, gold, oracle_actions in to_cut:
|
||||
for i in range(0, len(oracle_actions), max_length):
|
||||
start_state = state.copy()
|
||||
n_moves = 0
|
||||
for clas in oracle_actions[i:i+max_length]:
|
||||
action = self.moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.push_hist(action.clas)
|
||||
n_moves += 1
|
||||
if state.is_final():
|
||||
break
|
||||
max_moves = max(max_moves, n_moves)
|
||||
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
states.append(start_state)
|
||||
golds.append(gold)
|
||||
max_moves = max(max_moves, n_moves)
|
||||
if state.is_final():
|
||||
break
|
||||
return states, golds, max_moves
|
||||
return states, golds, max_length
|
||||
|
|
|
@ -85,7 +85,7 @@ class Scorer:
|
|||
) -> None:
|
||||
"""Initialize the Scorer.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#init
|
||||
DOCS: https://nightly.spacy.io/api/scorer#init
|
||||
"""
|
||||
self.nlp = nlp
|
||||
self.cfg = cfg
|
||||
|
@ -101,7 +101,7 @@ class Scorer:
|
|||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||
RETURNS (Dict): A dictionary of scores.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score
|
||||
"""
|
||||
scores = {}
|
||||
if hasattr(self.nlp.tokenizer, "score"):
|
||||
|
@ -121,7 +121,7 @@ class Scorer:
|
|||
RETURNS (Dict[str, float]): A dictionary containing the scores
|
||||
token_acc/p/r/f.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_tokenization
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
|
||||
"""
|
||||
acc_score = PRFScore()
|
||||
prf_score = PRFScore()
|
||||
|
@ -169,7 +169,7 @@ class Scorer:
|
|||
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
|
||||
under the key attr_acc.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_token_attr
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
|
||||
"""
|
||||
tag_score = PRFScore()
|
||||
for example in examples:
|
||||
|
@ -263,7 +263,7 @@ class Scorer:
|
|||
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
|
||||
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_spans
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_spans
|
||||
"""
|
||||
score = PRFScore()
|
||||
score_per_type = dict()
|
||||
|
@ -350,7 +350,7 @@ class Scorer:
|
|||
attr_f_per_type,
|
||||
attr_auc_per_type
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_cats
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_cats
|
||||
"""
|
||||
if threshold is None:
|
||||
threshold = 0.5 if multi_label else 0.0
|
||||
|
@ -467,7 +467,7 @@ class Scorer:
|
|||
RETURNS (Dict[str, Any]): A dictionary containing the scores:
|
||||
attr_uas, attr_las, and attr_las_per_type.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score_deps
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_deps
|
||||
"""
|
||||
unlabelled = PRFScore()
|
||||
labelled = PRFScore()
|
||||
|
|
|
@ -23,7 +23,6 @@ cdef class StringStore:
|
|||
cdef Pool mem
|
||||
|
||||
cdef vector[hash_t] keys
|
||||
cdef set[hash_t] hits
|
||||
cdef public PreshMap _map
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||
|
|
|
@ -91,7 +91,7 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
|||
cdef class StringStore:
|
||||
"""Look up strings by 64-bit hashes.
|
||||
|
||||
DOCS: https://spacy.io/api/stringstore
|
||||
DOCS: https://nightly.spacy.io/api/stringstore
|
||||
"""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""Create the StringStore.
|
||||
|
@ -127,7 +127,6 @@ cdef class StringStore:
|
|||
return SYMBOLS_BY_INT[string_or_id]
|
||||
else:
|
||||
key = string_or_id
|
||||
self.hits.insert(key)
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(Errors.E018.format(hash_value=string_or_id))
|
||||
|
@ -198,7 +197,6 @@ cdef class StringStore:
|
|||
if key < len(SYMBOLS_BY_INT):
|
||||
return True
|
||||
else:
|
||||
self.hits.insert(key)
|
||||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -210,7 +208,6 @@ cdef class StringStore:
|
|||
cdef hash_t key
|
||||
for i in range(self.keys.size()):
|
||||
key = self.keys[i]
|
||||
self.hits.insert(key)
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
yield decode_Utf8Str(utf8str)
|
||||
# TODO: Iterate OOV here?
|
||||
|
@ -269,41 +266,9 @@ cdef class StringStore:
|
|||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self.keys.clear()
|
||||
self.hits.clear()
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
|
||||
def _cleanup_stale_strings(self, excepted):
|
||||
"""
|
||||
excepted (list): Strings that should not be removed.
|
||||
RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
|
||||
"""
|
||||
if self.hits.size() == 0:
|
||||
# If we don't have any hits, just skip cleanup
|
||||
return
|
||||
|
||||
cdef vector[hash_t] tmp
|
||||
dropped_strings = []
|
||||
dropped_keys = []
|
||||
for i in range(self.keys.size()):
|
||||
key = self.keys[i]
|
||||
# Here we cannot use __getitem__ because it also set hit.
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
value = decode_Utf8Str(utf8str)
|
||||
if self.hits.count(key) != 0 or value in excepted:
|
||||
tmp.push_back(key)
|
||||
else:
|
||||
dropped_keys.append(key)
|
||||
dropped_strings.append(value)
|
||||
|
||||
self.keys.swap(tmp)
|
||||
strings = list(self)
|
||||
self._reset_and_load(strings)
|
||||
# Here we have the strings, but hits to them should be reset
|
||||
self.hits.clear()
|
||||
|
||||
return dropped_keys, dropped_strings
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef bytes byte_string = py_string.encode("utf8")
|
||||
|
@ -319,6 +284,5 @@ cdef class StringStore:
|
|||
return value
|
||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, value)
|
||||
self.hits.insert(key)
|
||||
self.keys.push_back(key)
|
||||
return value
|
||||
|
|
|
@ -317,7 +317,8 @@ def test_doc_from_array_morph(en_vocab):
|
|||
|
||||
|
||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
|
||||
en_texts_without_empty = [t for t in en_texts if len(t)]
|
||||
de_text = "Wie war die Frage?"
|
||||
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||
docs_idx = en_texts[0].index("docs")
|
||||
|
@ -338,14 +339,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
Doc.from_docs(en_docs + [de_doc])
|
||||
|
||||
m_doc = Doc.from_docs(en_docs)
|
||||
assert len(en_docs) == len(list(m_doc.sents))
|
||||
assert len(en_texts_without_empty) == len(list(m_doc.sents))
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
assert str(m_doc) == " ".join(en_texts)
|
||||
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||
p_token = m_doc[len(en_docs[0]) - 1]
|
||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
|
||||
assert m_doc[9].idx == think_idx
|
||||
with pytest.raises(AttributeError):
|
||||
# not callable, because it was not set via set_extension
|
||||
|
@ -353,14 +354,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
||||
assert len(en_docs) == len(list(m_doc.sents))
|
||||
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
|
||||
assert len(en_texts_without_empty) == len(list(m_doc.sents))
|
||||
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
|
||||
assert str(m_doc) == "".join(en_texts)
|
||||
p_token = m_doc[len(en_docs[0]) - 1]
|
||||
assert p_token.text == "." and not bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
|
||||
think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
|
||||
assert m_doc[9].idx == think_idx
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
||||
|
@ -369,12 +370,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert list(m_doc.sents)
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
# space delimiter considered, although spacy attribute was missing
|
||||
assert str(m_doc) == " ".join(en_texts)
|
||||
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||
p_token = m_doc[len(en_docs[0]) - 1]
|
||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
|
||||
assert m_doc[9].idx == think_idx
|
||||
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ LANGUAGES = ["el", "en", "fr", "nl"]
|
|||
|
||||
@pytest.mark.parametrize("lang", LANGUAGES)
|
||||
def test_lemmatizer_initialize(lang, capfd):
|
||||
@registry.assets("lemmatizer_init_lookups")
|
||||
@registry.misc("lemmatizer_init_lookups")
|
||||
def lemmatizer_init_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
|
@ -25,9 +25,7 @@ def test_lemmatizer_initialize(lang, capfd):
|
|||
|
||||
"""Test that languages can be initialized."""
|
||||
nlp = get_lang_class(lang)()
|
||||
nlp.add_pipe(
|
||||
"lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
|
||||
)
|
||||
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
|
||||
# Check for stray print statements (see #3342)
|
||||
doc = nlp("test") # noqa: F841
|
||||
captured = capfd.readouterr()
|
||||
|
|
|
@ -2,7 +2,8 @@ import pytest
|
|||
import re
|
||||
from mock import Mock
|
||||
from spacy.matcher import Matcher, DependencyMatcher
|
||||
from spacy.tokens import Doc, Token
|
||||
from spacy.tokens import Doc, Token, Span
|
||||
|
||||
from ..doc.test_underscore import clean_underscore # noqa: F401
|
||||
|
||||
|
||||
|
@ -469,3 +470,26 @@ def test_matcher_span(matcher):
|
|||
assert len(matcher(doc)) == 2
|
||||
assert len(matcher(span_js)) == 1
|
||||
assert len(matcher(span_java)) == 1
|
||||
|
||||
|
||||
def test_matcher_as_spans(matcher):
|
||||
"""Test the new as_spans=True API."""
|
||||
text = "JavaScript is good but Java is better"
|
||||
doc = Doc(matcher.vocab, words=text.split())
|
||||
matches = matcher(doc, as_spans=True)
|
||||
assert len(matches) == 2
|
||||
assert isinstance(matches[0], Span)
|
||||
assert matches[0].text == "JavaScript"
|
||||
assert matches[0].label_ == "JS"
|
||||
assert isinstance(matches[1], Span)
|
||||
assert matches[1].text == "Java"
|
||||
assert matches[1].label_ == "Java"
|
||||
|
||||
|
||||
def test_matcher_deprecated(matcher):
|
||||
doc = Doc(matcher.vocab, words=["hello", "world"])
|
||||
with pytest.warns(DeprecationWarning) as record:
|
||||
for _ in matcher.pipe([doc]):
|
||||
pass
|
||||
assert record.list
|
||||
assert "spaCy v3.0" in str(record.list[0].message)
|
||||
|
|
|
@ -2,7 +2,7 @@ import pytest
|
|||
import srsly
|
||||
from mock import Mock
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tokens import Doc, Span
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
|
@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
|
|||
# clunky way to vaguely check that callback is unpickled
|
||||
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
|
||||
assert isinstance(callbacks.get("TEST2"), Mock)
|
||||
|
||||
|
||||
def test_phrase_matcher_as_spans(en_vocab):
|
||||
"""Test the new as_spans=True API."""
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
|
||||
matcher.add("B", [Doc(en_vocab, words=["test"])])
|
||||
doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
|
||||
matches = matcher(doc, as_spans=True)
|
||||
assert len(matches) == 2
|
||||
assert isinstance(matches[0], Span)
|
||||
assert matches[0].text == "hello world"
|
||||
assert matches[0].label_ == "A"
|
||||
assert isinstance(matches[1], Span)
|
||||
assert matches[1].text == "test"
|
||||
assert matches[1].label_ == "B"
|
||||
|
||||
|
||||
def test_phrase_matcher_deprecated(en_vocab):
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
with pytest.warns(DeprecationWarning) as record:
|
||||
for _ in matcher.pipe([doc]):
|
||||
pass
|
||||
assert record.list
|
||||
assert "spaCy v3.0" in str(record.list[0].message)
|
||||
|
|
|
@ -31,7 +31,7 @@ def pattern_dicts():
|
|||
]
|
||||
|
||||
|
||||
@registry.assets("attribute_ruler_patterns")
|
||||
@registry.misc("attribute_ruler_patterns")
|
||||
def attribute_ruler_patterns():
|
||||
return [
|
||||
{
|
||||
|
@ -86,7 +86,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
# initialize with patterns from asset
|
||||
nlp.add_pipe(
|
||||
"attribute_ruler",
|
||||
config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
|
||||
config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
|
||||
)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
|
|
|
@ -137,7 +137,7 @@ def test_kb_undefined(nlp):
|
|||
|
||||
def test_kb_empty(nlp):
|
||||
"""Test that the EL can't train with an empty KB"""
|
||||
config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
|
||||
config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
|
||||
entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -183,7 +183,7 @@ def test_el_pipe_configuration(nlp):
|
|||
ruler = nlp.add_pipe("entity_ruler")
|
||||
ruler.add_patterns([pattern])
|
||||
|
||||
@registry.assets.register("myAdamKB.v1")
|
||||
@registry.misc.register("myAdamKB.v1")
|
||||
def mykb() -> Callable[["Vocab"], KnowledgeBase]:
|
||||
def create_kb(vocab):
|
||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
||||
|
@ -199,7 +199,7 @@ def test_el_pipe_configuration(nlp):
|
|||
# run an EL pipe without a trained context encoder, to check the candidate generation step only
|
||||
nlp.add_pipe(
|
||||
"entity_linker",
|
||||
config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
|
||||
config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
|
||||
)
|
||||
# With the default get_candidates function, matching is case-sensitive
|
||||
text = "Douglas and douglas are not the same."
|
||||
|
@ -211,7 +211,7 @@ def test_el_pipe_configuration(nlp):
|
|||
def get_lowercased_candidates(kb, span):
|
||||
return kb.get_alias_candidates(span.text.lower())
|
||||
|
||||
@registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
|
||||
@registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
|
||||
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
||||
return get_lowercased_candidates
|
||||
|
||||
|
@ -220,9 +220,9 @@ def test_el_pipe_configuration(nlp):
|
|||
"entity_linker",
|
||||
"entity_linker",
|
||||
config={
|
||||
"kb_loader": {"@assets": "myAdamKB.v1"},
|
||||
"kb_loader": {"@misc": "myAdamKB.v1"},
|
||||
"incl_context": False,
|
||||
"get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
|
||||
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
|
||||
},
|
||||
)
|
||||
doc = nlp(text)
|
||||
|
@ -282,7 +282,7 @@ def test_append_invalid_alias(nlp):
|
|||
def test_preserving_links_asdoc(nlp):
|
||||
"""Test that Span.as_doc preserves the existing entity links"""
|
||||
|
||||
@registry.assets.register("myLocationsKB.v1")
|
||||
@registry.misc.register("myLocationsKB.v1")
|
||||
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
|
||||
def create_kb(vocab):
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=1)
|
||||
|
@ -304,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
|
|||
]
|
||||
ruler = nlp.add_pipe("entity_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
|
||||
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
||||
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||
el_pipe.begin_training(lambda: [])
|
||||
el_pipe.incl_context = False
|
||||
|
@ -387,7 +387,7 @@ def test_overfitting_IO():
|
|||
doc = nlp(text)
|
||||
train_examples.append(Example.from_dict(doc, annotation))
|
||||
|
||||
@registry.assets.register("myOverfittingKB.v1")
|
||||
@registry.misc.register("myOverfittingKB.v1")
|
||||
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
|
||||
def create_kb(vocab):
|
||||
# create artificial KB - assign same prior weight to the two russ cochran's
|
||||
|
@ -408,7 +408,7 @@ def test_overfitting_IO():
|
|||
# Create the Entity Linker component and add it to the pipeline
|
||||
nlp.add_pipe(
|
||||
"entity_linker",
|
||||
config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
|
||||
config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
|
||||
last=True,
|
||||
)
|
||||
|
||||
|
|
|
@@ -13,7 +13,7 @@ def nlp():

 @pytest.fixture
 def lemmatizer(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
         lookups.add_table("lemma_lookup", {"cope": "cope"})

@@ -23,13 +23,13 @@ def lemmatizer(nlp):
         return lookups

     lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
     )
     return lemmatizer


 def test_lemmatizer_init(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
         lookups.add_table("lemma_lookup", {"cope": "cope"})

@@ -39,7 +39,7 @@ def test_lemmatizer_init(nlp):
         return lookups

     lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
     )
     assert isinstance(lemmatizer.lookups, Lookups)
     assert lemmatizer.mode == "lookup"

@@ -51,14 +51,14 @@ def test_lemmatizer_init(nlp):

     nlp.remove_pipe("lemmatizer")

-    @registry.assets("empty_lookups")
+    @registry.misc("empty_lookups")
     def empty_lookups():
         return Lookups()

     with pytest.raises(ValueError):
         nlp.add_pipe(
             "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
+            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
         )


@@ -79,7 +79,7 @@ def test_lemmatizer_config(nlp, lemmatizer):


 def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
         lookups.add_table("lemma_lookup", {"cope": "cope"})

@@ -90,7 +90,7 @@ def test_lemmatizer_serialize(nlp, lemmatizer):

     nlp2 = English()
     lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
     )
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
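The lemmatizer fixtures are updated the same way: lookup tables are provided through `registry.misc` and resolved via `"@misc"` in the `lemmatizer` config. A rough sketch of that pattern, mirroring the fixture above (the registry name `"my_lookups"` is hypothetical, and the table contents are placeholders):

    from spacy.lang.en import English
    from spacy.lookups import Lookups
    from spacy.util import registry


    @registry.misc("my_lookups")
    def my_lookups():
        # Table the lookup lemmatizer expects; the entries are placeholders
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
        return lookups


    nlp = English()
    # The lookups are now resolved through "@misc" rather than "@assets"
    lemmatizer = nlp.add_pipe(
        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "my_lookups"}}
    )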
@@ -28,8 +28,6 @@ def test_tagger_begin_training_tag_map():

 TAGS = ("N", "V", "J")

-MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}
-
 TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
     ("Eat blue ham", {"tags": ["V", "J", "N"]}),

@@ -69,3 +67,10 @@ def test_overfitting_IO():
     assert doc2[1].tag_ is "V"
     assert doc2[2].tag_ is "J"
     assert doc2[3].tag_ is "N"
+
+
+def test_tagger_requires_labels():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    with pytest.raises(ValueError):
+        optimizer = nlp.begin_training()
@@ -84,9 +84,8 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
-    textcat = nlp.add_pipe("textcat")
     # Set exclusive labels
-    textcat.model.attrs["multi_label"] = False
+    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -103,9 +102,8 @@ def test_overfitting_IO():
     test_text = "I am happy."
     doc = nlp(test_text)
     cats = doc.cats
-    # note that by default, exclusive_classes = false so we need a bigger error margin
-    assert cats["POSITIVE"] > 0.8
-    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
+    assert cats["POSITIVE"] > 0.9
+    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)

     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:

@@ -113,8 +111,8 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         cats2 = doc2.cats
-        assert cats2["POSITIVE"] > 0.8
-        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
+        assert cats2["POSITIVE"] > 0.9
+        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)

     # Test scoring
     scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
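Besides the tighter thresholds, the textcat test now requests exclusive classes through the model config when the pipe is created, instead of mutating `textcat.model.attrs` afterwards. A hedged sketch of the new construction path, mirroring the change above:

    from spacy.lang.en import English

    nlp = English()
    # Exclusive classes are configured on the model at construction time,
    # rather than by setting textcat.model.attrs["multi_label"] after the fact.
    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")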
@@ -326,7 +326,8 @@ def test_issue4348():
     nlp = English()
     example = Example.from_dict(nlp.make_doc(""), {"tags": []})
     TRAIN_DATA = [example, example]
-    nlp.add_pipe("tagger")
+    tagger = nlp.add_pipe("tagger")
+    tagger.add_label("A")
     optimizer = nlp.begin_training()
     for i in range(5):
         losses = {}
@@ -63,6 +63,7 @@ def tagger():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
+    tagger.add_label("A")
     tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
     return tagger


@@ -70,7 +71,7 @@ def tagger():
 def entity_linker():
     nlp = Language()

-    @registry.assets.register("TestIssue5230KB.v1")
+    @registry.misc.register("TestIssue5230KB.v1")
     def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
         def create_kb(vocab):
             kb = KnowledgeBase(vocab, entity_vector_length=1)

@@ -79,7 +80,7 @@ def entity_linker():

         return create_kb

-    config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
+    config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
@@ -28,7 +28,7 @@ path = ${paths.train}
 path = ${paths.dev}

 [training.batcher]
-@batchers = "batch_by_words.v1"
+@batchers = "spacy.batch_by_words.v1"
 size = 666

 [nlp]

@@ -144,6 +144,7 @@ def test_serialize_nlp():
     """ Create a custom nlp pipeline from config and ensure it serializes it correctly """
     nlp_config = Config().from_str(nlp_config_string)
     nlp, _ = load_model_from_config(nlp_config, auto_fill=True)
+    nlp.get_pipe("tagger").add_label("A")
     nlp.begin_training()
     assert "tok2vec" in nlp.pipe_names
     assert "tagger" in nlp.pipe_names
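The batcher rename in the test config (`batch_by_words.v1` to `spacy.batch_by_words.v1`) matches the namespaced name the function is registered under, so the same name can also be resolved directly from the registry. A small sketch, assuming the registry import used elsewhere in this commit:

    from spacy.util import registry

    # Only the namespaced name resolves; the bare "batch_by_words.v1" from
    # older configs would raise a registry error.
    create_batcher = registry.batchers.get("spacy.batch_by_words.v1")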
@@ -85,7 +85,7 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field

-    @registry.assets.register("spacy.CustomKB.v1")
+    @registry.misc.register("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
     ) -> Callable[["Vocab"], KnowledgeBase]:

@@ -101,7 +101,7 @@ def test_serialize_subclassed_kb():
     nlp = English()
     config = {
         "kb_loader": {
-            "@assets": "spacy.CustomKB.v1",
+            "@misc": "spacy.CustomKB.v1",
             "entity_vector_length": 342,
             "custom_field": 666,
         }
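Registered loaders can take their own arguments from the config block, as the subclassed-KB test does with `entity_vector_length` and `custom_field`. A reduced sketch of that pattern (the registry name `"myParamKB.v1"` is hypothetical):

    from typing import Callable
    from spacy.kb import KnowledgeBase
    from spacy.util import registry


    @registry.misc.register("myParamKB.v1")
    def custom_kb(entity_vector_length: int) -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            # The vector length comes from the config, not from code
            return KnowledgeBase(vocab, entity_vector_length=entity_vector_length)

        return create_kb


    # This dict mirrors the kb_loader block passed to the entity_linker above
    config = {
        "kb_loader": {
            "@misc": "myParamKB.v1",
            "entity_vector_length": 342,
        }
    }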
@@ -3,11 +3,18 @@ import pytest
 from spacy.ml.models.tok2vec import build_Tok2Vec_model
 from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed
 from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
+from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
+
+from spacy.gold import Example
+from spacy import util
+from spacy.lang.en import English
 from .util import get_batch

+from thinc.api import Config
+
+from numpy.testing import assert_equal


 def test_empty_doc():
     width = 128

@@ -41,7 +48,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
             also_use_static_vectors=False,
             also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)

@@ -74,3 +81,89 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
     assert len(vectors) == len(docs)
     assert vectors[0].shape == (len(docs[0]), width)
     backprop(vectors)
+
+
+def test_init_tok2vec():
+    # Simple test to initialize the default tok2vec
+    nlp = English()
+    tok2vec = nlp.add_pipe("tok2vec")
+    assert tok2vec.listeners == []
+    nlp.begin_training()
+
+
+cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
+
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v1"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v1"
+
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v1"
+    width = ${components.tok2vec.model.encode.width}
+    rows = 2000
+    also_embed_subwords = true
+    also_use_static_vectors = false

+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v1"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
+
+TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+def test_tok2vec_listener():
+    orig_config = Config().from_str(cfg_string)
+    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp.pipe_names == ["tok2vec", "tagger"]
+    tagger = nlp.get_pipe("tagger")
+    tok2vec = nlp.get_pipe("tok2vec")
+    tagger_tok2vec = tagger.model.get_ref("tok2vec")
+    assert isinstance(tok2vec, Tok2Vec)
+    assert isinstance(tagger_tok2vec, Tok2VecListener)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+        for tag in t[1]["tags"]:
+            tagger.add_label(tag)
+
+    # Check that the Tok2Vec component finds it listeners
+    assert tok2vec.listeners == []
+    optimizer = nlp.begin_training(lambda: train_examples)
+    assert tok2vec.listeners == [tagger_tok2vec]
+
+    for i in range(5):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    doc = nlp("Running the pipeline as a whole.")
+    doc_tensor = tagger_tok2vec.predict([doc])[0]
+    assert_equal(doc.tensor, doc_tensor)
+
+    # TODO: should this warn or error?
+    nlp.select_pipes(disable="tok2vec")
+    assert nlp.pipe_names == ["tagger"]
+    nlp("Running the pipeline with the Tok2Vec component disabled.")
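The new tok2vec tests exercise the listener pattern: a standalone `tok2vec` component has no listeners until a downstream component built on `spacy.Tok2VecListener.v1` (the tagger in `cfg_string` above) is initialized alongside it. A condensed sketch of the standalone case, mirroring `test_init_tok2vec`:

    from spacy.lang.en import English

    nlp = English()
    tok2vec = nlp.add_pipe("tok2vec")
    # No downstream component is wired up yet, so nothing is listening
    assert tok2vec.listeners == []
    nlp.begin_training()
    # In a config-defined pipeline like cfg_string above, begin_training is the
    # point where the tagger's Tok2VecListener gets registered on this component.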
@@ -105,7 +105,13 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens):
     assert doc[1].text == tokens[1]["orth"]


-@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])])
+@pytest.mark.parametrize(
+    "text,tokens",
+    [
+        ("lorem", [{"orth": "lo"}, {"orth": "re"}]),
+        ("lorem", [{"orth": "lo", "tag": "A"}, {"orth": "rem"}]),
+    ],
+)
 def test_tokenizer_validate_special_case(tokenizer, text, tokens):
     with pytest.raises(ValueError):
         tokenizer.add_special_case(text, tokens)
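The extra parametrized case above ties in with the tokenizer change further below: special-case tokens may set `ORTH` and `NORM`, and any other attribute (such as `"tag"`) is rejected with a `ValueError`. A small usage sketch:

    from spacy.attrs import NORM, ORTH
    from spacy.lang.en import English

    nlp = English()
    # ORTH plus NORM is accepted on special-case tokens...
    nlp.tokenizer.add_special_case("dont", [{ORTH: "do"}, {ORTH: "nt", NORM: "not"}])
    # ...but any other attribute now raises a ValueError, e.g.:
    # nlp.tokenizer.add_special_case("dont", [{"orth": "do", "tag": "A"}, {"orth": "nt"}])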
@@ -34,9 +34,9 @@ cdef class Tokenizer:
                                             vector[SpanC] &filtered)
     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
                                        object span_data)
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _try_specials(self, hash_t key, Doc tokens,
-                           int* has_special) except -1
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string,
@@ -17,7 +17,7 @@ from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME

 from .attrs import intify_attrs
-from .symbols import ORTH
+from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
 from .util import registry

@@ -31,7 +31,7 @@ cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment
     boundaries.

-    DOCS: https://spacy.io/api/tokenizer
+    DOCS: https://nightly.spacy.io/api/tokenizer
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,

@@ -54,7 +54,7 @@ cdef class Tokenizer:
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)

-        DOCS: https://spacy.io/api/tokenizer#init
+        DOCS: https://nightly.spacy.io/api/tokenizer#init
         """
         self.mem = Pool()
         self._cache = PreshMap()

@@ -147,7 +147,7 @@ cdef class Tokenizer:
         string (str): The string to tokenize.
         RETURNS (Doc): A container for linguistic annotations.

-        DOCS: https://spacy.io/api/tokenizer#call
+        DOCS: https://nightly.spacy.io/api/tokenizer#call
         """
         doc = self._tokenize_affixes(string, True)
         self._apply_special_cases(doc)

@@ -169,8 +169,6 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef int has_special = 0
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite

@@ -186,13 +184,7 @@ cdef class Tokenizer:
                 # we don't have to create the slice when we hit the cache.
                 span = string[start:i]
                 key = hash_string(span)
-                specials_hit = 0
-                cache_hit = 0
-                if with_special_cases:
-                    specials_hit = self._try_specials(key, doc, &has_special)
-                if not specials_hit:
-                    cache_hit = self._try_cache(key, doc)
-                if not specials_hit and not cache_hit:
+                if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                     self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
                     doc.c[doc.length - 1].spacy = True

@@ -204,13 +196,7 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            specials_hit = 0
-            cache_hit = 0
-            if with_special_cases:
-                specials_hit = self._try_specials(key, doc, &has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(key, doc)
-            if not specials_hit and not cache_hit:
+            if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

@@ -223,7 +209,7 @@ cdef class Tokenizer:
             Defaults to 1000.
         YIELDS (Doc): A sequence of Doc objects, in order.

-        DOCS: https://spacy.io/api/tokenizer#pipe
+        DOCS: https://nightly.spacy.io/api/tokenizer#pipe
         """
         for text in texts:
             yield self(text)

@@ -364,27 +350,33 @@ cdef class Tokenizer:
                 offset += span[3]
         return offset

-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
-        cached = <_Cached*>self._cache.get(key)
-        if cached == NULL:
-            return False
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
-        return True
-
-    cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
-        cached = <_Cached*>self._specials.get(key)
-        if cached == NULL:
+        if with_special_cases:
+            cached = <_Cached*>self._specials.get(key)
+            if cached == NULL:
+                specials_hit = False
+            else:
+                for i in range(cached.length):
+                    tokens.push_back(&cached.data.tokens[i], False)
+                has_special[0] = 1
+                specials_hit = True
+        if not specials_hit:
+            cached = <_Cached*>self._cache.get(key)
+            if cached == NULL:
+                cache_hit = False
+            else:
+                if cached.is_lex:
+                    for i in range(cached.length):
+                        tokens.push_back(cached.data.lexemes[i], False)
+                else:
+                    for i in range(cached.length):
+                        tokens.push_back(&cached.data.tokens[i], False)
+                cache_hit = True
+        if not specials_hit and not cache_hit:
             return False
-        cdef int i
-        for i in range(cached.length):
-            tokens.push_back(&cached.data.tokens[i], False)
-        has_special[0] = 1
         return True

     cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:

@@ -462,12 +454,7 @@ cdef class Tokenizer:
             for i in range(prefixes.size()):
                 tokens.push_back(prefixes[0][i], False)
         if string:
-            if with_special_cases:
-                specials_hit = self._try_specials(hash_string(string), tokens,
-                                                  has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(hash_string(string), tokens)
-            if specials_hit or cache_hit:
+            if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
             elif (self.token_match and self.token_match(string)) or \
                     (self.url_match and \

@@ -542,7 +529,7 @@ cdef class Tokenizer:
             and `.end()` methods, denoting the placement of internal segment
             separators, e.g. hyphens.

-        DOCS: https://spacy.io/api/tokenizer#find_infix
+        DOCS: https://nightly.spacy.io/api/tokenizer#find_infix
         """
         if self.infix_finditer is None:
             return 0

@@ -555,7 +542,7 @@ cdef class Tokenizer:
         string (str): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.

-        DOCS: https://spacy.io/api/tokenizer#find_prefix
+        DOCS: https://nightly.spacy.io/api/tokenizer#find_prefix
         """
         if self.prefix_search is None:
             return 0

@@ -569,7 +556,7 @@ cdef class Tokenizer:
         string (str): The string to segment.
         Returns (int): The length of the suffix if present, otherwise `None`.

-        DOCS: https://spacy.io/api/tokenizer#find_suffix
+        DOCS: https://nightly.spacy.io/api/tokenizer#find_suffix
         """
         if self.suffix_search is None:
             return 0

@@ -584,9 +571,11 @@ cdef class Tokenizer:
             self.add_special_case(chunk, substrings)

     def _validate_special_case(self, chunk, substrings):
-        """Check whether the `ORTH` fields match the string.
+        """Check whether the `ORTH` fields match the string. Check that
+        additional features beyond `ORTH` and `NORM` are not set by the
+        exception.

-        string (str): The string to specially tokenize.
+        chunk (str): The string to specially tokenize.
         substrings (iterable): A sequence of dicts, where each dict describes
             a token and its attributes.
         """

@@ -594,6 +583,10 @@ cdef class Tokenizer:
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
+        for substring in attrs:
+            for attr in substring:
+                if attr not in (ORTH, NORM):
+                    raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))

     def add_special_case(self, unicode string, substrings):
         """Add a special-case tokenization rule.

@@ -603,7 +596,7 @@ cdef class Tokenizer:
             a token and its attributes. The `ORTH` fields of the attributes
             must exactly match the string when they are concatenated.

-        DOCS: https://spacy.io/api/tokenizer#add_special_case
+        DOCS: https://nightly.spacy.io/api/tokenizer#add_special_case
         """
         self._validate_special_case(string, substrings)
         substrings = list(substrings)

@@ -642,7 +635,7 @@ cdef class Tokenizer:
         string (str): The string to tokenize.
         RETURNS (list): A list of (pattern_string, token_string) tuples

-        DOCS: https://spacy.io/api/tokenizer#explain
+        DOCS: https://nightly.spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
         suffix_search = self.suffix_search

@@ -723,7 +716,7 @@ cdef class Tokenizer:
             it doesn't exist.
         exclude (list): String names of serialization fields to exclude.

-        DOCS: https://spacy.io/api/tokenizer#to_disk
+        DOCS: https://nightly.spacy.io/api/tokenizer#to_disk
         """
         path = util.ensure_path(path)
         with path.open("wb") as file_:

@@ -737,7 +730,7 @@ cdef class Tokenizer:
         exclude (list): String names of serialization fields to exclude.
         RETURNS (Tokenizer): The modified `Tokenizer` object.

-        DOCS: https://spacy.io/api/tokenizer#from_disk
+        DOCS: https://nightly.spacy.io/api/tokenizer#from_disk
         """
         path = util.ensure_path(path)
         with path.open("rb") as file_:

@@ -751,7 +744,7 @@ cdef class Tokenizer:
         exclude (list): String names of serialization fields to exclude.
         RETURNS (bytes): The serialized form of the `Tokenizer` object.

-        DOCS: https://spacy.io/api/tokenizer#to_bytes
+        DOCS: https://nightly.spacy.io/api/tokenizer#to_bytes
         """
         serializers = {
             "vocab": lambda: self.vocab.to_bytes(),

@@ -771,7 +764,7 @@ cdef class Tokenizer:
         exclude (list): String names of serialization fields to exclude.
         RETURNS (Tokenizer): The `Tokenizer` object.

-        DOCS: https://spacy.io/api/tokenizer#from_bytes
+        DOCS: https://nightly.spacy.io/api/tokenizer#from_bytes
         """
         data = {}
         deserializers = {
@ -24,8 +24,8 @@ from ..strings import get_string_id
|
|||
cdef class Retokenizer:
|
||||
"""Helper class for doc.retokenize() context manager.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#retokenize
|
||||
USAGE: https://spacy.io/usage/linguistic-features#retokenization
|
||||
DOCS: https://nightly.spacy.io/api/doc#retokenize
|
||||
USAGE: https://nightly.spacy.io/usage/linguistic-features#retokenization
|
||||
"""
|
||||
cdef Doc doc
|
||||
cdef list merges
|
||||
|
@ -47,7 +47,7 @@ cdef class Retokenizer:
|
|||
span (Span): The span to merge.
|
||||
attrs (dict): Attributes to set on the merged token.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#retokenizer.merge
|
||||
DOCS: https://nightly.spacy.io/api/doc#retokenizer.merge
|
||||
"""
|
||||
if (span.start, span.end) in self._spans_to_merge:
|
||||
return
|
||||
|
@ -73,7 +73,7 @@ cdef class Retokenizer:
|
|||
attrs (dict): Attributes to set on all split tokens. Attribute names
|
||||
mapped to list of per-token attribute values.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#retokenizer.split
|
||||
DOCS: https://nightly.spacy.io/api/doc#retokenizer.split
|
||||
"""
|
||||
if ''.join(orths) != token.text:
|
||||
raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text))
|
||||
|
|
|
@ -61,7 +61,7 @@ class DocBin:
|
|||
store_user_data (bool): Whether to include the `Doc.user_data`.
|
||||
docs (Iterable[Doc]): Docs to add.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#init
|
||||
DOCS: https://nightly.spacy.io/api/docbin#init
|
||||
"""
|
||||
attrs = sorted([intify_attr(attr) for attr in attrs])
|
||||
self.version = "0.1"
|
||||
|
@ -86,7 +86,7 @@ class DocBin:
|
|||
|
||||
doc (Doc): The Doc object to add.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#add
|
||||
DOCS: https://nightly.spacy.io/api/docbin#add
|
||||
"""
|
||||
array = doc.to_array(self.attrs)
|
||||
if len(array.shape) == 1:
|
||||
|
@ -115,7 +115,7 @@ class DocBin:
|
|||
vocab (Vocab): The shared vocab.
|
||||
YIELDS (Doc): The Doc objects.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#get_docs
|
||||
DOCS: https://nightly.spacy.io/api/docbin#get_docs
|
||||
"""
|
||||
for string in self.strings:
|
||||
vocab[string]
|
||||
|
@ -141,7 +141,7 @@ class DocBin:
|
|||
|
||||
other (DocBin): The DocBin to merge into the current bin.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#merge
|
||||
DOCS: https://nightly.spacy.io/api/docbin#merge
|
||||
"""
|
||||
if self.attrs != other.attrs:
|
||||
raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
|
||||
|
@ -158,7 +158,7 @@ class DocBin:
|
|||
|
||||
RETURNS (bytes): The serialized DocBin.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/docbin#to_bytes
|
||||
"""
|
||||
for tokens in self.tokens:
|
||||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||
|
@ -185,7 +185,7 @@ class DocBin:
|
|||
bytes_data (bytes): The data to load from.
|
||||
RETURNS (DocBin): The loaded DocBin.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/docbin#from_bytes
|
||||
"""
|
||||
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
|
||||
self.attrs = msg["attrs"]
|
||||
|
@ -211,7 +211,7 @@ class DocBin:
|
|||
|
||||
path (str / Path): The file path.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/docbin#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
|
@ -223,7 +223,7 @@ class DocBin:
|
|||
path (str / Path): The file path.
|
||||
RETURNS (DocBin): The loaded DocBin.
|
||||
|
||||
DOCS: https://spacy.io/api/docbin#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/docbin#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
with path.open("rb") as file_:
|
||||
|
|
|
@ -104,7 +104,7 @@ cdef class Doc:
|
|||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False])
|
||||
|
||||
DOCS: https://spacy.io/api/doc
|
||||
DOCS: https://nightly.spacy.io/api/doc
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
|
@ -118,8 +118,8 @@ cdef class Doc:
|
|||
method (callable): Optional method for method extension.
|
||||
force (bool): Force overwriting existing attribute.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#set_extension
|
||||
USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
DOCS: https://nightly.spacy.io/api/doc#set_extension
|
||||
USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
"""
|
||||
if cls.has_extension(name) and not kwargs.get("force", False):
|
||||
raise ValueError(Errors.E090.format(name=name, obj="Doc"))
|
||||
|
@ -132,7 +132,7 @@ cdef class Doc:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#get_extension
|
||||
DOCS: https://nightly.spacy.io/api/doc#get_extension
|
||||
"""
|
||||
return Underscore.doc_extensions.get(name)
|
||||
|
||||
|
@ -143,7 +143,7 @@ cdef class Doc:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#has_extension
|
||||
DOCS: https://nightly.spacy.io/api/doc#has_extension
|
||||
"""
|
||||
return name in Underscore.doc_extensions
|
||||
|
||||
|
@ -155,7 +155,7 @@ cdef class Doc:
|
|||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#remove_extension
|
||||
DOCS: https://nightly.spacy.io/api/doc#remove_extension
|
||||
"""
|
||||
if not cls.has_extension(name):
|
||||
raise ValueError(Errors.E046.format(name=name))
|
||||
|
@ -173,7 +173,7 @@ cdef class Doc:
|
|||
it is not. If `None`, defaults to `[True]*len(words)`
|
||||
user_data (dict or None): Optional extra data to attach to the Doc.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#init
|
||||
DOCS: https://nightly.spacy.io/api/doc#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
size = max(20, (len(words) if words is not None else 0))
|
||||
|
@ -288,7 +288,7 @@ cdef class Doc:
|
|||
You can use negative indices and open-ended ranges, which have
|
||||
their normal Python semantics.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#getitem
|
||||
DOCS: https://nightly.spacy.io/api/doc#getitem
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
@ -305,7 +305,7 @@ cdef class Doc:
|
|||
than-Python speeds are required, you can instead access the annotations
|
||||
as a numpy array, or access the underlying C data directly from Cython.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#iter
|
||||
DOCS: https://nightly.spacy.io/api/doc#iter
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
|
@ -316,7 +316,7 @@ cdef class Doc:
|
|||
|
||||
RETURNS (int): The number of tokens in the document.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#len
|
||||
DOCS: https://nightly.spacy.io/api/doc#len
|
||||
"""
|
||||
return self.length
|
||||
|
||||
|
@ -349,7 +349,7 @@ cdef class Doc:
|
|||
the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#char_span
|
||||
DOCS: https://nightly.spacy.io/api/doc#char_span
|
||||
"""
|
||||
if not isinstance(label, int):
|
||||
label = self.vocab.strings.add(label)
|
||||
|
@ -374,7 +374,7 @@ cdef class Doc:
|
|||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#similarity
|
||||
DOCS: https://nightly.spacy.io/api/doc#similarity
|
||||
"""
|
||||
if "similarity" in self.user_hooks:
|
||||
return self.user_hooks["similarity"](self, other)
|
||||
|
@ -407,7 +407,7 @@ cdef class Doc:
|
|||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#has_vector
|
||||
DOCS: https://nightly.spacy.io/api/doc#has_vector
|
||||
"""
|
||||
if "has_vector" in self.user_hooks:
|
||||
return self.user_hooks["has_vector"](self)
|
||||
|
@ -425,7 +425,7 @@ cdef class Doc:
|
|||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the document's semantics.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#vector
|
||||
DOCS: https://nightly.spacy.io/api/doc#vector
|
||||
"""
|
||||
def __get__(self):
|
||||
if "vector" in self.user_hooks:
|
||||
|
@ -453,7 +453,7 @@ cdef class Doc:
|
|||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#vector_norm
|
||||
DOCS: https://nightly.spacy.io/api/doc#vector_norm
|
||||
"""
|
||||
def __get__(self):
|
||||
if "vector_norm" in self.user_hooks:
|
||||
|
@ -493,7 +493,7 @@ cdef class Doc:
|
|||
|
||||
RETURNS (tuple): Entities in the document, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#ents
|
||||
DOCS: https://nightly.spacy.io/api/doc#ents
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
|
@ -584,7 +584,7 @@ cdef class Doc:
|
|||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#noun_chunks
|
||||
DOCS: https://nightly.spacy.io/api/doc#noun_chunks
|
||||
"""
|
||||
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
|
@ -609,7 +609,7 @@ cdef class Doc:
|
|||
|
||||
YIELDS (Span): Sentences in the document.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#sents
|
||||
DOCS: https://nightly.spacy.io/api/doc#sents
|
||||
"""
|
||||
if not self.is_sentenced:
|
||||
raise ValueError(Errors.E030)
|
||||
|
@ -722,7 +722,7 @@ cdef class Doc:
|
|||
attr_id (int): The attribute ID to key the counts.
|
||||
RETURNS (dict): A dictionary mapping attributes to integer counts.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#count_by
|
||||
DOCS: https://nightly.spacy.io/api/doc#count_by
|
||||
"""
|
||||
cdef int i
|
||||
cdef attr_t attr
|
||||
|
@ -777,7 +777,7 @@ cdef class Doc:
|
|||
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values.
|
||||
RETURNS (Doc): Itself.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_array
|
||||
DOCS: https://nightly.spacy.io/api/doc#from_array
|
||||
"""
|
||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||
# See also #3064
|
||||
|
@ -872,7 +872,7 @@ cdef class Doc:
|
|||
attrs (list): Optional list of attribute ID ints or attribute name strings.
|
||||
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_docs
|
||||
DOCS: https://nightly.spacy.io/api/doc#from_docs
|
||||
"""
|
||||
if not docs:
|
||||
return None
|
||||
|
@@ -920,7 +920,9 @@ cdef class Doc:
                     warnings.warn(Warnings.W101.format(name=name))
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
-            char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
+            char_offset += len(doc.text)
+            if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
+                char_offset += 1

         arrays = [doc.to_array(attrs) for doc in docs]

@@ -932,7 +934,7 @@ cdef class Doc:
             token_offset = -1
             for doc in docs[:-1]:
                 token_offset += len(doc)
-                if not doc[-1].is_space:
+                if not (len(doc) > 0 and doc[-1].is_space):
                     concat_spaces[token_offset] = True

         concat_array = numpy.concatenate(arrays)
@ -951,7 +953,7 @@ cdef class Doc:
|
|||
RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
|
||||
(n, n), where n = len(self).
|
||||
|
||||
DOCS: https://spacy.io/api/doc#get_lca_matrix
|
||||
DOCS: https://nightly.spacy.io/api/doc#get_lca_matrix
|
||||
"""
|
||||
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
|
||||
|
||||
|
@ -985,7 +987,7 @@ cdef class Doc:
|
|||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/doc#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
|
@ -1000,7 +1002,7 @@ cdef class Doc:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): The modified `Doc` object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/doc#from_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with path.open("rb") as file_:
|
||||
|
@ -1014,7 +1016,7 @@ cdef class Doc:
|
|||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/doc#to_bytes
|
||||
"""
|
||||
return srsly.msgpack_dumps(self.to_dict(exclude=exclude))
|
||||
|
||||
|
@ -1025,7 +1027,7 @@ cdef class Doc:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): Itself.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/doc#from_bytes
|
||||
"""
|
||||
return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude)
|
||||
|
||||
|
@ -1036,7 +1038,7 @@ cdef class Doc:
|
|||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/doc#to_bytes
|
||||
"""
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
|
||||
if self.is_tagged:
|
||||
|
@ -1084,7 +1086,7 @@ cdef class Doc:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): Itself.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_dict
|
||||
DOCS: https://nightly.spacy.io/api/doc#from_dict
|
||||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError(Errors.E033.format(length=self.length))
|
||||
|
@ -1164,8 +1166,8 @@ cdef class Doc:
|
|||
retokenization are invalidated, although they may accidentally
|
||||
continue to work.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#retokenize
|
||||
USAGE: https://spacy.io/usage/linguistic-features#retokenization
|
||||
DOCS: https://nightly.spacy.io/api/doc#retokenize
|
||||
USAGE: https://nightly.spacy.io/usage/linguistic-features#retokenization
|
||||
"""
|
||||
return Retokenizer(self)
|
||||
|
||||
|
@ -1200,7 +1202,7 @@ cdef class Doc:
|
|||
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
|
||||
RETURNS (dict): The data in spaCy's JSON format.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#to_json
|
||||
DOCS: https://nightly.spacy.io/api/doc#to_json
|
||||
"""
|
||||
data = {"text": self.text}
|
||||
if self.is_nered:
|
||||
|
|
|
@ -27,7 +27,7 @@ from .underscore import Underscore, get_ext_args
|
|||
cdef class Span:
|
||||
"""A slice from a Doc object.
|
||||
|
||||
DOCS: https://spacy.io/api/span
|
||||
DOCS: https://nightly.spacy.io/api/span
|
||||
"""
|
||||
@classmethod
|
||||
def set_extension(cls, name, **kwargs):
|
||||
|
@ -40,8 +40,8 @@ cdef class Span:
|
|||
method (callable): Optional method for method extension.
|
||||
force (bool): Force overwriting existing attribute.
|
||||
|
||||
DOCS: https://spacy.io/api/span#set_extension
|
||||
USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
DOCS: https://nightly.spacy.io/api/span#set_extension
|
||||
USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
"""
|
||||
if cls.has_extension(name) and not kwargs.get("force", False):
|
||||
raise ValueError(Errors.E090.format(name=name, obj="Span"))
|
||||
|
@ -54,7 +54,7 @@ cdef class Span:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/span#get_extension
|
||||
DOCS: https://nightly.spacy.io/api/span#get_extension
|
||||
"""
|
||||
return Underscore.span_extensions.get(name)
|
||||
|
||||
|
@ -65,7 +65,7 @@ cdef class Span:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/span#has_extension
|
||||
DOCS: https://nightly.spacy.io/api/span#has_extension
|
||||
"""
|
||||
return name in Underscore.span_extensions
|
||||
|
||||
|
@ -77,7 +77,7 @@ cdef class Span:
|
|||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
DOCS: https://spacy.io/api/span#remove_extension
|
||||
DOCS: https://nightly.spacy.io/api/span#remove_extension
|
||||
"""
|
||||
if not cls.has_extension(name):
|
||||
raise ValueError(Errors.E046.format(name=name))
|
||||
|
@ -95,7 +95,7 @@ cdef class Span:
|
|||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
|
||||
of the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#init
|
||||
DOCS: https://nightly.spacy.io/api/span#init
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
|
||||
|
@ -151,7 +151,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (int): The number of tokens in the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#len
|
||||
DOCS: https://nightly.spacy.io/api/span#len
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if self.end < self.start:
|
||||
|
@ -168,7 +168,7 @@ cdef class Span:
|
|||
the span to get.
|
||||
RETURNS (Token or Span): The token at `span[i]`.
|
||||
|
||||
DOCS: https://spacy.io/api/span#getitem
|
||||
DOCS: https://nightly.spacy.io/api/span#getitem
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if isinstance(i, slice):
|
||||
|
@ -189,7 +189,7 @@ cdef class Span:
|
|||
|
||||
YIELDS (Token): A `Token` object.
|
||||
|
||||
DOCS: https://spacy.io/api/span#iter
|
||||
DOCS: https://nightly.spacy.io/api/span#iter
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
for i in range(self.start, self.end):
|
||||
|
@ -210,7 +210,7 @@ cdef class Span:
|
|||
copy_user_data (bool): Whether or not to copy the original doc's user data.
|
||||
RETURNS (Doc): The `Doc` copy of the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#as_doc
|
||||
DOCS: https://nightly.spacy.io/api/span#as_doc
|
||||
"""
|
||||
# TODO: make copy_user_data a keyword-only argument (Python 3 only)
|
||||
words = [t.text for t in self]
|
||||
|
@ -292,7 +292,7 @@ cdef class Span:
|
|||
RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
|
||||
(n, n), where n = len(self).
|
||||
|
||||
DOCS: https://spacy.io/api/span#get_lca_matrix
|
||||
DOCS: https://nightly.spacy.io/api/span#get_lca_matrix
|
||||
"""
|
||||
return numpy.asarray(_get_lca_matrix(self.doc, self.start, self.end))
|
||||
|
||||
|
@ -304,7 +304,7 @@ cdef class Span:
|
|||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
|
||||
DOCS: https://spacy.io/api/span#similarity
|
||||
DOCS: https://nightly.spacy.io/api/span#similarity
|
||||
"""
|
||||
if "similarity" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["similarity"](self, other)
|
||||
|
@ -400,7 +400,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (tuple): Entities in the span, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/span#ents
|
||||
DOCS: https://nightly.spacy.io/api/span#ents
|
||||
"""
|
||||
ents = []
|
||||
for ent in self.doc.ents:
|
||||
|
@ -415,7 +415,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
|
||||
DOCS: https://spacy.io/api/span#has_vector
|
||||
DOCS: https://nightly.spacy.io/api/span#has_vector
|
||||
"""
|
||||
if "has_vector" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["has_vector"](self)
|
||||
|
@ -434,7 +434,7 @@ cdef class Span:
|
|||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the span's semantics.
|
||||
|
||||
DOCS: https://spacy.io/api/span#vector
|
||||
DOCS: https://nightly.spacy.io/api/span#vector
|
||||
"""
|
||||
if "vector" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["vector"](self)
|
||||
|
@ -448,7 +448,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
|
||||
DOCS: https://spacy.io/api/span#vector_norm
|
||||
DOCS: https://nightly.spacy.io/api/span#vector_norm
|
||||
"""
|
||||
if "vector_norm" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["vector"](self)
|
||||
|
@ -508,7 +508,7 @@ cdef class Span:
|
|||
|
||||
YIELDS (Span): Base noun-phrase `Span` objects.
|
||||
|
||||
DOCS: https://spacy.io/api/span#noun_chunks
|
||||
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
||||
"""
|
||||
if not self.doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
@ -533,7 +533,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (Token): The root token.
|
||||
|
||||
DOCS: https://spacy.io/api/span#root
|
||||
DOCS: https://nightly.spacy.io/api/span#root
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if "root" in self.doc.user_span_hooks:
|
||||
|
@ -590,7 +590,7 @@ cdef class Span:
|
|||
|
||||
RETURNS (tuple): A tuple of Token objects.
|
||||
|
||||
DOCS: https://spacy.io/api/span#lefts
|
||||
DOCS: https://nightly.spacy.io/api/span#lefts
|
||||
"""
|
||||
return self.root.conjuncts
|
||||
|
||||
|
@ -601,7 +601,7 @@ cdef class Span:
|
|||
|
||||
YIELDS (Token):A left-child of a token of the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#lefts
|
||||
DOCS: https://nightly.spacy.io/api/span#lefts
|
||||
"""
|
||||
for token in reversed(self): # Reverse, so we get tokens in order
|
||||
for left in token.lefts:
|
||||
|
@ -615,7 +615,7 @@ cdef class Span:
|
|||
|
||||
YIELDS (Token): A right-child of a token of the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#rights
|
||||
DOCS: https://nightly.spacy.io/api/span#rights
|
||||
"""
|
||||
for token in self:
|
||||
for right in token.rights:
|
||||
|
@ -630,7 +630,7 @@ cdef class Span:
|
|||
RETURNS (int): The number of leftward immediate children of the
|
||||
span, in the syntactic dependency parse.
|
||||
|
||||
DOCS: https://spacy.io/api/span#n_lefts
|
||||
DOCS: https://nightly.spacy.io/api/span#n_lefts
|
||||
"""
|
||||
return len(list(self.lefts))
|
||||
|
||||
|
@ -642,7 +642,7 @@ cdef class Span:
|
|||
RETURNS (int): The number of rightward immediate children of the
|
||||
span, in the syntactic dependency parse.
|
||||
|
||||
DOCS: https://spacy.io/api/span#n_rights
|
||||
DOCS: https://nightly.spacy.io/api/span#n_rights
|
||||
"""
|
||||
return len(list(self.rights))
|
||||
|
||||
|
@ -652,7 +652,7 @@ cdef class Span:
|
|||
|
||||
YIELDS (Token): A token within the span, or a descendant from it.
|
||||
|
||||
DOCS: https://spacy.io/api/span#subtree
|
||||
DOCS: https://nightly.spacy.io/api/span#subtree
|
||||
"""
|
||||
for word in self.lefts:
|
||||
yield from word.subtree
|
||||
|
|
|
@ -30,7 +30,7 @@ cdef class Token:
|
|||
"""An individual token – i.e. a word, punctuation symbol, whitespace,
|
||||
etc.
|
||||
|
||||
DOCS: https://spacy.io/api/token
|
||||
DOCS: https://nightly.spacy.io/api/token
|
||||
"""
|
||||
@classmethod
|
||||
def set_extension(cls, name, **kwargs):
|
||||
|
@ -43,8 +43,8 @@ cdef class Token:
|
|||
method (callable): Optional method for method extension.
|
||||
force (bool): Force overwriting existing attribute.
|
||||
|
||||
DOCS: https://spacy.io/api/token#set_extension
|
||||
USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
DOCS: https://nightly.spacy.io/api/token#set_extension
|
||||
USAGE: https://nightly.spacy.io/usage/processing-pipelines#custom-components-attributes
|
||||
"""
|
||||
if cls.has_extension(name) and not kwargs.get("force", False):
|
||||
raise ValueError(Errors.E090.format(name=name, obj="Token"))
|
||||
|
@ -57,7 +57,7 @@ cdef class Token:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/token#get_extension
|
||||
DOCS: https://nightly.spacy.io/api/token#get_extension
|
||||
"""
|
||||
return Underscore.token_extensions.get(name)
|
||||
|
||||
|
@ -68,7 +68,7 @@ cdef class Token:
|
|||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/token#has_extension
|
||||
DOCS: https://nightly.spacy.io/api/token#has_extension
|
||||
"""
|
||||
return name in Underscore.token_extensions
|
||||
|
||||
|
@ -80,7 +80,7 @@ cdef class Token:
|
|||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
DOCS: https://spacy.io/api/token#remove_extension
|
||||
DOCS: https://nightly.spacy.io/api/token#remove_extension
|
||||
"""
|
||||
if not cls.has_extension(name):
|
||||
raise ValueError(Errors.E046.format(name=name))
|
||||
|
@ -93,7 +93,7 @@ cdef class Token:
|
|||
doc (Doc): The parent document.
|
||||
offset (int): The index of the token within the document.
|
||||
|
||||
DOCS: https://spacy.io/api/token#init
|
||||
DOCS: https://nightly.spacy.io/api/token#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc = doc
|
||||
|
@ -108,7 +108,7 @@ cdef class Token:
|
|||
|
||||
RETURNS (int): The number of unicode characters in the token.
|
||||
|
||||
DOCS: https://spacy.io/api/token#len
|
||||
DOCS: https://nightly.spacy.io/api/token#len
|
||||
"""
|
||||
return self.c.lex.length
|
||||
|
||||
|
@ -171,7 +171,7 @@ cdef class Token:
|
|||
flag_id (int): The ID of the flag attribute.
|
||||
RETURNS (bool): Whether the flag is set.
|
||||
|
||||
DOCS: https://spacy.io/api/token#check_flag
|
||||
DOCS: https://nightly.spacy.io/api/token#check_flag
|
||||
"""
|
||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||
|
||||
|
@ -181,7 +181,7 @@ cdef class Token:
|
|||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
RETURNS (Token): The token at position `self.doc[self.i+i]`.
|
||||
|
||||
DOCS: https://spacy.io/api/token#nbor
|
||||
DOCS: https://nightly.spacy.io/api/token#nbor
|
||||
"""
|
||||
if self.i+i < 0 or (self.i+i >= len(self.doc)):
|
||||
raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
|
||||
|
@ -195,7 +195,7 @@ cdef class Token:
|
|||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
|
||||
DOCS: https://spacy.io/api/token#similarity
|
||||
DOCS: https://nightly.spacy.io/api/token#similarity
|
||||
"""
|
||||
if "similarity" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["similarity"](self, other)
|
||||
|
@ -373,7 +373,7 @@ cdef class Token:
|
|||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
|
||||
DOCS: https://spacy.io/api/token#has_vector
|
||||
DOCS: https://nightly.spacy.io/api/token#has_vector
|
||||
"""
|
||||
if "has_vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["has_vector"](self)
|
||||
|
@ -388,7 +388,7 @@ cdef class Token:
|
|||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the token's semantics.
|
||||
|
||||
DOCS: https://spacy.io/api/token#vector
|
||||
DOCS: https://nightly.spacy.io/api/token#vector
|
||||
"""
|
||||
if "vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["vector"](self)
|
||||
|
@ -403,7 +403,7 @@ cdef class Token:
|
|||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
|
||||
DOCS: https://spacy.io/api/token#vector_norm
|
||||
DOCS: https://nightly.spacy.io/api/token#vector_norm
|
||||
"""
|
||||
if "vector_norm" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["vector_norm"](self)
|
||||
|
@ -426,7 +426,7 @@ cdef class Token:
|
|||
RETURNS (int): The number of leftward immediate children of the
|
||||
word, in the syntactic dependency parse.
|
||||
|
||||
DOCS: https://spacy.io/api/token#n_lefts
|
||||
DOCS: https://nightly.spacy.io/api/token#n_lefts
|
||||
"""
|
||||
return self.c.l_kids
|
||||
|
||||
|
@ -438,7 +438,7 @@ cdef class Token:
|
|||
RETURNS (int): The number of rightward immediate children of the
|
||||
word, in the syntactic dependency parse.
|
||||
|
||||
DOCS: https://spacy.io/api/token#n_rights
|
||||
DOCS: https://nightly.spacy.io/api/token#n_rights
|
||||
"""
|
||||
return self.c.r_kids
|
||||
|
||||
|
@ -470,7 +470,7 @@ cdef class Token:
|
|||
RETURNS (bool / None): Whether the token starts a sentence.
|
||||
None if unknown.
|
||||
|
||||
DOCS: https://spacy.io/api/token#is_sent_start
|
||||
DOCS: https://nightly.spacy.io/api/token#is_sent_start
|
||||
"""
|
||||
def __get__(self):
|
||||
if self.c.sent_start == 0:
|
||||
|
@ -499,7 +499,7 @@ cdef class Token:
|
|||
RETURNS (bool / None): Whether the token ends a sentence.
|
||||
None if unknown.
|
||||
|
||||
DOCS: https://spacy.io/api/token#is_sent_end
|
||||
DOCS: https://nightly.spacy.io/api/token#is_sent_end
|
||||
"""
|
||||
def __get__(self):
|
||||
if self.i + 1 == len(self.doc):
|
||||
|
@ -521,7 +521,7 @@ cdef class Token:
|
|||
|
||||
YIELDS (Token): A left-child of the token.
|
||||
|
||||
DOCS: https://spacy.io/api/token#lefts
|
||||
DOCS: https://nightly.spacy.io/api/token#lefts
|
||||
"""
|
||||
cdef int nr_iter = 0
|
||||
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
|
||||
|
@ -541,7 +541,7 @@ cdef class Token:
|
|||
|
||||
YIELDS (Token): A right-child of the token.
|
||||
|
||||
DOCS: https://spacy.io/api/token#rights
|
||||
DOCS: https://nightly.spacy.io/api/token#rights
|
||||
"""
|
||||
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
||||
tokens = []
|
||||
|
@ -563,7 +563,7 @@ cdef class Token:
|
|||
|
||||
YIELDS (Token): A child token such that `child.head==self`.
|
||||
|
||||
DOCS: https://spacy.io/api/token#children
|
||||
DOCS: https://nightly.spacy.io/api/token#children
|
||||
"""
|
||||
yield from self.lefts
|
||||
yield from self.rights
|
||||
|
@ -576,7 +576,7 @@ cdef class Token:
|
|||
YIELDS (Token): A descendent token such that
|
||||
`self.is_ancestor(descendent) or token == self`.
|
||||
|
||||
DOCS: https://spacy.io/api/token#subtree
|
||||
DOCS: https://nightly.spacy.io/api/token#subtree
|
||||
"""
|
||||
for word in self.lefts:
|
||||
yield from word.subtree
|
||||
|
@ -607,7 +607,7 @@ cdef class Token:
|
|||
YIELDS (Token): A sequence of ancestor tokens such that
|
||||
`ancestor.is_ancestor(self)`.
|
||||
|
||||
DOCS: https://spacy.io/api/token#ancestors
|
||||
DOCS: https://nightly.spacy.io/api/token#ancestors
|
||||
"""
|
||||
cdef const TokenC* head_ptr = self.c
|
||||
# Guard against infinite loop, no token can have
|
||||
|
@ -625,7 +625,7 @@ cdef class Token:
|
|||
descendant (Token): Another token.
|
||||
RETURNS (bool): Whether this token is the ancestor of the descendant.
|
||||
|
||||
DOCS: https://spacy.io/api/token#is_ancestor
|
||||
DOCS: https://nightly.spacy.io/api/token#is_ancestor
|
||||
"""
|
||||
if self.doc is not descendant.doc:
|
||||
return False
|
||||
|
@ -729,7 +729,7 @@ cdef class Token:
|
|||
|
||||
RETURNS (tuple): The coordinated tokens.
|
||||
|
||||
DOCS: https://spacy.io/api/token#conjuncts
|
||||
DOCS: https://nightly.spacy.io/api/token#conjuncts
|
||||
"""
|
||||
cdef Token word, child
|
||||
if "conjuncts" in self.doc.user_token_hooks:
|
||||
|
|
|
@@ -76,7 +76,7 @@ class registry(thinc.registry):
     lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-    assets = catalogue.create("spacy", "assets", entry_points=True)
+    misc = catalogue.create("spacy", "misc", entry_points=True)
     # Callback functions used to manipulate nlp object etc.
     callbacks = catalogue.create("spacy", "callbacks")
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
@ -44,7 +44,7 @@ cdef class Vectors:
|
|||
the table need to be assigned - so len(list(vectors.keys())) may be
|
||||
greater or smaller than vectors.shape[0].
|
||||
|
||||
DOCS: https://spacy.io/api/vectors
|
||||
DOCS: https://nightly.spacy.io/api/vectors
|
||||
"""
|
||||
cdef public object name
|
||||
cdef public object data
|
||||
|
@ -59,7 +59,7 @@ cdef class Vectors:
|
|||
keys (iterable): A sequence of keys, aligned with the data.
|
||||
name (str): A name to identify the vectors table.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#init
|
||||
DOCS: https://nightly.spacy.io/api/vectors#init
|
||||
"""
|
||||
self.name = name
|
||||
if data is None:
|
||||
|
@ -83,7 +83,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (tuple): A `(rows, dims)` pair.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#shape
|
||||
DOCS: https://nightly.spacy.io/api/vectors#shape
|
||||
"""
|
||||
return self.data.shape
|
||||
|
||||
|
@ -93,7 +93,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (int): The vector size.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#size
|
||||
DOCS: https://nightly.spacy.io/api/vectors#size
|
||||
"""
|
||||
return self.data.shape[0] * self.data.shape[1]
|
||||
|
||||
|
@ -103,7 +103,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (bool): `True` if no slots are available for new keys.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#is_full
|
||||
DOCS: https://nightly.spacy.io/api/vectors#is_full
|
||||
"""
|
||||
return self._unset.size() == 0
|
||||
|
||||
|
@ -114,7 +114,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (int): The number of keys in the table.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#n_keys
|
||||
DOCS: https://nightly.spacy.io/api/vectors#n_keys
|
||||
"""
|
||||
return len(self.key2row)
|
||||
|
||||
|
@ -127,7 +127,7 @@ cdef class Vectors:
|
|||
key (int): The key to get the vector for.
|
||||
RETURNS (ndarray): The vector for the key.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#getitem
|
||||
DOCS: https://nightly.spacy.io/api/vectors#getitem
|
||||
"""
|
||||
i = self.key2row[key]
|
||||
if i is None:
|
||||
|
@ -141,7 +141,7 @@ cdef class Vectors:
|
|||
key (int): The key to set the vector for.
|
||||
vector (ndarray): The vector to set.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#setitem
|
||||
DOCS: https://nightly.spacy.io/api/vectors#setitem
|
||||
"""
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
@ -153,7 +153,7 @@ cdef class Vectors:
|
|||
|
||||
YIELDS (int): A key in the table.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#iter
|
||||
DOCS: https://nightly.spacy.io/api/vectors#iter
|
||||
"""
|
||||
yield from self.key2row
|
||||
|
||||
|
@ -162,7 +162,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (int): The number of vectors in the data.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#len
|
||||
DOCS: https://nightly.spacy.io/api/vectors#len
|
||||
"""
|
||||
return self.data.shape[0]
|
||||
|
||||
|
@ -172,7 +172,7 @@ cdef class Vectors:
|
|||
key (int): The key to check.
|
||||
RETURNS (bool): Whether the key has a vector entry.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#contains
|
||||
DOCS: https://nightly.spacy.io/api/vectors#contains
|
||||
"""
|
||||
return key in self.key2row
|
||||
|
||||
|
@ -189,7 +189,7 @@ cdef class Vectors:
|
|||
inplace (bool): Reallocate the memory.
|
||||
RETURNS (list): The removed items as a list of `(key, row)` tuples.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#resize
|
||||
DOCS: https://nightly.spacy.io/api/vectors#resize
|
||||
"""
|
||||
xp = get_array_module(self.data)
|
||||
if inplace:
|
||||
|
@ -224,7 +224,7 @@ cdef class Vectors:
|
|||
|
||||
YIELDS (ndarray): A vector in the table.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#values
|
||||
DOCS: https://nightly.spacy.io/api/vectors#values
|
||||
"""
|
||||
for row, vector in enumerate(range(self.data.shape[0])):
|
||||
if not self._unset.count(row):
|
||||
|
@ -235,7 +235,7 @@ cdef class Vectors:
|
|||
|
||||
YIELDS (tuple): A key/vector pair.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#items
|
||||
DOCS: https://nightly.spacy.io/api/vectors#items
|
||||
"""
|
||||
for key, row in self.key2row.items():
|
||||
yield key, self.data[row]
|
||||
|
@ -281,7 +281,7 @@ cdef class Vectors:
|
|||
row (int / None): The row number of a vector to map the key to.
|
||||
RETURNS (int): The row the vector was added to.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#add
|
||||
DOCS: https://nightly.spacy.io/api/vectors#add
|
||||
"""
|
||||
# use int for all keys and rows in key2row for more efficient access
|
||||
# and serialization
|
||||
|
@ -368,7 +368,7 @@ cdef class Vectors:
|
|||
path (str / Path): A path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/vectors#to_disk
|
||||
"""
|
||||
xp = get_array_module(self.data)
|
||||
if xp is numpy:
|
||||
|
@ -396,7 +396,7 @@ cdef class Vectors:
|
|||
path (str / Path): Directory path, string or Path-like object.
|
||||
RETURNS (Vectors): The modified object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#from_disk
|
||||
DOCS: https://nightly.spacy.io/api/vectors#from_disk
|
||||
"""
|
||||
def load_key2row(path):
|
||||
if path.exists():
|
||||
|
@ -432,7 +432,7 @@ cdef class Vectors:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vectors` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vectors#to_bytes
|
||||
"""
|
||||
def serialize_weights():
|
||||
if hasattr(self.data, "to_bytes"):
|
||||
|
@ -453,7 +453,7 @@ cdef class Vectors:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vectors): The `Vectors` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vectors#from_bytes
|
||||
"""
|
||||
def deserialize_weights(b):
|
||||
if hasattr(self.data, "from_bytes"):
|
||||
|
|
|
@ -54,7 +54,7 @@ cdef class Vocab:
|
|||
instance also provides access to the `StringStore`, and owns underlying
|
||||
C-data that is shared between `Doc` objects.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab
|
||||
DOCS: https://nightly.spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
|
@ -117,7 +117,7 @@ cdef class Vocab:
|
|||
available bit will be chosen.
|
||||
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#add_flag
|
||||
DOCS: https://nightly.spacy.io/api/vocab#add_flag
|
||||
"""
|
||||
if flag_id == -1:
|
||||
for bit in range(1, 64):
|
||||
|
@ -201,7 +201,7 @@ cdef class Vocab:
|
|||
string (unicode): The ID string.
|
||||
RETURNS (bool) Whether the string has an entry in the vocabulary.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#contains
|
||||
DOCS: https://nightly.spacy.io/api/vocab#contains
|
||||
"""
|
||||
cdef hash_t int_key
|
||||
if isinstance(key, bytes):
|
||||
|
@ -218,7 +218,7 @@ cdef class Vocab:
|
|||
|
||||
YIELDS (Lexeme): An entry in the vocabulary.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#iter
|
||||
DOCS: https://nightly.spacy.io/api/vocab#iter
|
||||
"""
|
||||
cdef attr_t key
|
||||
cdef size_t addr
|
||||
|
@ -241,7 +241,7 @@ cdef class Vocab:
|
|||
>>> apple = nlp.vocab.strings["apple"]
|
||||
>>> assert nlp.vocab[apple] == nlp.vocab[u"apple"]
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#getitem
|
||||
DOCS: https://nightly.spacy.io/api/vocab#getitem
|
||||
"""
|
||||
cdef attr_t orth
|
||||
if isinstance(id_or_string, unicode):
|
||||
|
@ -309,7 +309,7 @@ cdef class Vocab:
|
|||
word was mapped to, and `score` the similarity score between the
|
||||
two words.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#prune_vectors
|
||||
DOCS: https://nightly.spacy.io/api/vocab#prune_vectors
|
||||
"""
|
||||
xp = get_array_module(self.vectors.data)
|
||||
# Make prob negative so it sorts by rank ascending
|
||||
|
@ -349,7 +349,7 @@ cdef class Vocab:
|
|||
and shape determined by the `vocab.vectors` instance. Usually, a
|
||||
numpy ndarray of shape (300,) and dtype float32.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#get_vector
|
||||
DOCS: https://nightly.spacy.io/api/vocab#get_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
|
@ -396,7 +396,7 @@ cdef class Vocab:
|
|||
orth (int / unicode): The word.
|
||||
vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#set_vector
|
||||
DOCS: https://nightly.spacy.io/api/vocab#set_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
|
@ -418,7 +418,7 @@ cdef class Vocab:
|
|||
orth (int / unicode): The word.
|
||||
RETURNS (bool): Whether the word has a vector.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#has_vector
|
||||
DOCS: https://nightly.spacy.io/api/vocab#has_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
|
@ -431,7 +431,7 @@ cdef class Vocab:
|
|||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
|
@ -452,7 +452,7 @@ cdef class Vocab:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
getters = ["strings", "vectors"]
|
||||
|
@ -477,7 +477,7 @@ cdef class Vocab:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
|
||||
"""
|
||||
def deserialize_vectors():
|
||||
if self.vectors is None:
|
||||
|
@ -499,7 +499,7 @@ cdef class Vocab:
|
|||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
|
||||
DOCS: https://spacy.io/api/vocab#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
|
||||
"""
|
||||
def serialize_vectors(b):
|
||||
if self.vectors is None:
|
||||
|
|
|
@ -25,36 +25,6 @@ usage documentation on
|
|||
|
||||
## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}
|
||||
|
||||
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 4
|
||||
> embed_size = 2000
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
> ```
|
||||
|
||||
Build spaCy's "standard" embedding layer, which uses hash embedding with subword
|
||||
features and a CNN with layer-normalized maxout.
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.Tok2Vec.v1 {#Tok2Vec}
|
||||
|
||||
> #### Example config
|
||||
|
@ -72,7 +42,8 @@ features and a CNN with layer-normalized maxout.
|
|||
> # ...
|
||||
> ```
|
||||
|
||||
Construct a tok2vec model out of embedding and encoding subnetworks. See the
|
||||
Construct a tok2vec model out of two subnetworks: one for embedding and one for
|
||||
encoding. See the
|
||||
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
|
||||
blog post for background.
|
||||
|
||||
|
@ -82,6 +53,39 @@ blog post for background.
|
|||
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||
> pretrained_vectors = null
|
||||
> width = 96
|
||||
> depth = 4
|
||||
> embed_size = 2000
|
||||
> window_size = 1
|
||||
> maxout_pieces = 3
|
||||
> subword_features = true
|
||||
> ```
|
||||
|
||||
Build spaCy's "standard" tok2vec layer. This layer is defined by a
|
||||
[MultiHashEmbed](/api/architectures#MultiHashEmbed) embedding layer that uses
|
||||
subword features, and a
|
||||
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
|
||||
consisting of a CNN and a layer-normalized maxout activation function.
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.Tok2VecListener.v1 {#Tok2VecListener}
|
||||
|
||||
> #### Example config
|
||||
|
@ -118,11 +122,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
|
|||
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
|
||||
argument that connects to the shared `tok2vec` component in the pipeline.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
|
||||
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
|
||||
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
|
||||
|
||||
|
@ -316,18 +320,18 @@ for details and system requirements.
|
|||
> tokenizer_config = {"use_fast": true}
|
||||
>
|
||||
> [model.get_spans]
|
||||
> @span_getters = "strided_spans.v1"
|
||||
> @span_getters = "spacy-transformers.strided_spans.v1"
|
||||
> window = 128
|
||||
> stride = 96
|
||||
> ```
|
||||
|
||||
Load and wrap a transformer model from the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers) library. You
|
||||
can any transformer that has pretrained weights and a PyTorch implementation.
|
||||
The `name` variable is passed through to the underlying library, so it can be
|
||||
either a string or a path. If it's a string, the pretrained weights will be
|
||||
downloaded via the transformers library if they are not already available
|
||||
locally.
|
||||
can use any transformer that has pretrained weights and a PyTorch
|
||||
implementation. The `name` variable is passed through to the underlying library,
|
||||
so it can be either a string or a path. If it's a string, the pretrained weights
|
||||
will be downloaded via the transformers library if they are not already
|
||||
available locally.
|
||||
|
||||
In order to support longer documents, the
|
||||
[TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
|
||||
|
@ -346,13 +350,13 @@ in other components, see
|
|||
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
|
||||
### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
|
||||
### spacy-transformers.TransformerListener.v1 {#TransformerListener}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
> @architectures = "spacy-transformers.TransformerListener.v1"
|
||||
> grad_factor = 1.0
|
||||
>
|
||||
> [model.pooling]
|
||||
|
@ -669,11 +673,11 @@ into the "real world". This requires 3 main components:
|
|||
> subword_features = true
|
||||
>
|
||||
> [kb_loader]
|
||||
> @assets = "spacy.EmptyKB.v1"
|
||||
> @misc = "spacy.EmptyKB.v1"
|
||||
> entity_vector_length = 64
|
||||
>
|
||||
> [get_candidates]
|
||||
> @assets = "spacy.CandidateGenerator.v1"
|
||||
> @misc = "spacy.CandidateGenerator.v1"
|
||||
> ```
|
||||
|
||||
The `EntityLinker` model architecture is a Thinc `Model` with a
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
title: Command Line Interface
|
||||
teaser: Download, train and package models, and debug spaCy
|
||||
teaser: Download, train and package pipelines, and debug spaCy
|
||||
source: spacy/cli
|
||||
menu:
|
||||
- ['download', 'download']
|
||||
|
@ -17,45 +17,47 @@ menu:
|
|||
---
|
||||
|
||||
spaCy's CLI provides a range of helpful commands for downloading and training
|
||||
models, converting data and debugging your config, data and installation. For a
|
||||
list of available commands, you can type `python -m spacy --help`. You can also
|
||||
add the `--help` flag to any command or subcommand to see the description,
|
||||
pipelines, converting data and debugging your config, data and installation. For
|
||||
a list of available commands, you can type `python -m spacy --help`. You can
|
||||
also add the `--help` flag to any command or subcommand to see the description,
|
||||
available arguments and usage.
|
||||
|
||||
## download {#download tag="command"}
|
||||
|
||||
Download [models](/usage/models) for spaCy. The downloader finds the
|
||||
best-matching compatible version and uses `pip install` to download the model as
|
||||
a package. Direct downloads don't perform any compatibility checks and require
|
||||
the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
|
||||
Download [trained pipelines](/usage/models) for spaCy. The downloader finds the
|
||||
best-matching compatible version and uses `pip install` to download the Python
|
||||
package. Direct downloads don't perform any compatibility checks and require the
|
||||
pipeline name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
|
||||
|
||||
> #### Downloading best practices
|
||||
>
|
||||
> The `download` command is mostly intended as a convenient, interactive wrapper
|
||||
> – it performs compatibility checks and prints detailed messages in case things
|
||||
> go wrong. It's **not recommended** to use this command as part of an automated
|
||||
> process. If you know which model your project needs, you should consider a
|
||||
> [direct download via pip](/usage/models#download-pip), or uploading the model
|
||||
> to a local PyPi installation and fetching it straight from there. This will
|
||||
> also allow you to add it as a versioned package dependency to your project.
|
||||
> process. If you know which package your project needs, you should consider a
|
||||
> [direct download via pip](/usage/models#download-pip), or uploading the
|
||||
> package to a local PyPi installation and fetching it straight from there. This
|
||||
> will also allow you to add it as a versioned package dependency to your
|
||||
> project.
|
||||
|
||||
```cli
|
||||
$ python -m spacy download [model] [--direct] [pip_args]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
|
||||
| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The installed model package in your `site-packages` directory. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | Pipeline package name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ |
|
||||
| `--direct`, `-d` | Force direct download of exact package version. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| pip args <Tag variant="new">2.1</Tag> | Additional installation options to be passed to `pip install` when installing the pipeline package. For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The installed pipeline package in your `site-packages` directory. |
|
||||
|
||||
## info {#info tag="command"}
|
||||
|
||||
Print information about your spaCy installation, models and local setup, and
|
||||
generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to
|
||||
copy-paste into [GitHub issues](https://github.com/explosion/spaCy/issues).
|
||||
Print information about your spaCy installation, trained pipelines and local
|
||||
setup, and generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted
|
||||
markup to copy-paste into
|
||||
[GitHub issues](https://github.com/explosion/spaCy/issues).
|
||||
|
||||
```cli
|
||||
$ python -m spacy info [--markdown] [--silent]
|
||||
|
@ -65,41 +67,41 @@ $ python -m spacy info [--markdown] [--silent]
|
|||
$ python -m spacy info [model] [--markdown] [--silent]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | ------------------------------------------------------------------------------ |
|
||||
| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Information about your spaCy installation. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Information about your spaCy installation. |
|
||||
|
||||
## validate {#validate new="2" tag="command"}

Find all models installed in the current environment and check whether they are
compatible with the currently installed version of spaCy. Should be run after
upgrading spaCy via `pip install -U spacy` to ensure that all installed models
can be used with the new version. It will show a list of models and their
installed versions. If any model is out of date, the latest compatible versions
and command for updating are shown.
Find all trained pipeline packages installed in the current environment and
check whether they are compatible with the currently installed version of spaCy.
Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
all installed packages can be used with the new version. It will show a list
of packages and their installed versions. If any package is out of date, the
latest compatible versions and command for updating are shown.
|
||||
|
||||
> #### Automated validation
|
||||
>
|
||||
> You can also use the `validate` command as part of your build process or test
|
||||
> suite, to ensure all models are up to date before proceeding. If incompatible
|
||||
> models are found, it will return `1`.
|
||||
> suite, to ensure all packages are up to date before proceeding. If
|
||||
> incompatible packages are found, it will return `1`.
|
||||
|
||||
```cli
|
||||
$ python -m spacy validate
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------------------- |
|
||||
| **PRINTS** | Details about the compatibility of your installed models. |
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------------- |
|
||||
| **PRINTS** | Details about the compatibility of your installed pipeline packages. |
|
||||
|
||||
## init {#init new="3"}
|
||||
|
||||
The `spacy init` CLI includes helpful commands for initializing training config
|
||||
files and model directories.
|
||||
files and pipeline directories.
|
||||
|
||||
### init config {#init-config new="3" tag="command"}
|
||||
|
||||
|
@ -125,7 +127,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
|
|||
| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
|
||||
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
||||
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
||||
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
|
@ -165,36 +167,38 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
|
|||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Complete and auto-filled config file for training. |
|
||||
|
||||
### init model {#init-model new="2" tag="command"}
|
||||
### init vocab {#init-vocab new="3" tag="command"}
|
||||
|
||||
Create a new model directory from raw data, like word frequencies, Brown
|
||||
clusters and word vectors. Note that in order to populate the model's vocab, you
|
||||
Create a blank pipeline directory from raw data, like word frequencies, Brown
|
||||
clusters and word vectors. Note that in order to populate the vocabulary, you
|
||||
need to pass in a JSONL-formatted
|
||||
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
|
||||
`id` values that correspond to the vectors table. Just loading in vectors will
|
||||
not automatically populate the vocab.
|
||||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
<Infobox title="New in v3.0" variant="warning" id="init-model">
|
||||
|
||||
The `init-model` command is now available as a subcommand of `spacy init`.
|
||||
This command was previously called `init-model`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
```cli
|
||||
$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors]
|
||||
$ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
||||
| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
|
||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ |
|
||||
| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--meta-name`, `-mn` | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~ |
|
||||
| `--base`, `-b` | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy model containing the vocab and vectors. |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
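A rough sketch of checking the created directory from Python; the output path below is hypothetical and assumes `spacy init vocab en ./output --jsonl-loc ./vocab.jsonl` was run first:

```python
import spacy

nlp = spacy.load("./output")
print(len(nlp.vocab))           # lexemes populated from the JSONL vocabulary file
print(nlp.vocab.vectors.shape)  # shape of the vectors table, if vectors were provided
```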
|
||||
|
||||
## convert {#convert tag="command"}
|
||||
|
||||
|
@ -205,7 +209,7 @@ management functions. The converter can be specified on the command line, or
|
|||
chosen based on the file extension of the input file.
|
||||
|
||||
```cli
|
||||
$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
|
||||
$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--base] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -216,7 +220,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
|
|||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
|
||||
| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ |
|
||||
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~ |
|
||||
| `--model`, `-b` <Tag variant="new">2.2</Tag> | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str](option)~~ |
|
||||
| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
|
||||
| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ |
|
||||
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~ |
|
||||
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
|
||||
|
@ -267,7 +271,7 @@ training -> dropout field required
|
|||
training -> optimizer field required
|
||||
training -> optimize extra fields not permitted
|
||||
|
||||
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
|
||||
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
|
||||
|
||||
If your config contains missing values, you can run the 'init fill-config'
|
||||
command to fill in all the defaults, if possible:
|
||||
|
@ -357,7 +361,7 @@ Module spacy.gold.loggers
|
|||
File /path/to/spacy/gold/loggers.py (line 8)
|
||||
ℹ [training.batcher]
|
||||
Registry @batchers
|
||||
Name batch_by_words.v1
|
||||
Name spacy.batch_by_words.v1
|
||||
Module spacy.gold.batchers
|
||||
File /path/to/spacy/gold/batchers.py (line 49)
|
||||
ℹ [training.batcher.size]
|
||||
|
@ -594,11 +598,11 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
|
|||
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------- |
|
||||
| `model` | A loadable spaCy model. ~~str (positional)~~ |
|
||||
| `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
|
||||
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
||||
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Profiling information for the model. |
|
||||
| **PRINTS** | Profiling information for the pipeline. |
|
||||
|
||||
### debug model {#debug-model new="3" tag="command"}
|
||||
|
||||
|
@ -724,10 +728,10 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
|||
|
||||
## train {#train tag="command"}
|
||||
|
||||
Train a model. Expects data in spaCy's
|
||||
Train a pipeline. Expects data in spaCy's
|
||||
[binary format](/api/data-formats#training) and a
|
||||
[config file](/api/data-formats#config) with all settings and hyperparameters.
|
||||
Will save out the best model from all epochs, as well as the final model. The
|
||||
Will save out the best model from all epochs, as well as the final pipeline. The
|
||||
`--code` argument can be used to provide a Python file that's imported before
|
||||
the training process starts. This lets you register
|
||||
[custom functions](/usage/training#custom-functions) and architectures and refer
|
||||
|
@ -753,12 +757,12 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
|
|||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The final model and the best model. |
|
||||
| **CREATES** | The final trained pipeline and the best trained pipeline. |
|
||||
|
||||
## pretrain {#pretrain new="2.1" tag="command,experimental"}
|
||||
|
||||
|
@ -769,7 +773,7 @@ a component like a CNN, BiLSTM, etc to predict vectors which match the
|
|||
pretrained ones. The weights are saved to a directory after each epoch. You can
|
||||
then include a **path to one of these pretrained weights files** in your
|
||||
[training config](/usage/training#config) as the `init_tok2vec` setting when you
|
||||
train your model. This technique may be especially helpful if you have little
|
||||
train your pipeline. This technique may be especially helpful if you have little
|
||||
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
|
||||
for more info.
|
||||
|
||||
|
@ -792,7 +796,7 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re
|
|||
| Name | Description |
|
||||
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
|
@ -803,7 +807,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--re
|
|||
|
||||
## evaluate {#evaluate new="2" tag="command"}
|
||||
|
||||
Evaluate a model. Expects a loadable spaCy model and evaluation data in the
|
||||
Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
|
||||
path) and evaluation data in the
|
||||
[binary `.spacy` format](/api/data-formats#binary-training). The
|
||||
`--gold-preproc` option sets up the evaluation examples with gold-standard
|
||||
sentences and tokens for the predictions. Gold preprocessing helps the
|
||||
|
@ -819,7 +824,7 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
|||
|
||||
| Name | Description |
|
||||
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
|
@ -831,13 +836,12 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
|||
|
||||
## package {#package tag="command"}
|
||||
|
||||
Generate an installable
|
||||
[model Python package](/usage/training#models-generating) from an existing model
|
||||
data directory. All data files are copied over. If the path to a
|
||||
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
|
||||
the input directory, this file is used. Otherwise, the data can be entered
|
||||
directly from the command line. spaCy will then create a `.tar.gz` archive file
|
||||
that you can distribute and install with `pip install`.
|
||||
Generate an installable [Python package](/usage/training#models-generating) from
|
||||
an existing pipeline data directory. All data files are copied over. If the path
|
||||
to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is
|
||||
found in the input directory, this file is used. Otherwise, the data can be
|
||||
entered directly from the command line. spaCy will then create a `.tar.gz`
|
||||
archive file that you can distribute and install with `pip install`.
|
||||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
|
||||
|
@ -855,13 +859,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
|||
>
|
||||
> ```cli
|
||||
> $ python -m spacy package /input /output
|
||||
> $ cd /output/en_model-0.0.0
|
||||
> $ pip install dist/en_model-0.0.0.tar.gz
|
||||
> $ cd /output/en_pipeline-0.0.0
|
||||
> $ pip install dist/en_pipeline-0.0.0.tar.gz
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ |
|
||||
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
||||
|
@ -869,13 +873,13 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
|||
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
|
||||
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A Python package containing the spaCy model. |
|
||||
| **CREATES** | A Python package containing the spaCy pipeline. |
|
||||
|
||||
## project {#project new="3"}
|
||||
|
||||
The `spacy project` CLI includes subcommands for working with
|
||||
[spaCy projects](/usage/projects), end-to-end workflows for building and
|
||||
deploying custom spaCy models.
|
||||
deploying custom spaCy pipelines.
|
||||
|
||||
### project clone {#project-clone tag="command"}
|
||||
|
||||
|
@ -1015,9 +1019,9 @@ Download all files or directories listed as `outputs` for commands, unless they
|
|||
are not already present locally. When searching for files in the remote, `pull`
|
||||
won't just look at the output path, but will also consider the **command
|
||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||
previously pushed a model checkpoint to the remote, but now you've changed some
|
||||
previously pushed a checkpoint to the remote, but now you've changed some
|
||||
hyper-parameters. Because you've changed the inputs to the command, if you run
|
||||
`pull`, you won't retrieve the stale result. If you train your model and push
|
||||
`pull`, you won't retrieve the stale result. If you train your pipeline and push
|
||||
the outputs to the remote, the outputs will be saved alongside the prior
|
||||
outputs, so if you change the config back, you'll be able to fetch back the
|
||||
result.
|
||||
|
|
|
@ -6,18 +6,18 @@ menu:
|
|||
- ['Training Data', 'training']
|
||||
- ['Pretraining Data', 'pretraining']
|
||||
- ['Vocabulary', 'vocab-jsonl']
|
||||
- ['Model Meta', 'meta']
|
||||
- ['Pipeline Meta', 'meta']
|
||||
---
|
||||
|
||||
This section documents input and output formats of data used by spaCy, including
|
||||
the [training config](/usage/training#config), training data and lexical
|
||||
vocabulary data. For an overview of label schemes used by the models, see the
|
||||
[models directory](/models). Each model documents the label schemes used in its
|
||||
components, depending on the data it was trained on.
|
||||
[models directory](/models). Each trained pipeline documents the label schemes
|
||||
used in its components, depending on the data it was trained on.
|
||||
|
||||
## Training config {#config new="3"}
|
||||
|
||||
Config files define the training process and model pipeline and can be passed to
|
||||
Config files define the training process and pipeline and can be passed to
|
||||
[`spacy train`](/api/cli#train). They use
|
||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||
hood. For details on how to use training configs, see the
|
||||
|
@ -74,16 +74,16 @@ your config and check that it's valid, you can run the
|
|||
Defines the `nlp` object, its tokenizer and
|
||||
[processing pipeline](/usage/processing-pipelines) component names.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
|
||||
| Name | Description |
|
||||
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
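For orientation, a small sketch of how a few of these settings surface on the `nlp` object at runtime; the component names below are only examples:

```python
import spacy

nlp = spacy.blank("en")   # corresponds to lang = "en"
nlp.add_pipe("tagger")    # corresponds to an entry in pipeline and a [components.tagger] block
nlp.add_pipe("parser")
print(nlp.pipe_names)     # ['tagger', 'parser']
```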
|
||||
|
||||
### components {#config-components tag="section"}

@ -105,8 +105,8 @@ This section includes definitions of the
[pipeline components](/usage/processing-pipelines) and their models, if
available. Components in this section can be referenced in the `pipeline` of the
`[nlp]` block. Component blocks need to specify either a `factory` (named
function to use to create component) or a `source` (name or path of pretrained
model to copy components from). See the docs on
function to use to create component) or a `source` (name or path of trained
pipeline to copy components from). See the docs on
[defining pipeline components](/usage/training#config-components) for details.
|
||||
|
||||
### paths, system {#config-variables tag="variables"}
|
||||
|
@ -145,7 +145,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
|||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
||||
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
|
||||
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||
|
||||
### pretraining {#config-pretraining tag="section,optional"}
|
||||
|
||||
|
@ -184,7 +184,7 @@ run [`spacy pretrain`](/api/cli#pretrain).

The main data format used in spaCy v3.0 is a **binary format** created by
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
objects. This means that you can train spaCy models using the same format it
objects. This means that you can train spaCy pipelines using the same format it
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
storage**, especially when packing multiple documents together.
|
||||
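In practice, this format is produced by adding `Doc` objects to a [`DocBin`](/api/docbin) and serializing it to disk. A rough sketch (the file path and texts are made up for illustration):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("This is a sentence."), nlp("This is another one.")]

# Pack the Doc objects into a DocBin and write the binary training data
doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")  # hypothetical output path

# Later: load the binary data back, using a shared vocab to restore the docs
doc_bin = DocBin().from_disk("./train.spacy")
restored = list(doc_bin.get_docs(nlp.vocab))
```
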
|
||||
|
@ -286,8 +286,8 @@ a dictionary of gold-standard annotations.
|
|||
[internal training API](/usage/training#api) and they're expected when you call
|
||||
[`nlp.update`](/api/language#update). However, for most use cases, you
|
||||
**shouldn't** have to write your own training scripts. It's recommended to train
|
||||
your models via the [`spacy train`](/api/cli#train) command with a config file
|
||||
to keep track of your settings and hyperparameters and your own
|
||||
your pipelines via the [`spacy train`](/api/cli#train) command with a config
|
||||
file to keep track of your settings and hyperparameters and your own
|
||||
[registered functions](/usage/training/#custom-code) to customize the setup.
|
||||
|
||||
</Infobox>
|
||||
|
@ -406,15 +406,15 @@ in line-by-line, while still making it easy to represent newlines in the data.
|
|||
|
||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||
|
||||
To populate a model's vocabulary, you can use the
|
||||
[`spacy init model`](/api/cli#init-model) command and load in a
|
||||
To populate a pipeline's vocabulary, you can use the
|
||||
[`spacy init vocab`](/api/cli#init-vocab) command and load in a
|
||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
||||
lexical entry per line via the `--jsonl-loc` option. The first line defines the
|
||||
language and vocabulary settings. All other lines are expected to be JSON
|
||||
objects describing an individual lexeme. The lexical attributes will be then set
|
||||
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
||||
command outputs a ready-to-use spaCy model with a `Vocab` containing the lexical
|
||||
data.
|
||||
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
|
||||
lexical data.
|
||||
|
||||
```python
|
||||
### First line
|
||||
|
@ -459,11 +459,11 @@ Here's an example of the 20 most frequent lexemes in the English training data:
|
|||
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
|
||||
```

## Model meta {#meta}
## Pipeline meta {#meta}

The model meta is available as the file `meta.json` and exported automatically
when you save an `nlp` object to disk. Its contents are available as
[`nlp.meta`](/api/language#meta).
The pipeline meta is available as the file `meta.json` and exported
automatically when you save an `nlp` object to disk. Its contents are available
as [`nlp.meta`](/api/language#meta).

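For example, a quick way to inspect the meta of an installed pipeline (assuming the `en_core_web_sm` package is installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])

# The same data is written out as meta.json when the pipeline is saved
nlp.to_disk("./my_pipeline")  # hypothetical output directory
```
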
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
|
@ -473,8 +473,8 @@ creating a Python package with [`spacy package`](/api/cli#package). How to set
|
|||
up the `nlp` object is now defined in the
|
||||
[`config.cfg`](/api/data-formats#config), which includes detailed information
|
||||
about the pipeline components and their model architectures, and all other
|
||||
settings and hyperparameters used to train the model. It's the **single source
|
||||
of truth** used for loading a model.
|
||||
settings and hyperparameters used to train the pipeline. It's the **single
|
||||
source of truth** used for loading a pipeline.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -482,12 +482,12 @@ of truth** used for loading a model.
|
|||
>
|
||||
> ```json
|
||||
> {
|
||||
> "name": "example_model",
|
||||
> "name": "example_pipeline",
|
||||
> "lang": "en",
|
||||
> "version": "1.0.0",
|
||||
> "spacy_version": ">=3.0.0,<3.1.0",
|
||||
> "parent_package": "spacy",
|
||||
> "description": "Example model for spaCy",
|
||||
> "description": "Example pipeline for spaCy",
|
||||
> "author": "You",
|
||||
> "email": "you@example.com",
|
||||
> "url": "https://example.com",
|
||||
|
@ -510,23 +510,23 @@ of truth** used for loading a model.
|
|||
> }
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
||||
| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ |
|
||||
| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||
| `spacy_version` | spaCy version range the model is compatible with. Defaults to the spaCy version used to create the model, up to next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||
| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
||||
| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
||||
| `pipeline` | Names of pipeline component names in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
||||
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
||||
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create model. ~~str~~ |
|
||||
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
|
||||
| `name` | Pipeline name, e.g. `"core_web_sm"`. The final package name will be `{lang}_{name}`. Defaults to `"pipeline"`. ~~str~~ |
|
||||
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `url` | Pipeline author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `license` | Pipeline license. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `sources` | Data sources used to train the pipeline. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
|
||||
| `vectors` | Information about the word vectors included with the pipeline. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
|
||||
| `pipeline` | Names of pipeline component names, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
|
||||
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
| `speed` | Inference speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
|
||||
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create pipeline. ~~str~~ |
|
||||
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |
|
||||
|
|
|
@@ -9,8 +9,8 @@ The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
using the
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
It requires a pretrained [`DependencyParser`](/api/parser) or other component
that sets the `Token.dep` attribute.
It requires a trained [`DependencyParser`](/api/parser) or other component that
sets the `Token.dep` attribute.

## Pattern format {#patterns}
|
||||
|
||||
|
|
|
@ -13,8 +13,8 @@ An `EntityLinker` component disambiguates textual mentions (tagged as named
|
|||
entities) to unique identifiers, grounding the named entities into the "real
|
||||
world". It requires a `KnowledgeBase`, as well as a function to generate
|
||||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||
and a ML model to pick the right candidate, given the local context of the
|
||||
mention.
|
||||
and a machine learning model to pick the right candidate, given the local
|
||||
context of the mention.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
@ -34,8 +34,8 @@ architectures and their arguments and hyperparameters.
|
|||
> "incl_prior": True,
|
||||
> "incl_context": True,
|
||||
> "model": DEFAULT_NEL_MODEL,
|
||||
> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
|
||||
> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
|
||||
> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
|
||||
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
|
||||
> }
|
||||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
|||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction via add_pipe with custom KB and candidate generation
|
||||
> config = {"kb": {"@assets": "my_kb.v1"}}
|
||||
> config = {"kb": {"@misc": "my_kb.v1"}}
|
||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction from class
|
||||
|
|
|
@ -7,9 +7,9 @@ source: spacy/language.py
|
|||
|
||||
Usually you'll load this once per process as `nlp` and pass the instance around
|
||||
your application. The `Language` class is created when you call
|
||||
[`spacy.load()`](/api/top-level#spacy.load) and contains the shared vocabulary
|
||||
and [language data](/usage/adding-languages), optional model data loaded from a
|
||||
[model package](/models) or a path, and a
|
||||
[`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
|
||||
[language data](/usage/adding-languages), optional binary weights, e.g. provided
|
||||
by a [trained pipeline](/models), and the
|
||||
[processing pipeline](/usage/processing-pipelines) containing components like
|
||||
the tagger or parser that are called on a document in order. You can also add
|
||||
your own processing pipeline components that take a `Doc` object, modify it and
|
||||
|
@ -37,7 +37,7 @@ Initialize a `Language` object.
|
|||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||
| _keyword-only_ | |
|
||||
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
||||
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
|
||||
| `meta` | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
|
||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
||||
|
||||
## Language.from_config {#from_config tag="classmethod" new="3"}
|
||||
|
@ -232,7 +232,7 @@ tuples of `Doc` and `GoldParse` objects.
|
|||
|
||||
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
||||
|
||||
Continue training a pretrained model. Create and return an optimizer, and
|
||||
Continue training a trained pipeline. Create and return an optimizer, and
|
||||
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
|
||||
Rehearsal is used to prevent models from "forgetting" their initialized
|
||||
"knowledge". To perform rehearsal, collect samples of text you want the models
|
||||
|
@@ -314,7 +314,7 @@ the "catastrophic forgetting" problem. This feature is experimental.

## Language.evaluate {#evaluate tag="method"}

Evaluate a model's pipeline components.
Evaluate a pipeline's components.

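A minimal sketch of how this can be called, with a single invented example annotation (in some nightly builds the `Example` class lives in `spacy.gold` rather than `spacy.training`):

```python
import spacy
from spacy.training import Example  # may be spacy.gold.Example in older nightlies

nlp = spacy.load("en_core_web_sm")  # assumes the package is installed
doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})

scores = nlp.evaluate([example])
print(scores.get("ents_f"))  # NER F-score, if the scorer produced it
```
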
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
|
@ -386,24 +386,24 @@ component, adds it to the pipeline and returns it.
|
|||
> nlp.add_pipe("component", before="ner")
|
||||
> component = nlp.add_pipe("component", name="custom_name", last=True)
|
||||
>
|
||||
> # Add component from source model
|
||||
> # Add component from source pipeline
|
||||
> source_nlp = spacy.load("en_core_web_sm")
|
||||
> nlp.add_pipe("ner", source=source_nlp)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ |
|
||||
| `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
|
||||
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
|
||||
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `source` <Tag variant="new">3</Tag> | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ |
|
||||
| `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
|
||||
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
|
||||
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `source` <Tag variant="new">3</Tag> | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -790,9 +790,10 @@ token.ent_iob, token.ent_type
|
|||
|
||||
## Language.meta {#meta tag="property"}
|
||||
|
||||
Custom meta data for the Language class. If a model is loaded, contains meta
|
||||
data of the model. The `Language.meta` is also what's serialized as the
|
||||
[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk.
|
||||
Custom meta data for the Language class. If a trained pipeline is loaded, this
|
||||
contains meta data of the pipeline. The `Language.meta` is also what's
|
||||
serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
|
||||
object to disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -827,13 +828,15 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk`
|
|||
|
||||
## Language.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
Save the current state to a directory. If a model is loaded, this will **include
|
||||
the model**.
|
||||
Save the current state to a directory. Under the hood, this method delegates to
|
||||
the `to_disk` methods of the individual pipeline components, if available. This
|
||||
means that if a trained pipeline is loaded, all components and their weights
|
||||
will be saved to disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp.to_disk("/path/to/models")
|
||||
> nlp.to_disk("/path/to/pipeline")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -844,22 +847,28 @@ the model**.
|
|||
|
||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
Loads state from a directory. Modifies the object in place and returns it. If
|
||||
the saved `Language` object contains a model, the model will be loaded. Note
|
||||
that this method is commonly used via the subclasses like `English` or `German`
|
||||
to make language-specific functionality like the
|
||||
[lexical attribute getters](/usage/adding-languages#lex-attrs) available to the
|
||||
loaded object.
|
||||
Loads state from a directory, including all data that was saved with the
|
||||
`Language` object. Modifies the object in place and returns it.
|
||||
|
||||
<Infobox variant="warning" title="Important note">
|
||||
|
||||
Keep in mind that this method **only loads serialized state** and doesn't set up
|
||||
the `nlp` object. This means that it requires the correct language class to be
|
||||
initialized and all pipeline components to be added to the pipeline. If you want
|
||||
to load a serialized pipeline from a directory, you should use
|
||||
[`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.language import Language
|
||||
> nlp = Language().from_disk("/path/to/model")
|
||||
> nlp = Language().from_disk("/path/to/pipeline")
|
||||
>
|
||||
> # using language-specific subclass
|
||||
> # Using language-specific subclass
|
||||
> from spacy.lang.en import English
|
||||
> nlp = English().from_disk("/path/to/en_model")
|
||||
> nlp = English().from_disk("/path/to/pipeline")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -924,7 +933,7 @@ available to the loaded object.
|
|||
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
||||
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
|
||||
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
|
||||
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
|
||||
| `path` <Tag variant="new">2</Tag> | Path to the pipeline data directory, if a pipeline is loaded from a path or package. Otherwise `None`. ~~Optional[Path]~~ |
|
||||
|
||||
## Class attributes {#class-attributes}
|
||||
|
||||
|
@ -1004,7 +1013,7 @@ serialization by passing in the string names via the `exclude` argument.
|
|||
>
|
||||
> ```python
|
||||
> data = nlp.to_bytes(exclude=["tokenizer", "vocab"])
|
||||
> nlp.from_disk("./model-data", exclude=["ner"])
|
||||
> nlp.from_disk("/pipeline", exclude=["ner"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
|
|
@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
|
||||
## Matcher.pipe {#pipe tag="method"}
|
||||
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.matcher import Matcher
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> for doc in matcher.pipe(docs, batch_size=50):
|
||||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
|
||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
||||
| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
||||
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

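With `as_spans=True`, the results behave like any other [`Span`](/api/span) objects. A small sketch (pattern and text invented for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

doc = nlp("Hello world! hello WORLD.")
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)  # the span label is the pattern's match_id
```
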
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
|
||||
|
||||
|
|
|
@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------- |
|
||||
| `doc` | The document to match over. ~~Doc~~ |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | The document to match over. ~~Doc~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

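The same keyword argument works for phrase patterns. A short sketch, with terms invented for illustration:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("COMPANIES", [nlp.make_doc("Acme Corp"), nlp.make_doc("Globex")])

doc = nlp("acme corp later bought Globex.")
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)
```
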
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
|
||||
|
||||
|
@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
|
|||
|
||||
</Infobox>
|
||||
|
||||
## PhraseMatcher.pipe {#pipe tag="method"}
|
||||
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.matcher import PhraseMatcher
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> for doc in matcher.pipe(docs, batch_size=50):
|
||||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
||||
| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
||||
|
||||
## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
Get the number of rules added to the matcher. Note that this only returns the
|
||||
|
|
|
@ -286,7 +286,7 @@ context, the original parameters are restored.
|
|||
|
||||
## Pipe.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe. It's possible to extend pretrained models with new
|
||||
Add a new label to the pipe. It's possible to extend trained models with new
|
||||
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -4,6 +4,7 @@ menu:
|
|||
- ['spacy', 'spacy']
|
||||
- ['displacy', 'displacy']
|
||||
- ['registry', 'registry']
|
||||
- ['Loggers', 'loggers']
|
||||
- ['Batchers', 'batchers']
|
||||
- ['Data & Alignment', 'gold']
|
||||
- ['Utility Functions', 'util']
|
||||
|
@ -11,14 +12,14 @@ menu:
|
|||
|
||||
## spaCy {#spacy hidden="true"}
|
||||
|
||||
### spacy.load {#spacy.load tag="function" model="any"}
|
||||
### spacy.load {#spacy.load tag="function"}
|
||||
|
||||
Load a model using the name of an installed
|
||||
[model package](/usage/training#models-generating), a string path or a
|
||||
`Path`-like object. spaCy will try resolving the load argument in this order. If
|
||||
a model is loaded from a model name, spaCy will assume it's a Python package and
|
||||
import it and call the model's own `load()` method. If a model is loaded from a
|
||||
path, spaCy will assume it's a data directory, load its
|
||||
Load a pipeline using the name of an installed
|
||||
[package](/usage/saving-loading#models), a string path or a `Path`-like object.
|
||||
spaCy will try resolving the load argument in this order. If a pipeline is
|
||||
loaded from a string name, spaCy will assume it's a Python package and import it
|
||||
and call the package's own `load()` method. If a pipeline is loaded from a path,
|
||||
spaCy will assume it's a data directory, load its
|
||||
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
||||
information to construct the `Language` class. The data will be loaded in via
|
||||
[`Language.from_disk`](/api/language#from_disk).
|
||||
|
@ -35,38 +36,38 @@ specified separately using the new `exclude` keyword argument.
|
|||
>
|
||||
> ```python
|
||||
> nlp = spacy.load("en_core_web_sm") # package
|
||||
> nlp = spacy.load("/path/to/en") # string path
|
||||
> nlp = spacy.load(Path("/path/to/en")) # pathlib Path
|
||||
> nlp = spacy.load("/path/to/pipeline") # string path
|
||||
> nlp = spacy.load(Path("/path/to/pipeline")) # pathlib Path
|
||||
>
|
||||
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the model's
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
|
||||
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
|
||||
information to construct a `Language` object, loads in the model data and
|
||||
returns it.
|
||||
weights, and returns it.
|
||||
|
||||
```python
|
||||
### Abstract example
|
||||
cls = util.get_lang_class(lang) # get language for ID, e.g. "en"
|
||||
nlp = cls() # initialize the language
|
||||
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
|
||||
nlp = cls() # 2. Initialize it
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(name) # add component to pipeline
|
||||
nlp.from_disk(model_data_path) # load in model data
|
||||
nlp.add_pipe(name) # 3. Add the component to the pipeline
|
||||
nlp.from_disk(data_path) # 4. Load in the binary data
|
||||
```
|
||||
|
||||
### spacy.blank {#spacy.blank tag="function" new="2"}
|
||||
|
||||
Create a blank model of a given language class. This function is the twin of
|
||||
Create a blank pipeline of a given language class. This function is the twin of
|
||||
`spacy.load()`.
|
||||
|
||||
> #### Example
|
||||
|
@ -84,9 +85,7 @@ Create a blank model of a given language class. This function is the twin of
|
|||
### spacy.info {#spacy.info tag="function"}
|
||||
|
||||
The same as the [`info` command](/api/cli#info). Pretty-print information about
|
||||
your installation, models and local setup from within spaCy. To get the model
|
||||
meta data as a dictionary instead, you can use the `meta` attribute on your
|
||||
`nlp` object with a loaded model, e.g. `nlp.meta`.
|
||||
your installation, installed pipelines and local setup from within spaCy.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -96,12 +95,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
|
|||
> markdown = spacy.info(markdown=True, silent=True)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------ |
|
||||
| `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `markdown` | Print information as Markdown. ~~bool~~ |
|
||||
| `silent` | Don't print anything, just return. ~~bool~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------- |
|
||||
| `model` | Optional pipeline, i.e. a package name or path (optional). ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `markdown` | Print information as Markdown. ~~bool~~ |
|
||||
| `silent` | Don't print anything, just return. ~~bool~~ |
|
||||
|
||||
### spacy.explain {#spacy.explain tag="function"}
|
||||
|
||||
|
@ -132,7 +131,7 @@ list of available terms, see
|
|||
Allocate data and perform operations on [GPU](/usage/#gpu), if available. If
|
||||
data has already been allocated on CPU, it will not be moved. Ideally, this
|
||||
function should be called right after importing spaCy and _before_ loading any
|
||||
models.
|
||||
pipelines.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -151,7 +150,7 @@ models.
|
|||
Allocate data and perform operations on [GPU](/usage/#gpu). Will raise an error
|
||||
if no GPU is available. If data has already been allocated on CPU, it will not
|
||||
be moved. Ideally, this function should be called right after importing spaCy
|
||||
and _before_ loading any models.
|
||||
and _before_ loading any pipelines.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -270,9 +269,9 @@ If a setting is not present in the options, the default value will be used.
|
|||
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
||||
|
||||
By default, displaCy comes with colors for all entity types used by
|
||||
[spaCy models](/models). If you're using custom entity types, you can use the
|
||||
`colors` setting to add your own colors for them. Your application or model
|
||||
package can also expose a
|
||||
[spaCy's trained pipelines](/models). If you're using custom entity types, you
|
||||
can use the `colors` setting to add your own colors for them. Your application
|
||||
or pipeline package can also expose a
|
||||
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
|
||||
to add custom labels and their colors automatically.
|
||||
|
||||
|
@ -308,7 +307,6 @@ factories.
|
|||
| Registry name | Description |
|
||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||
| `assets` | Registry for data assets, knowledge bases etc. |
|
||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
|
@ -316,8 +314,10 @@ factories.
|
|||
| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
|
||||
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
||||
| `loggers` | Registry for functions that log [training results](/usage/training). |
|
||||
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
||||
| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |
|
||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
|
||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||
|
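Functions are added to these registries with the corresponding decorator, so a custom asset referenced in a config as `{"@misc": "my_dictionary.v1"}` might be registered roughly like this (the name and return value are invented for illustration):

```python
import spacy

@spacy.registry.misc("my_dictionary.v1")
def create_dictionary():
    # Return any data asset you want to reference from a config block
    return {"colour": "color", "theatre": "theater"}
```
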
@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and
|
|||
> def annotation_setter(docs, trf_data) -> None:
|
||||
> # Set annotations on the docs
|
||||
>
|
||||
> return annotation_sette
|
||||
> return annotation_setter
|
||||
> ```
|
||||
|
||||
| Registry name | Description |
|
||||
|
@ -348,6 +348,110 @@ See the [`Transformer`](/api/transformer) API reference and
|
|||
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
|
||||
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
|
||||
|
||||
## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
|
||||
|
||||
A logger records the training results. When a logger is created, two functions
are returned: one for logging the information for each training step, and a
second function that is called to finalize the logging when the training is
finished. To log each training step, a
[dictionary](/usage/training#custom-logging) is passed on from the
[`spacy train`](/api/cli#train) command, including information such as the
training loss and the accuracy scores on the development set.

There are two built-in logging functions: a logger printing results to the
console in tabular format (which is the default), and one that also sends the
results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
using one of the built-in loggers listed here, you can also
[implement your own](/usage/training#custom-logging).
||||
|
||||
#### spacy.ConsoleLogger {#ConsoleLogger tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.logger]
|
||||
> @loggers = "spacy.ConsoleLogger.v1"
|
||||
> ```
|
||||
|
||||
Writes the results of a training step to the console in a tabular format.
|
||||
|
||||
<Accordion title="Example console output" spaced>
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg
|
||||
```
|
||||
|
||||
```
|
||||
ℹ Using CPU
ℹ Loading config and nlp from: config.cfg
ℹ Pipeline: ['tok2vec', 'tagger']
ℹ Start training
ℹ Training. Initial learn rate: 0.0

E     #       LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
---   ------  ------------   -----------   -------   ------
  1       0           0.00         86.20      0.22     0.00
  1     200           3.08      18968.78     34.00     0.34
  1     400          31.81      22539.06     33.64     0.34
  1     600          92.13      22794.91     43.80     0.44
  1     800         183.62      21541.39     56.05     0.56
  1    1000         352.49      25461.82     65.15     0.65
  1    1200         422.87      23708.82     71.84     0.72
  1    1400         601.92      24994.79     76.57     0.77
  1    1600         662.57      22268.02     80.20     0.80
  1    1800        1101.50      28413.77     82.56     0.83
  1    2000        1253.43      28736.36     85.00     0.85
  1    2200        1411.02      28237.53     87.42     0.87
  1    2400        1605.35      28439.95     88.70     0.89
|
||||
```
|
||||
|
||||
Note that the cumulative loss keeps increasing within one epoch, but should
|
||||
start decreasing across epochs.
|
||||
|
||||
</Accordion>
|
||||
|
||||
#### spacy.WandbLogger {#WandbLogger tag="registered function"}
|
||||
|
||||
> #### Installation
|
||||
>
|
||||
> ```bash
|
||||
> $ pip install wandb
|
||||
> $ wandb login
|
||||
> ```
|
||||
|
||||
Built-in logger that sends the results of each training step to the dashboard of
|
||||
the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
|
||||
& Biases should be installed, and you should be logged in. The logger will send
|
||||
the full config file to W&B, as well as various system information such as
|
||||
memory utilization, network traffic, disk IO, GPU statistics, etc. This will
|
||||
also include information such as your hostname and operating system, as well as
|
||||
the location of your Python executable.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Note that by default, the full (interpolated)
|
||||
[training config](/usage/training#config) is sent over to the W&B dashboard. If
|
||||
you prefer to **exclude certain information** such as path names, you can list
|
||||
those fields in "dot notation" in the `remove_config_values` parameter. These
|
||||
fields will then be removed from the config before uploading, but will otherwise
|
||||
remain in the config file stored on your local system.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.logger]
|
||||
> @loggers = "spacy.WandbLogger.v1"
|
||||
> project_name = "monitor_spacy_training"
|
||||
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
|
||||
| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
|
||||
|
||||
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
|
||||
|
||||
A data batcher implements a batching strategy that essentially turns a stream of
|
||||
|
@ -362,7 +466,7 @@ Instead of using one of the built-in batchers listed here, you can also
|
|||
[implement your own](/usage/training#custom-code-readers-batchers), which may or
|
||||
may not use a custom schedule.
|
||||
|
||||
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
|
||||
#### batch_by_words {#batch_by_words tag="registered function"}
|
||||
|
||||
Create minibatches of roughly a given number of words. If any examples are
|
||||
longer than the specified batch length, they will appear in a batch by
|
||||
|
@ -374,7 +478,7 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
>
|
||||
> ```ini
|
||||
> [training.batcher]
|
||||
> @batchers = "batch_by_words.v1"
|
||||
> @batchers = "spacy.batch_by_words.v1"
|
||||
> size = 100
|
||||
> tolerance = 0.2
|
||||
> discard_oversize = false
|
||||
|
@ -389,13 +493,13 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
|
||||
#### batch_by_sequence {#batch_by_sequence tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.batcher]
|
||||
> @batchers = "batch_by_sequence.v1"
|
||||
> @batchers = "spacy.batch_by_sequence.v1"
|
||||
> size = 32
|
||||
> get_length = null
|
||||
> ```
|
||||
|
@ -407,13 +511,13 @@ Create a batcher that creates batches of the specified size.
|
|||
| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
|
||||
#### batch_by_padded {#batch_by_padded tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.batcher]
|
||||
> @batchers = "batch_by_padded.v1"
|
||||
> @batchers = "spacy.batch_by_padded.v1"
|
||||
> size = 100
|
||||
> buffer = 256
|
||||
> discard_oversize = false
|
||||
|
@ -560,8 +664,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.
|
|||
|
||||
### util.load_model {#util.load_model tag="function" new="2"}
|
||||
|
||||
Load a model from a package or data path. If called with a package name, spaCy
|
||||
will assume the model is a Python package and import and call its `load()`
|
||||
Load a pipeline from a package or data path. If called with a string name, spaCy
|
||||
will assume the pipeline is a Python package and import and call its `load()`
|
||||
method. If called with a path, spaCy will assume it's a data directory, read the
|
||||
language and pipeline settings from the [`config.cfg`](/api/data-formats#config)
|
||||
and create a `Language` object. The model data will then be loaded in via
|
||||
|
@ -577,16 +681,16 @@ and create a `Language` object. The model data will then be loaded in via
|
|||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Package name or model path. ~~str~~ |
|
||||
| `name` | Package name or path. ~~str~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
|
||||
|
||||
A helper function to use in the `load()` method of a model package's
|
||||
A helper function to use in the `load()` method of a pipeline package's
|
||||
[`__init__.py`](https://github.com/explosion/spacy-models/tree/master/template/model/xx_model_name/__init__.py).
|
||||
|
||||
> #### Example
|
||||
|
@ -600,70 +704,72 @@ A helper function to use in the `load()` method of a model package's
|
|||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
### util.load_config {#util.load_config tag="function" new="3"}
|
||||
|
||||
Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The
|
||||
config typically includes details about the model pipeline and how its
|
||||
components are created, as well as all training settings and hyperparameters.
|
||||
Load a pipeline's [`config.cfg`](/api/data-formats#config) from a file path. The
|
||||
config typically includes details about the components and how they're created,
|
||||
as well as all training settings and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> config = util.load_config("/path/to/model/config.cfg")
|
||||
> config = util.load_config("/path/to/config.cfg")
|
||||
> print(config.to_str())
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
|
||||
| `path` | Path to the pipeline's `config.cfg`. ~~Union[str, Path]~~ |
|
||||
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
|
||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | The model's config. ~~Config~~ |
|
||||
| **RETURNS** | The pipeline's config. ~~Config~~ |
|
||||
|
||||
### util.load_meta {#util.load_meta tag="function" new="3"}

Get a model's [`meta.json`](/api/data-formats#meta) from a file path and
validate its contents.
Get a pipeline's [`meta.json`](/api/data-formats#meta) from a file path and
validate its contents. The meta typically includes details about author,
licensing, data sources and version.

> #### Example
>
> ```python
> meta = util.load_meta("/path/to/model/meta.json")
> meta = util.load_meta("/path/to/meta.json")
> ```

| Name        | Description                                           |
| ----------- | ----------------------------------------------------- |
| `path`      | Path to the model's `meta.json`. ~~Union[str, Path]~~ |
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~             |

| Name        | Description                                              |
| ----------- | -------------------------------------------------------- |
| `path`      | Path to the pipeline's `meta.json`. ~~Union[str, Path]~~ |
| **RETURNS** | The pipeline's meta data. ~~Dict[str, Any]~~             |

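As a rough sketch, the returned dict can be inspected like any other mapping. The path is hypothetical, and the keys shown are the usual meta fields (language, name, version):

```python
from spacy import util

meta = util.load_meta("/path/to/meta.json")  # hypothetical path
print(meta.get("lang"), meta.get("name"), meta.get("version"))
```
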
### util.get_installed_models {#util.get_installed_models tag="function" new="3"}

List all model packages installed in the current environment. This will include
any spaCy model that was packaged with [`spacy package`](/api/cli#package).
Under the hood, model packages expose a Python entry point that spaCy can check,
without having to load the model.
List all pipeline packages installed in the current environment. This will
include any spaCy pipeline that was packaged with
[`spacy package`](/api/cli#package). Under the hood, pipeline packages expose a
Python entry point that spaCy can check, without having to load the `nlp`
object.

> #### Example
>
> ```python
> model_names = util.get_installed_models()
> names = util.get_installed_models()
> ```

| Name        | Description                                                                         |
| ----------- | ----------------------------------------------------------------------------------- |
| **RETURNS** | The string names of the models installed in the current environment. ~~List[str]~~ |

| Name        | Description                                                                            |
| ----------- | --------------------------------------------------------------------------------------- |
| **RETURNS** | The string names of the pipelines installed in the current environment. ~~List[str]~~ |

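A small sketch tying this together with `spacy.load`, assuming at least one pipeline package (such as `en_core_web_sm`) is installed:

```python
import spacy
from spacy import util

for name in util.get_installed_models():
    print(name)

# Any of the reported names can be loaded directly
if "en_core_web_sm" in util.get_installed_models():
    nlp = spacy.load("en_core_web_sm")
```
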
### util.is_package {#util.is_package tag="function"}

Check if string maps to a package installed via pip. Mainly used to validate
[model packages](/usage/models).
[pipeline packages](/usage/models).

> #### Example
>

@@ -680,7 +786,8 @@ Check if string maps to a package installed via pip. Mainly used to validate

### util.get_package_path {#util.get_package_path tag="function" new="2"}

Get path to an installed package. Mainly used to resolve the location of
[model packages](/usage/models). Currently imports the package to find its path.
[pipeline packages](/usage/models). Currently imports the package to find its
path.

> #### Example
>

@@ -689,10 +796,10 @@ Get path to an installed package. Mainly used to resolve the location of

> # /usr/lib/python3.6/site-packages/en_core_web_sm
> ```

| Name           | Description                               |
| -------------- | ----------------------------------------- |
| `package_name` | Name of installed package. ~~str~~        |
| **RETURNS**    | Path to model package directory. ~~Path~~ |

| Name           | Description                                  |
| -------------- | -------------------------------------------- |
| `package_name` | Name of installed package. ~~str~~           |
| **RETURNS**    | Path to pipeline package directory. ~~Path~~ |

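A hedged usage sketch combining `is_package` and `get_package_path`. The package name is only an example and has to actually be installed:

```python
from spacy import util

name = "en_core_web_sm"  # example package, assumed to be installed
if util.is_package(name):
    path = util.get_package_path(name)
    print(path)  # e.g. /usr/lib/python3.6/site-packages/en_core_web_sm
```
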
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}

@@ -25,8 +25,8 @@ work out-of-the-box.

</Infobox>

This pipeline component lets you use transformer models in your pipeline.
Supports all models that are available via the
This pipeline component lets you use transformer models in your pipeline. It
supports all models that are available via the
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
Usually you will connect subsequent components to the shared transformer using
the [TransformerListener](/api/architectures#TransformerListener) layer. This

@@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes

how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.
[model architectures](/api/architectures#transformers) documentation for details
on the transformer architectures and their arguments and hyperparameters.

> #### Example
>

@@ -61,11 +61,11 @@ architectures and their arguments and hyperparameters.

> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```

| Setting             | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

| Setting             | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
```

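As a hedged sketch, the settings above can also be overridden when adding the component. The model name below is only an illustrative value and requires `spacy-transformers` plus the corresponding weights:

```python
import spacy

nlp = spacy.blank("en")
# Override the default TransformerModel's "name" setting (illustrative value)
nlp.add_pipe(
    "transformer",
    config={"model": {"name": "bert-base-uncased"}},
)
print(nlp.pipe_names)  # ["transformer"]
# The component still needs to be initialized or trained before documents can
# be processed and Doc._.trf_data read back.
```
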
@@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In

your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).

| Name                | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab`             | The shared vocabulary. ~~Vocab~~ |
| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| _keyword-only_      | |
| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |

| Name                | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab`             | The shared vocabulary. ~~Vocab~~ |
| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| _keyword-only_      | |
| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |

## Transformer.\_\_call\_\_ {#call tag="method"}

@@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors

are wrapped into the
[FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
`FullTransformerBatch` then splits out the per-document data, which is handled
by this class. Instances of this class
are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes)
extension attribute.
by this class. Instances of this class are typically assigned to the
[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.

| Name      | Description |
| --------- | ----------- |

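A rough illustration of reading the per-document data, assuming `nlp` is a pipeline with a trained transformer component:

```python
doc = nlp("Transformers are large neural networks.")
trf_data = doc._.trf_data         # TransformerData for this document
print(len(trf_data.tensors))      # one array per transformer output
print(trf_data.tensors[0].shape)  # e.g. (n_spans, n_wordpieces, width), depending on the model
```
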
@@ -447,13 +446,14 @@ overlap, and you can also omit sections of the Doc if they are not relevant.

Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You
can also register custom span getters using the `@spacy.registry.span_getters`
decorator.
can also register
[custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
using the `@spacy.registry.span_getters` decorator.

> #### Example
>
> ```python
> @spacy.registry.span_getters("sent_spans.v1")
> @spacy.registry.span_getters("custom_sent_spans")
> def configure_get_sent_spans() -> Callable:
>     def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
>         return [list(doc.sents) for doc in docs]

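A possible way to wire the getter from Python rather than the config file, assuming the `custom_sent_spans` registration shown in the example above:

```python
import spacy

nlp = spacy.blank("en")
# Point the transformer's model at the custom span getter registered above
nlp.add_pipe(
    "transformer",
    config={"model": {"get_spans": {"@span_getters": "custom_sent_spans"}}},
)
```
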
@@ -472,7 +472,7 @@ decorator.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "doc_spans.v1"
> @span_getters = "spacy-transformers.doc_spans.v1"
> ```

Create a span getter that uses the whole document as its spans. This is the best

@@ -485,7 +485,7 @@ texts.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "sent_spans.v1"
> @span_getters = "spacy-transformers.sent_spans.v1"
> ```

Create a span getter that uses sentence boundary markers to extract the spans.

@@ -500,7 +500,7 @@ more meaningful windows to attend over.

>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "strided_spans.v1"
> @span_getters = "spacy-transformers.strided_spans.v1"
> window = 128
> stride = 96
> ```

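For intuition, a rough pure-Python sketch of what a strided span getter does. This is not the built-in implementation, and the registered name is hypothetical:

```python
from typing import Callable, Iterable, List

import spacy
from spacy.tokens import Doc, Span

@spacy.registry.span_getters("custom_strided_spans")
def configure_custom_strided_spans(window: int = 128, stride: int = 96) -> Callable:
    def get_strided_spans(docs: Iterable[Doc]) -> List[List[Span]]:
        spans = []
        for doc in docs:
            # Overlapping token windows: each window is `window` tokens wide and
            # starts `stride` tokens after the previous one
            starts = range(0, max(len(doc), 1), stride)
            spans.append([doc[start : start + window] for start in starts])
        return spans

    return get_strided_spans
```
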
@@ -518,7 +518,7 @@ right context.

## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

Annotation setters are functions that that take a batch of `Doc` objects and a
Annotation setters are functions that take a batch of `Doc` objects and a
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
You can register custom annotation setters using the

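As a hedged sketch of such a callback, assuming an `annotation_setters` registry analogous to the `span_getters` registry shown above (available with `spacy-transformers` installed); the extension attribute and registered name are made up for illustration:

```python
from typing import Callable, List

import spacy
from spacy.tokens import Doc

Doc.set_extension("transformer_seen", default=False)

@spacy.registry.annotation_setters("custom_annotation_setter")
def configure_custom_annotation_setter() -> Callable:
    def setter(docs: List[Doc], trf_data) -> None:
        # `trf_data` is the FullTransformerBatch for this batch of docs; pull
        # whatever you need out of it here. This sketch only flags that the
        # transformer ran for each document.
        for doc in docs:
            doc._.transformer_seen = True

    return setter
```
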
@@ -551,6 +551,6 @@ The following built-in functions are available:

The component sets the following
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):

| Name           | Description                                                               |
| -------------- | ------------------------------------------------------------------------- |
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~  |

| Name             | Description                                                               |
| ---------------- | ------------------------------------------------------------------------- |
| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~  |