Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-20 09:54:32 +03:00)

Commit db84d129b3: Merge branch 'develop' into nightly.spacy.io
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a17"
+__version__ = "3.0.0a18"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -300,7 +300,9 @@ def ensure_pathy(path):
     return Pathy(path)


-def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
+def git_checkout(
+    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
+):
     git_version = get_git_version()
     if dest.exists():
         msg.fail("Destination of checkout must not exist", exits=1)

@@ -323,11 +325,14 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
         supports_sparse = git_version >= (2, 22)
+        use_sparse = supports_sparse and sparse
         # This is the "clone, but don't download anything" part.
         cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
-        if supports_sparse:
+        if use_sparse:
             cmd += f"--filter=blob:none"  # <-- The key bit
-        else:
+        # Only show warnings if the user explicitly wants sparse checkout but
+        # the Git version doesn't support it
+        elif sparse:
             err_old = (
                 f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
                 f"that doesn't fully support sparse checkout yet."

@@ -342,19 +347,19 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
         try_run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
+        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
         ret = try_run_command(cmd)
         git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if supports_sparse and not missings:
+        if use_sparse and not missings:
             err = (
                 f"Could not find any relevant files for '{subpath}'. "
                 f"Did you specify a correct and complete path within repo '{repo}' "
                 f"and branch {branch}?"
             )
             msg.fail(err, exits=1)
-        if supports_sparse:
+        if use_sparse:
             cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
             try_run_command(cmd)
         # And finally, we can checkout our subpath
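Note on the change above: sparse checkout is now opt-in (`sparse=False` by default) and is only applied when the local Git supports it. A minimal, self-contained sketch of that decision, using stand-in values rather than spaCy's real helpers (`get_git_version()`, `try_run_command()`):

```python
from typing import Tuple


def build_clone_command(
    repo: str, tmp_dir: str, branch: str, git_version: Tuple[int, int], sparse: bool
) -> str:
    """Build the 'clone, but don't download anything' command, as in the hunk above."""
    supports_sparse = git_version >= (2, 22)
    use_sparse = supports_sparse and sparse  # only sparse if requested AND supported
    cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 -b {branch}"
    if use_sparse:
        cmd += " --filter=blob:none"  # skip blob download; fetch only the needed ones later
    elif sparse:
        # Sparse checkout was requested but the local Git is too old: warn and fall back
        print(f"Git v{git_version[0]}.{git_version[1]} doesn't fully support sparse checkout")
    return cmd


print(build_clone_command("https://github.com/explosion/projects", "/tmp/co", "master", (2, 17), sparse=True))
```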
@@ -6,14 +6,15 @@ import shutil
 import requests

 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
-from .._util import download_file, git_sparse_checkout, get_git_version
+from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
+from .._util import get_checksum, download_file, git_checkout, get_git_version


 @project_cli.command("assets")
 def project_assets_cli(
     # fmt: off
     project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
     # fmt: on
 ):
     """Fetch project assets like datasets and pretrained weights. Assets are

@@ -23,10 +24,10 @@ def project_assets_cli(

     DOCS: https://nightly.spacy.io/api/cli#project-assets
     """
-    project_assets(project_dir)
+    project_assets(project_dir, sparse_checkout=sparse_checkout)


-def project_assets(project_dir: Path) -> None:
+def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
     """Fetch assets for a project using DVC if possible.

     project_dir (Path): Path to project directory.

@@ -58,11 +59,12 @@ def project_assets(project_dir: Path) -> None:
                         shutil.rmtree(dest)
                     else:
                         dest.unlink()
-            git_sparse_checkout(
+            git_checkout(
                 asset["git"]["repo"],
                 asset["git"]["path"],
                 dest,
                 branch=asset["git"].get("branch"),
+                sparse=sparse_checkout,
             )
         else:
             url = asset.get("url")
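From the caller's side, the new keyword simply threads the `--sparse` flag down to `git_checkout`. A usage sketch; the module path and project directory are assumptions for illustration based on the relative imports above:

```python
from pathlib import Path

# Module path assumed (relative imports above suggest spacy/cli/project/assets.py)
from spacy.cli.project.assets import project_assets

# Roughly equivalent to: python -m spacy project assets my_project --sparse
project_assets(Path("my_project"), sparse_checkout=True)
```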
@@ -7,7 +7,7 @@ import re
 from ... import about
 from ...util import ensure_path
 from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_sparse_checkout, get_git_version
+from .._util import git_checkout, get_git_version


 @project_cli.command("clone")

@@ -16,7 +16,8 @@ def project_clone_cli(
     name: str = Arg(..., help="The name of the template to clone"),
     dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
     repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
-    branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from")
+    branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from"),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
     # fmt: on
 ):
     """Clone a project template from a repository. Calls into "git" and will

@@ -28,7 +29,7 @@ def project_clone_cli(
     """
     if dest is None:
         dest = Path.cwd() / Path(name).parts[-1]
-    project_clone(name, dest, repo=repo, branch=branch)
+    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)


 def project_clone(

@@ -37,6 +38,7 @@ def project_clone(
     *,
     repo: str = about.__projects__,
     branch: str = about.__projects_branch__,
+    sparse_checkout: bool = False,
 ) -> None:
     """Clone a project template from a repository.


@@ -50,7 +52,7 @@ def project_clone(
     project_dir = dest.resolve()
     repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
     try:
-        git_sparse_checkout(repo, name, dest, branch=branch)
+        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
         msg.fail(err, exits=1)
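The clone command follows the same pattern. A sketch of the Python-level call, again with the module path assumed for illustration:

```python
from pathlib import Path

from spacy.cli.project.clone import project_clone  # module path assumed for illustration

# Roughly equivalent to: python -m spacy project clone some_template ./some_template --sparse
# Sparse checkout is used only if the local Git supports it; otherwise git_checkout falls back.
project_clone("some_template", Path("some_template"), sparse_checkout=True)
```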
@@ -89,7 +89,6 @@ def train(
     nlp, config = util.load_model_from_config(config)
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
-    verify_config(nlp)
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     T_cfg = config["training"]
     optimizer = T_cfg["optimizer"]

@@ -108,6 +107,8 @@ def train(
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    # Verify the config after calling 'begin_training' to ensure labels are properly initialized
+    verify_config(nlp)

     if tag_map:
         # Replace tag map with provided mapping

@@ -401,7 +402,7 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No


 def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config and loaded nlp object."""
+    """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
     for pipe_config in nlp.config["components"].values():

@@ -415,18 +416,13 @@ def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
+        textcat_labels = nlp.get_pipe("textcat").labels
         pos_label = pipe_config.get("positive_label")
         if pos_label not in textcat_labels:
-            msg.fail(
-                f"The textcat's 'positive_label' config setting '{pos_label}' "
-                f"does not match any label in the training data.",
-                exits=1,
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
             )
-        if len(textcat_labels) != 2:
-            msg.fail(
-                f"A textcat 'positive_label' '{pos_label}' was "
-                f"provided for training data that does not appear to be a "
-                f"binary classification problem with two labels.",
-                exits=1,
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
             )
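Because the checks now raise `ValueError` instead of calling `msg.fail(..., exits=1)`, they can be exercised directly in tests (see the new tests further down). A self-contained sketch of the reworked binary-label check, with plain strings standing in for the `Errors.E919`/`E920` templates:

```python
from typing import Iterable, Optional


def check_positive_label(labels: Iterable[str], pos_label: Optional[str]) -> None:
    """Sketch of the validation logic in verify_textcat_config (not spaCy's actual code)."""
    labels = tuple(labels)
    if not pos_label:
        return
    if pos_label not in labels:
        # cf. Errors.E920
        raise ValueError(f"positive_label '{pos_label}' does not match any label: {labels}")
    if len(labels) != 2:
        # cf. Errors.E919
        raise ValueError(f"positive_label '{pos_label}' given for a non-binary task: {labels}")


check_positive_label(["POS", "NEG"], "POS")        # passes
# check_positive_label(["A", "B", "C"], "POS")     # would raise ValueError
```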
@@ -480,6 +480,11 @@ class Errors:
    E201 = ("Span index out of range.")

    # TODO: fix numbering after merging develop into master
+   E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
+           "data that does not appear to be a binary classification problem "
+           "with two labels. Labels found: {labels}")
+   E920 = ("The textcat's 'positive_label' config setting '{pos_label}' "
+           "does not match any label in the training data. Labels found: {labels}")
    E921 = ("The method 'set_output' can only be called on components that have "
            "a Model with a 'resize_output' attribute. Otherwise, the output "
            "layer can not be dynamically changed.")
@@ -56,7 +56,12 @@ subword_features = true
 @Language.factory(
     "textcat",
     assigns=["doc.cats"],
-    default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
+    default_config={
+        "labels": [],
+        "threshold": 0.5,
+        "positive_label": None,
+        "model": DEFAULT_TEXTCAT_MODEL,
+    },
     scores=[
         "cats_score",
         "cats_score_desc",

@@ -74,8 +79,9 @@ def make_textcat(
     nlp: Language,
     name: str,
     model: Model[List[Doc], List[Floats2d]],
-    labels: Iterable[str],
+    labels: List[str],
     threshold: float,
+    positive_label: Optional[str],
 ) -> "TextCategorizer":
     """Create a TextCategorizer compoment. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels can

@@ -88,8 +94,16 @@ def make_textcat(
     labels (list): A list of categories to learn. If empty, the model infers the
         categories from the data.
     threshold (float): Cutoff to consider a prediction "positive".
+    positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise.
     """
-    return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)
+    return TextCategorizer(
+        nlp.vocab,
+        model,
+        name,
+        labels=labels,
+        threshold=threshold,
+        positive_label=positive_label,
+    )


 class TextCategorizer(Pipe):

@@ -104,8 +118,9 @@ class TextCategorizer(Pipe):
         model: Model,
         name: str = "textcat",
         *,
-        labels: Iterable[str],
+        labels: List[str],
         threshold: float,
+        positive_label: Optional[str],
     ) -> None:
         """Initialize a text categorizer.


@@ -113,8 +128,9 @@ class TextCategorizer(Pipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels (Iterable[str]): The labels to use.
+        labels (List[str]): The labels to use.
         threshold (float): Cutoff to consider a prediction "positive".
+        positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise.

         DOCS: https://nightly.spacy.io/api/textcategorizer#init
         """

@@ -122,7 +138,11 @@ class TextCategorizer(Pipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": labels, "threshold": threshold}
+        cfg = {
+            "labels": labels,
+            "threshold": threshold,
+            "positive_label": positive_label,
+        }
         self.cfg = dict(cfg)

     @property

@@ -131,10 +151,10 @@ class TextCategorizer(Pipe):

        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
        """
-        return tuple(self.cfg.setdefault("labels", []))
+        return tuple(self.cfg["labels"])

     @labels.setter
-    def labels(self, value: Iterable[str]) -> None:
+    def labels(self, value: List[str]) -> None:
         self.cfg["labels"] = tuple(value)

     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:

@@ -353,17 +373,10 @@ class TextCategorizer(Pipe):
             sgd = self.create_optimizer()
         return sgd

-    def score(
-        self,
-        examples: Iterable[Example],
-        *,
-        positive_label: Optional[str] = None,
-        **kwargs,
-    ) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.

         examples (Iterable[Example]): The examples to score.
-        positive_label (str): Optional positive label.
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

         DOCS: https://nightly.spacy.io/api/textcategorizer#score

@@ -374,7 +387,7 @@ class TextCategorizer(Pipe):
             "cats",
             labels=self.labels,
             multi_label=self.model.attrs["multi_label"],
-            positive_label=positive_label,
+            positive_label=self.cfg["positive_label"],
             threshold=self.cfg["threshold"],
             **kwargs,
         )
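With `positive_label` now part of the factory's `default_config` and stored in `cfg`, it can be set when the component is added and is later picked up by `score()` via `self.cfg["positive_label"]`. A short sketch against the nightly build this diff targets (label names are arbitrary):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe(
    "textcat",
    config={"labels": ["POS", "NEG"], "positive_label": "POS", "threshold": 0.5},
)
print(textcat.labels)  # ('POS', 'NEG')
```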
@@ -10,6 +10,7 @@ from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

 from ..util import make_tempdir
+from ...cli.train import verify_textcat_config
 from ...training import Example



@@ -130,7 +131,10 @@ def test_overfitting_IO():
     fix_random_seed(0)
     nlp = English()
     # Set exclusive labels
-    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
+    textcat = nlp.add_pipe(
+        "textcat",
+        config={"model": {"exclusive_classes": True}, "positive_label": "POSITIVE"},
+    )
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -159,7 +163,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)

     # Test scoring
-    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
+    scores = nlp.evaluate(train_examples)
     assert scores["cats_micro_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores

@@ -194,3 +198,29 @@ def test_textcat_configs(textcat_config):
         for i in range(5):
             losses = {}
             nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+
+def test_positive_class():
+    nlp = English()
+    pipe_config = {"positive_label": "POS", "labels": ["POS", "NEG"]}
+    textcat = nlp.add_pipe("textcat", config=pipe_config)
+    assert textcat.labels == ("POS", "NEG")
+    verify_textcat_config(nlp, pipe_config)
+
+
+def test_positive_class_not_present():
+    nlp = English()
+    pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING"]}
+    textcat = nlp.add_pipe("textcat", config=pipe_config)
+    assert textcat.labels == ("SOME", "THING")
+    with pytest.raises(ValueError):
+        verify_textcat_config(nlp, pipe_config)
+
+
+def test_positive_class_not_binary():
+    nlp = English()
+    pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING", "POS"]}
+    textcat = nlp.add_pipe("textcat", config=pipe_config)
+    assert textcat.labels == ("SOME", "THING", "POS")
+    with pytest.raises(ValueError):
+        verify_textcat_config(nlp, pipe_config)
@@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
+    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
     textcat.to_bytes(exclude=["vocab"])

@@ -137,7 +137,7 @@ def test_cli_converters_conllu2json_subtokens():
     assert biluo_tags == ["O", "U-PER", "O", "O"]


-def test_cli_converters_iob2json(en_vocab):
+def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
         "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",

@@ -145,7 +145,7 @@ def test_cli_converters_iob2json(en_vocab):
         "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
     ]
     input_data = "\n".join(lines)
-    converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
+    converted_docs = iob2docs(input_data, n_sents=10)
     assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
     assert converted["id"] == 0
@@ -1,12 +1,13 @@
 from wasabi import Printer

 from .conll_ner2docs import n_sents_info
+from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch


-def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
     into Doc objects so they can be saved. IOB and IOB2 are accepted.

@@ -18,6 +19,7 @@ def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
     I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
     I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
     """
+    vocab = Vocab()  # need vocab to make a minimal Doc
     msg = Printer(no_print=no_print)
     if n_sents > 0:
         n_sents_info(msg, n_sents)
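Callers now pass only the raw IOB string, as the updated CLI test above shows. A rough usage sketch; the import paths are assumptions based on the relative imports in this converter:

```python
# Import paths assumed for illustration (spacy/training/converters/iob2docs.py and spacy/training)
from spacy.training.converters import iob2docs
from spacy.training import docs_to_json

input_data = "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O"
docs = iob2docs(input_data, n_sents=10)  # no vocab argument anymore
print(docs_to_json(docs)["id"])
```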
@@ -22,9 +22,10 @@ def create_docbin_reader(
 ) -> Callable[["Language"], Iterable[Example]]:
     return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)

+
 @util.registry.readers("spacy.JsonlReader.v1")
 def create_jsonl_reader(
-    path: Path, min_length: int=0, max_length: int = 0, limit: int = 0
+    path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
 ) -> Callable[["Language"], Iterable[Doc]]:
     return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)


@@ -52,7 +53,6 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
     return locs


-
 class Corpus:
     """Iterate Example objects from a file or directory of DocBin (.spacy)
     formatted data files.

@@ -162,20 +162,21 @@ class Corpus:


 class JsonlTexts:
     """Iterate Doc objects from a file or directory of jsonl
     formatted raw text files.

     path (Path): The directory or filename to read from.
     min_length (int): Minimum document length (in tokens). Shorter documents
         will be skipped. Defaults to 0, which indicates no limit.
+
     max_length (int): Maximum document length (in tokens). Longer documents will
         be skipped. Defaults to 0, which indicates no limit.
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.

-    DOCS: https://nightly.spacy.io/api/corpus
+    DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
     """

     file_type = "jsonl"

     def __init__(

@@ -195,9 +196,9 @@ class JsonlTexts:
         """Yield examples from the data.

         nlp (Language): The current nlp object.
-        YIELDS (Doc): The docs.
+        YIELDS (Example): The example objects.

-        DOCS: https://nightly.spacy.io/api/corpus#call
+        DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
         for loc in walk_corpus(self.path, "jsonl"):
             records = srsly.read_jsonl(loc)
@@ -680,7 +680,10 @@ def run_command(
             Errors.E970.format(str_command=" ".join(command), tool=command[0])
         ) from None
     except subprocess.CalledProcessError as e:
-        # We don't want a duplicate traceback here
+        # We don't want a duplicate traceback here so we're making sure the
+        # CalledProcessError isn't re-raised. We also print both the string
+        # message and the stderr, in case the error only has one of them.
+        print(e.stderr)
         print(e)
         sys.exit(1)
     if ret.returncode != 0:
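A minimal reproduction of the new behaviour, using a deliberately failing child process so that both the captured stderr and the exception string get printed:

```python
import subprocess
import sys

try:
    subprocess.run(
        [sys.executable, "-c", "import sys; sys.stderr.write('boom\\n'); sys.exit(2)"],
        capture_output=True, text=True, check=True,
    )
except subprocess.CalledProcessError as e:
    # Print both, as the change above does: either one may be empty
    # depending on the failing tool.
    print(e.stderr)
    print(e)
```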
@@ -791,20 +791,19 @@ auto-generated by setting `--pretraining` on
 </Infobox>

 ```cli
-$ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
 ```

 | Name                    | Description |
 | ----------------------- | ----------- |
-| `texts_loc`             | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ |
 | `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
 | `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
 | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`. |

 ## evaluate {#evaluate new="2" tag="command"}

@@ -886,8 +885,8 @@ deploying custom spaCy pipelines.
 ### project clone {#project-clone tag="command"}

 Clone a project template from a Git repository. Calls into `git` under the hood
-and uses the sparse checkout feature, so you're only downloading what you need.
-By default, spaCy's
+and can use the sparse checkout feature if available, so you're only downloading
+what you need. By default, spaCy's
 [project templates repo](https://github.com/explosion/projects) is used, but you
 can provide any other repo (public or private) that you have access to using the
 `--repo` option.

@@ -895,7 +894,7 @@ can provide any other repo (public or private) that you have access to using the
 <!-- TODO: update example once we've decided on repo structure -->

 ```cli
-$ python -m spacy project clone [name] [dest] [--repo] [--branch]
+$ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 ```

 > #### Example

@@ -910,14 +909,15 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch]
 > $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
 > ```

 | Name             | Description |
 | ---------------- | ----------- |
 | `name`           | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
 | `dest`           | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
 | `--repo`, `-r`   | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
 | `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ |
+| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
 | `--help`, `-h`   | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES**      | The cloned [project directory](/usage/projects#project-files). |

 ### project assets {#project-assets tag="command"}

@@ -937,14 +937,15 @@ $ python -m spacy project assets [project_dir]
 > #### Example
 >
 > ```cli
-> $ python -m spacy project assets
+> $ python -m spacy project assets [--sparse]
 > ```

 | Name             | Description |
 | ---------------- | ----------- |
 | `project_dir`    | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
 | `--help`, `-h`   | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES**      | Downloaded or copied assets defined in the `project.yml`. |

 ### project run {#project-run tag="command"}

@@ -94,3 +94,79 @@ Yield examples from the data.
 | ---------- | -------------------------------------- |
 | `nlp`      | The current `nlp` object. ~~Language~~ |
 | **YIELDS** | The examples. ~~Example~~              |
+
+## JsonlTexts {#jsonltexts tag="class"}
+
+Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
+formatted raw text files. Can be used to read the raw text corpus for language
+model [pretraining](/usage/embeddings-transformers#pretraining) from a JSONL
+file.
+
+> #### Tip: Writing JSONL
+>
+> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
+> handy `write_jsonl` helper that takes a file path and list of dictionaries and
+> writes out JSONL-formatted data.
+>
+> ```python
+> import srsly
+> data = [{"text": "Some text"}, {"text": "More..."}]
+> srsly.write_jsonl("/path/to/text.jsonl", data)
+> ```
+
+```json
+### Example
+{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
+{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
+{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
+```
+
+### JsonlTexts.\_\_init\_\_ {#jsonltexts-init tag="method"}
+
+Initialize the reader.
+
+> #### Example
+>
+> ```python
+> from spacy.training import JsonlTexts
+>
+> corpus = JsonlTexts("./data/texts.jsonl")
+> ```
+>
+> ```ini
+> ### Example config
+> [pretraining.corpus]
+> @readers = "spacy.JsonlReader.v1"
+> path = "corpus/raw_text.jsonl"
+> min_length = 0
+> max_length = 0
+> limit = 0
+> ```
+
+| Name           | Description |
+| -------------- | ----------- |
+| `path`         | The directory or filename to read from. Expects newline-delimited JSON with a key `"text"` for each record. ~~Union[str, Path]~~ |
+| _keyword-only_ |             |
+| `min_length`   | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
+| `max_length`   | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
+| `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+
+### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+
+Yield examples from the data.
+
+> #### Example
+>
+> ```python
+> from spacy.training import JsonlTexts
+> import spacy
+>
+> corpus = JsonlTexts("./texts.jsonl")
+> nlp = spacy.blank("en")
+> data = corpus(nlp)
+> ```
+
+| Name       | Description                            |
+| ---------- | -------------------------------------- |
+| `nlp`      | The current `nlp` object. ~~Language~~ |
+| **YIELDS** | The examples. ~~Example~~              |
@@ -4,7 +4,6 @@ teaser: Details on spaCy's input and output data formats
 menu:
   - ['Training Config', 'config']
   - ['Training Data', 'training']
-  - ['Pretraining Data', 'pretraining']
   - ['Vocabulary', 'vocab-jsonl']
   - ['Pipeline Meta', 'meta']
 ---

@@ -131,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | --------------------- | ----------- |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `dev_corpus`          | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
+| `dev_corpus`          | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |

@@ -143,28 +142,26 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `raw_text`      | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
 | `seed`          | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
-| `train_corpus`  | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
+| `train_corpus`  | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
 | `vectors`       | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |

 ### pretraining {#config-pretraining tag="section,optional"}

 This section is optional and defines settings and controls for
-[language model pretraining](/usage/training#pretraining). It's used when you
-run [`spacy pretrain`](/api/cli#pretrain).
+[language model pretraining](/usage/embeddings-transformers#pretraining). It's
+used when you run [`spacy pretrain`](/api/cli#pretrain).

 | Name | Description |
 | ---- | ----------- |
 | `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
-| `min_length`   | Minimum length of examples. Defaults to `5`. ~~int~~ |
-| `max_length`   | Maximum length of examples. Defaults to `500`. ~~int~~ |
 | `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
-| `batch_size`   | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
-| `seed`         | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
-| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
-| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
 | `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
 | `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `corpus`       | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ |
+| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
+| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |

 ## Training data {#training}

@@ -369,40 +366,6 @@ gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}
 example = Example.from_dict(doc, gold_dict)
 ```

-## Pretraining data {#pretraining}
-
-The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the
-"token-to-vector" embedding layer of pipeline components from raw text. Raw text
-can be provided as a `.jsonl` (newline-delimited JSON) file containing one input
-text per line (roughly paragraph length is good). Optionally, custom
-tokenization can be provided. The JSONL format means that the texts can be read
-in line-by-line, while still making it easy to represent newlines in the data.
-
-> #### Tip: Writing JSONL
->
-> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
-> handy `write_jsonl` helper that takes a file path and list of dictionaries and
-> writes out JSONL-formatted data.
->
-> ```python
-> import srsly
-> data = [{"text": "Some text"}, {"text": "More..."}]
-> srsly.write_jsonl("/path/to/text.jsonl", data)
-> ```
-
-| Key      | Description |
-| -------- | --------------------------------------------------------------------- |
-| `text`   | The raw input text. Is not required if `tokens` is available. ~~str~~ |
-| `tokens` | Optional tokenization, one string per token. ~~List[str]~~ |
-
-```json
-### Example
-{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
-{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
-{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
-{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
-```
-
 ## Lexical data for vocabulary {#vocab-jsonl new="2"}

 To populate a pipeline's vocabulary, you can use the
@@ -36,11 +36,12 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("textcat", config=config)
> ```

| Setting          | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels`         | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~ |
| `model`          | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
```

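As a rough sketch of how these settings can be overridden when adding the component (the label names here are invented), you could pass a config dict to `nlp.add_pipe`:

```python
import spacy

nlp = spacy.blank("en")
# Hypothetical override of the default settings listed above
config = {"threshold": 0.5, "positive_label": "POS"}
textcat = nlp.add_pipe("textcat", config=config)
textcat.add_label("POS")
textcat.add_label("NEG")
```
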
@@ -60,21 +61,22 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5, positive_label="POS")
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).

| Name             | Description |
| ---------------- | ---------------------------------------------------------------------------- |
| `vocab`          | The shared vocabulary. ~~Vocab~~ |
| `model`          | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_   | |
| `labels`         | The labels to use. ~~Iterable[str]~~ |
| `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~ |

## TextCategorizer.\_\_call\_\_ {#call tag="method"}

@@ -5,6 +5,7 @@ menu:
  - ['displacy', 'displacy']
  - ['registry', 'registry']
  - ['Loggers', 'loggers']
  - ['Readers', 'readers']
  - ['Batchers', 'batchers']
  - ['Data & Alignment', 'gold']
  - ['Utility Functions', 'util']

@@ -363,7 +364,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
using one of the built-in loggers listed here, you can also
[implement your own](/usage/training#custom-logging).

#### ConsoleLogger {#ConsoleLogger tag="registered function"}

> #### Example config
>

@@ -409,7 +410,7 @@ start decreasing across epochs.

</Accordion>

#### WandbLogger {#WandbLogger tag="registered function"}

> #### Installation
>

@@ -451,6 +452,71 @@ remain in the config file stored on your local system.
| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |

## Readers {#readers source="spacy/training/corpus.py" new="3"}

Corpus readers are registered functions that load data and return a function
that takes the current `nlp` object and yields [`Example`](/api/example) objects
that can be used for [training](/usage/training) and
[pretraining](/usage/embeddings-transformers#pretraining). You can replace it
with your own registered function in the
[`@readers` registry](/api/top-level#registry) to customize the data loading and
streaming.

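For instance, a custom reader could look roughly like the sketch below. The registry name `"my_jsonl_texts.v1"` and the helper names are made up for illustration; the registered factory just needs to return a callable that takes the `nlp` object and yields `Example` objects:

```python
import spacy
from spacy.training import Example

@spacy.registry.readers("my_jsonl_texts.v1")
def create_jsonl_text_reader(path: str):
    """Return a reader that yields one Example per JSONL record."""
    import srsly

    def read_examples(nlp):
        for record in srsly.read_jsonl(path):
            doc = nlp.make_doc(record["text"])
            # No reference annotations here, as for a raw-text corpus
            yield Example.from_dict(doc, {})

    return read_examples
```

A config block could then reference it via `@readers = "my_jsonl_texts.v1"` instead of one of the built-in readers described below.
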
### Corpus {#corpus}

The `Corpus` reader manages annotated corpora and can be used for training and
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
the [`Corpus`](/api/corpus) class.

> #### Example config
>
> ```ini
> [paths]
> train = "corpus/train.spacy"
>
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
> path = ${paths.train}
> gold_preproc = false
> max_length = 0
> limit = 0
> ```

| Name           | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path`         | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length`   | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |

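As a rough sketch of how the `.spacy` input for this reader can be produced (the text and category annotations are invented for illustration), you can serialize annotated `Doc` objects with [`DocBin`](/api/docbin) and point the `paths.train` setting at the resulting file:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp.make_doc("I like eggs.")
doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}  # invented annotations

db = DocBin()
db.add(doc)
db.to_disk("./train.spacy")  # use this path for [paths.train] in the config
```
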
### JsonlReader {#jsonlreader}

Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
file of texts keyed by `"text"`. Can be used to read the raw text corpus for
language model [pretraining](/usage/embeddings-transformers#pretraining) from a
JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.

> #### Example config
>
> ```ini
> [paths]
> pretrain = "corpus/raw_text.jsonl"
>
> [pretraining.corpus]
> @readers = "spacy.JsonlReader.v1"
> path = ${paths.pretrain}
> min_length = 0
> max_length = 0
> limit = 0
> ```

| Name         | Description |
| ------------ | ---------------------------------------------------------------------------------------------------------------------------------- |
| `path`       | The directory or filename to read from. Expects newline-delimited JSON with a key `"text"` for each record. ~~Union[str, Path]~~ |
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |

## Batchers {#batchers source="spacy/training/batchers.py" new="3"}

A data batcher implements a batching strategy that essentially turns a stream of

@@ -465,7 +531,7 @@ Instead of using one of the built-in batchers listed here, you can also
[implement your own](/usage/training#custom-code-readers-batchers), which may or
may not use a custom schedule.

### batch_by_words {#batch_by_words tag="registered function"}

Create minibatches of roughly a given number of words. If any examples are
longer than the specified batch length, they will appear in a batch by

@@ -492,7 +558,7 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |

### batch_by_sequence {#batch_by_sequence tag="registered function"}

> #### Example config
>

@@ -510,7 +576,7 @@ Create a batcher that creates batches of the specified size.
| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |

### batch_by_padded {#batch_by_padded tag="registered function"}

> #### Example config
>

@@ -383,7 +383,7 @@ hints. The new version of spaCy's machine learning library
types for models and arrays, and a custom `mypy` plugin that can be used to
type-check model definitions.

For data validation, spaCy v3.0 adopts
[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
lets you register **custom functions with typed arguments**, reference them