Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-19 01:22:14 +03:00)

Commit 5fdd6b4606: Merge branch 'develop' into nightly.spacy.io
@@ -1,8 +1,9 @@
 recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg
+recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
 include LICENSE
 include README.md
 include pyproject.toml
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
+recursive-include spacy/cli *.json
 recursive-include licenses *
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a23,<8.0.0a30",
+    "thinc>=8.0.0a27,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "smart_open>=2.0.0,<3.0.0"
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a23,<8.0.0a30
+thinc>=8.0.0a27,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -26,3 +26,4 @@ pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
+jinja2
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a23,<8.0.0a30
+    thinc>=8.0.0a27,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a23,<8.0.0a30
+    thinc>=8.0.0a27,<8.0.0a30
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -14,7 +14,7 @@ from . import pipeline  # noqa: F401
 from .cli.info import info  # noqa: F401
 from .glossary import explain  # noqa: F401
 from .about import __version__  # noqa: F401
-from .util import registry  # noqa: F401
+from .util import registry, logger  # noqa: F401

 from .errors import Errors
 from .language import Language
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a6"
+__version__ = "3.0.0a7"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -15,7 +15,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
-from .init_config import init_config  # noqa: F401
+from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
@@ -179,13 +179,13 @@ def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
     *,
     title: str = "Config validation error",
-    hint_init: bool = True,
+    hint_fill: bool = True,
 ):
     """Helper to show custom config validation errors on the CLI.

     file_path (str / Path): Optional file path of config file, used in hints.
     title (str): Title of the custom formatted error.
-    hint_init (bool): Show hint about filling config.
+    hint_fill (bool): Show hint about filling config.
     """
     try:
         yield
@@ -195,14 +195,14 @@ def show_validation_error(
         # helper for this in Thinc
         err_text = str(e).replace("Config validation error", "").strip()
         print(err_text)
-        if hint_init and "field required" in err_text:
+        if hint_fill and "field required" in err_text:
             config_path = file_path if file_path is not None else "config.cfg"
             msg.text(
                 "If your config contains missing values, you can run the 'init "
-                "config' command to fill in all the defaults, if possible:",
+                "fill-config' command to fill in all the defaults, if possible:",
                 spaced=True,
             )
-            print(f"{COMMAND} init config {config_path} --base {config_path}\n")
+            print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
         sys.exit(1)


@@ -5,7 +5,6 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
-from thinc.api import Config

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
@@ -49,11 +48,8 @@ def debug_config_cli(
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error(config_path):
-        config = Config().from_disk(config_path, overrides=overrides)
-    try:
-        nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
-    except ValueError as e:
-        msg.fail(str(e), exits=1)
+        config = util.load_config(config_path, overrides=overrides)
+        nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
     if auto_fill:
         orig_config = config.to_str()
         filled_config = nlp.config.to_str()
@@ -134,7 +130,7 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
-        cfg = Config().from_disk(config_path, overrides=config_overrides)
+        cfg = util.load_config(config_path, overrides=config_overrides)
         nlp, config = util.load_model_from_config(cfg)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
@@ -1,7 +1,7 @@
 from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation
 import typer

@@ -49,16 +49,12 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
-        cfg = Config().from_disk(config_path, overrides=config_overrides)
-    try:
-        nlp, config = util.load_model_from_config(cfg)
-    except ValueError as e:
-        msg.fail(str(e), exits=1)
+        config = util.load_config(config_path, overrides=config_overrides)
+        nlp, config = util.load_model_from_config(config_path)
     seed = config["pretraining"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)

     pipe = nlp.get_pipe(component)
     if hasattr(pipe, "model"):
         model = pipe.model
@@ -60,7 +60,6 @@ def evaluate(
     fix_random_seed()
     if use_gpu >= 0:
         require_gpu(use_gpu)
-    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
@@ -1,81 +1,178 @@
-from typing import Optional, List
+from typing import Optional, List, Tuple
+from enum import Enum
 from pathlib import Path
+from wasabi import Printer, diff_strings
 from thinc.api import Config
-from wasabi import msg
+from pydantic import BaseModel
+import srsly
+import re

-from ..util import load_model_from_config, get_lang_class, load_model
-from ._util import init_cli, Arg, Opt, show_validation_error
+from .. import util
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND


+TEMPLATE_ROOT = Path(__file__).parent / "templates"
+TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
+RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
+
+
+class Optimizations(str, Enum):
+    efficiency = "efficiency"
+    accuracy = "accuracy"
+
+
+class RecommendationsTrfItem(BaseModel):
+    name: str
+    size_factor: int
+
+
+class RecommendationsTrf(BaseModel):
+    efficiency: RecommendationsTrfItem
+    accuracy: RecommendationsTrfItem
+
+
+class RecommendationSchema(BaseModel):
+    word_vectors: Optional[str] = None
+    transformer: Optional[RecommendationsTrf] = None
+
+
 @init_cli.command("config")
 def init_config_cli(
     # fmt: off
-    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
-    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
-    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
-    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
-    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
+    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
+    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
+    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     # fmt: on
 ):
-    """Generate a starter config.cfg for training."""
-    validate_cli_args(base_path, model, lang)
-    is_stdout = str(output_path) == "-"
-    pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
-    cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
-    if is_stdout:
-        print(cfg.to_str())
-    else:
-        cfg.to_disk(output_path)
-        msg.good("Saved config", output_path)
+    """
+    Generate a starter config.cfg for training. Based on your requirements
+    specified via the CLI arguments, this command generates a config with the
+    optimal settings for you use case. This includes the choice of architecture,
+    pretrained weights and related hyperparameters.
+    """
+    if isinstance(optimize, Optimizations):  # instance of enum from the CLI
+        optimize = optimize.value
+    pipeline = [p.strip() for p in pipeline.split(",")]
+    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
+
+
+@init_cli.command("fill-config")
+def init_fill_config_cli(
+    # fmt: off
+    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
+    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
+    # fmt: on
+):
+    """
+    Fill partial config.cfg with default values. Will add all missing settings
+    from the default config and will create all objects, check the registered
+    functions for their default values and update the base config. This command
+    can be used with a config generated via the training quickstart widget:
+    https://nightly.spacy.io/usage/training#quickstart
+    """
+    fill_config(output_file, base_path, diff=diff)
+
+
+def fill_config(
+    output_file: Path, base_path: Path, *, diff: bool = False
+) -> Tuple[Config, Config]:
+    is_stdout = str(output_file) == "-"
+    msg = Printer(no_print=is_stdout)
+    with show_validation_error(hint_fill=False):
+        config = util.load_config(base_path)
+        nlp, _ = util.load_model_from_config(config, auto_fill=True)
+    before = config.to_str()
+    after = nlp.config.to_str()
+    if before == after:
+        msg.warn("Nothing to auto-fill: base config is already complete")
+    else:
+        msg.good("Auto-filled config with all values")
+    if diff and not is_stdout:
+        if before == after:
+            msg.warn("No diff to show: nothing was auto-filled")
+        else:
+            msg.divider("START CONFIG DIFF")
+            print("")
+            print(diff_strings(before, after))
+            msg.divider("END CONFIG DIFF")
+            print("")
+    save_config(nlp.config, output_file, is_stdout=is_stdout)


 def init_config(
-    output_path: Path,
-    config_path: Optional[Path],
-    model: Optional[str],
-    lang: Optional[str],
-    pipeline: Optional[List[str]],
-    silent: bool = False,
-) -> Config:
-    if config_path is not None:
-        msg.info("Generating config from base config", show=not silent)
-        with show_validation_error(config_path, hint_init=False):
-            config = Config().from_disk(config_path)
-        try:
-            nlp, _ = load_model_from_config(config, auto_fill=True)
-        except ValueError as e:
-            msg.fail(str(e), exits=1)
-        return nlp.config
-    if model is not None:
-        ext = f" with pipeline {pipeline}" if pipeline else ""
-        msg.info(f"Generating config from model {model}{ext}", show=not silent)
-        nlp = load_model(model)
-        for existing_pipe_name in nlp.pipe_names:
-            if existing_pipe_name not in pipeline:
-                nlp.remove_pipe(existing_pipe_name)
-        for pipe_name in pipeline:
-            if pipe_name not in nlp.pipe_names:
-                nlp.add_pipe(pipe_name)
-        return nlp.config
-    if lang is not None:
-        ext = f" with pipeline {pipeline}" if pipeline else ""
-        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
-        nlp = get_lang_class(lang)()
-        for pipe_name in pipeline:
-            nlp.add_pipe(pipe_name)
-        return nlp.config
-
-
-def validate_cli_args(
-    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
+    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
 ) -> None:
-    args = {"--base": config_path, "--model": model, "--lang": lang}
-    if sum(arg is not None for arg in args.values()) != 1:
-        existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
-        msg.fail(
-            "The init config command expects only one of the following arguments: "
-            "--base (base config to fill and update), --lang (language code to "
-            "use for blank config) or --model (base model to copy config from).",
-            f"Got: {existing if existing else 'no arguments'}",
-            exits=1,
-        )
+    is_stdout = str(output_file) == "-"
+    msg = Printer(no_print=is_stdout)
+    try:
+        from jinja2 import Template
+    except ImportError:
+        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
+    recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
+    lang_defaults = util.get_lang_class(lang).Defaults
+    has_letters = lang_defaults.writing_system.get("has_letters", True)
+    # Filter out duplicates since tok2vec and transformer are added by template
+    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
+    reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
+    with TEMPLATE_PATH.open("r") as f:
+        template = Template(f.read())
+    variables = {
+        "lang": lang,
+        "components": pipeline,
+        "optimize": optimize,
+        "hardware": "cpu" if cpu else "gpu",
+        "transformer_data": reco["transformer"],
+        "word_vectors": reco["word_vectors"],
+        "has_letters": has_letters,
+    }
+    base_template = template.render(variables).strip()
+    # Giving up on getting the newlines right in jinja for now
+    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
+    # Access variables declared in templates
+    template_vars = template.make_module(variables)
+    use_case = {
+        "Language": lang,
+        "Pipeline": ", ".join(pipeline),
+        "Optimize for": optimize,
+        "Hardware": variables["hardware"].upper(),
+        "Transformer": template_vars.transformer.get("name", False),
+    }
+    msg.info("Generated template specific for your use case")
+    for label, value in use_case.items():
+        msg.text(f"- {label}: {value}")
+    use_transformer = bool(template_vars.use_transformer)
+    if use_transformer:
+        require_spacy_transformers(msg)
+    with show_validation_error(hint_fill=False):
+        config = util.load_config_from_str(base_template)
+        nlp, _ = util.load_model_from_config(config, auto_fill=True)
+    if use_transformer:
+        nlp.config.pop("pretraining", {})  # TODO: solve this better
+    msg.good("Auto-filled config with all values")
+    save_config(nlp.config, output_file, is_stdout=is_stdout)
+
+
+def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:
+    msg = Printer(no_print=is_stdout)
+    if is_stdout:
+        print(config.to_str())
+    else:
+        config.to_disk(output_file, interpolate=False)
+        msg.good("Saved config", output_file)
+        msg.text("You can now add your data and train your model:")
+        variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
+        print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
+
+
+def require_spacy_transformers(msg: Printer) -> None:
+    try:
+        import spacy_transformers  # noqa: F401
+    except ImportError:
+        msg.fail(
+            "Using a transformer-based pipeline requires spacy-transformers "
+            "to be installed.",
+            exits=1,
+        )
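As a quick, hypothetical usage sketch of the two entry points introduced above (function names and keyword arguments are taken from the diff; the file paths are placeholders, and jinja2 must be installed):

from pathlib import Path
from spacy.cli.init_config import init_config, fill_config

# Generate a starter config for an English tagger/parser/NER pipeline on CPU.
init_config(Path("config.cfg"), lang="en", pipeline=["tagger", "parser", "ner"],
            optimize="efficiency", cpu=True)

# Fill a partial base config with all defaults and print a diff of the changes.
fill_config(Path("filled.cfg"), Path("config.cfg"), diff=True)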
@@ -5,7 +5,7 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
@@ -88,7 +88,7 @@ def pretrain(
         msg.info("Using CPU")
     msg.info(f"Loading config from: {config_path}")
     with show_validation_error(config_path):
-        config = Config().from_disk(config_path, overrides=config_overrides)
+        config = util.load_config(config_path, overrides=config_overrides)
         nlp, config = util.load_model_from_config(config)
     # TODO: validate that [pretraining] block exists
     if not output_dir.exists():
spacy/cli/templates/quickstart_training.jinja (new file, 237 lines)
@@ -0,0 +1,237 @@
+{# This is a template for training configs used for the quickstart widget in
+the docs and the init config command. It encodes various best practices and
+can help generate the best possible configuration, given a user's requirements. #}
+{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
+{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
+[paths]
+train = ""
+dev = ""
+
+[system]
+use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+
+[nlp]
+lang = "{{ lang }}"
+{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
+pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+{# TRANSFORMER PIPELINE #}
+{%- if use_transformer -%}
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "{{ transformer["name"] }}"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "strided_spans.v1"
+window = 128
+stride = 96
+
+{% if "tagger" in components %}
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
+{% if "parser" in components -%}
+[components.parser]
+factory = "parser"
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.parser.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
+{% if "ner" in components -%}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 64
+maxout_pieces = 2
+use_upper = false
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.ner.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
+{# NON-TRANSFORMER PIPELINE #}
+{% else -%}
+
+{%- if hardware == "gpu" -%}
+# There are no recommended transformer weights available for language '{{ lang }}'
+# yet, so the pipeline described here is not transformer-based.
+{%- endif %}
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+rows = {{ 2000 if optimize == "efficiency" else 7000 }}
+also_embed_subwords = {{ true if has_letters else false }}
+also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = {{ 96 if optimize == "efficiency" else 256 }}
+depth = {{ 4 if optimize == "efficiency" else 8 }}
+window_size = 1
+maxout_pieces = 3
+
+{% if "tagger" in components %}
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "parser" in components -%}
+[components.parser]
+factory = "parser"
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = true
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "ner" in components %}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{% endif %}
+{% endif %}
+
+{% for pipe in components %}
+{% if pipe not in ["tagger", "parser", "ner"] %}
+{# Other components defined by the user: we just assume they're factories #}
+[components.{{ pipe }}]
+factory = "{{ pipe }}"
+{% endif %}
+{% endfor %}
+
+[training]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
+{% if use_transformer -%}
+accumulate_gradient = {{ transformer["size_factor"] }}
+{% endif %}
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.train_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+max_length = {{ 500 if hardware == "gpu" else 0 }}
+
+[training.dev_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:dev}
+max_length = 0
+
+{% if use_transformer %}
+[training.batcher]
+@batchers = "batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+{%- else %}
+[training.batcher]
+@batchers = "batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+{% endif %}
+
+[training.score_weights]
+{%- if "tagger" in components %}
+tag_acc = {{ (1.0 / components|length)|round(2) }}
+{%- endif -%}
+{%- if "parser" in components %}
+dep_uas = 0.0
+dep_las = {{ (1.0 / components|length)|round(2) }}
+sents_f = 0.0
+{%- endif %}
+{%- if "ner" in components %}
+ents_f = {{ (1.0 / components|length)|round(2) }}
+ents_p = 0.0
+ents_r = 0.0
+{%- endif -%}
spacy/cli/templates/quickstart_training_recommendations.json (new file, 13 lines)
@@ -0,0 +1,13 @@
+{
+    "en": {
+        "word_vectors": "en_vectors_web_lg",
+        "transformer": {
+            "efficiency": { "name": "roberta-base", "size_factor": 3 },
+            "accuracy": { "name": "roberta-base", "size_factor": 3 }
+        }
+    },
+    "de": {
+        "word_vectors": null,
+        "transformer": null
+    }
+}
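To illustrate how these entries line up with the RecommendationSchema defined in the init_config module above, the English entry can be validated like this (a sketch only; the schema classes are copied from the diff):

from typing import Optional
from pydantic import BaseModel

# Schema classes copied from the init_config diff above, for illustration.
class RecommendationsTrfItem(BaseModel):
    name: str
    size_factor: int

class RecommendationsTrf(BaseModel):
    efficiency: RecommendationsTrfItem
    accuracy: RecommendationsTrfItem

class RecommendationSchema(BaseModel):
    word_vectors: Optional[str] = None
    transformer: Optional[RecommendationsTrf] = None

# The "en" entry from quickstart_training_recommendations.json:
en_entry = {
    "word_vectors": "en_vectors_web_lg",
    "transformer": {
        "efficiency": {"name": "roberta-base", "size_factor": 3},
        "accuracy": {"name": "roberta-base", "size_factor": 3},
    },
}
print(RecommendationSchema(**en_entry).dict())
# Languages without an entry fall back to the schema defaults, as in the CLI code:
print(RecommendationSchema(**{}).dict())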
@@ -9,6 +9,7 @@ from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 from thinc.api import Config, Optimizer
 import random
 import typer
+import logging

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, get_sourced_components
@@ -17,7 +18,6 @@ from .. import util
 from ..gold.example import Example
 from ..errors import Errors
-

 # Don't remove - required to load the built-in architectures
 from ..ml import models  # noqa: F401

@@ -48,7 +48,7 @@ def train_cli(
     used to register custom functions and architectures that can then be
     referenced in the config.
     """
-    util.set_env_log(verbose)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
@@ -75,7 +75,7 @@ def train(
         msg.info("Using CPU")
     msg.info(f"Loading config and nlp from: {config_path}")
     with show_validation_error(config_path):
-        config = Config().from_disk(config_path, overrides=config_overrides)
+        config = util.load_config(config_path, overrides=config_overrides)
     if config.get("training", {}).get("seed") is not None:
         fix_random_seed(config["training"]["seed"])
     # Use original config here before it's resolved to functions
@@ -102,9 +102,9 @@ def train(
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
             msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training()
+            nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp))
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)

     if tag_map:
         # Replace tag map with provided mapping
@@ -295,7 +295,11 @@ def train_while_improving(
                 nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
-            if name not in exclude and hasattr(proc, "model"):
+            if (
+                name not in exclude
+                and hasattr(proc, "model")
+                and proc.model not in (True, False, None)
+            ):
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
@@ -55,12 +55,6 @@ class Warnings:
             "loaded. (Shape: {shape})")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W022 = ("Training a new part-of-speech tagger using a model with no "
-            "lemmatization rules or data. This means that the trained model "
-            "may not be able to lemmatize correctly. If this is intentional "
-            "or the language you're using doesn't have lemmatization data, "
-            "you can ignore this warning. If this is surprising, make sure you "
-            "have the spacy-lookups-data package installed.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses.")
@@ -482,6 +476,15 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+            "Expected function that returns an iterable of Example objects but "
+            "got: {obj}")
+    E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
+            "'{name}'. If the component is trainable and you want to use this "
+            "method, make sure it's overwritten on the subclass. If your "
+            "component isn't trainable, add a method that does nothing or "
+            "don't use the Pipe base class.")
+    E940 = ("Found NaN values in scores.")
     E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
             "model from a shortcut, which is deprecated as of spaCy v3.0. To "
             "load the model, use its full name instead:\n\n"
@@ -578,8 +581,7 @@ class Errors:
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E978 = ("The '{method}' method of {name} takes a list of Example objects, "
-            "but found {types} instead.")
+    E978 = ("The {name} method takes a list of Example objects, but got: {types}")
     E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
@@ -1,5 +1,5 @@
 from .corpus import Corpus  # noqa: F401
-from .example import Example  # noqa: F401
+from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
@@ -62,7 +62,7 @@ class Corpus:
             if str(path) in seen:
                 continue
             seen.add(str(path))
-            if path.parts[-1].startswith("."):
+            if path.parts and path.parts[-1].startswith("."):
                 continue
             elif path.is_dir():
                 paths.extend(path.iterdir())
@@ -1,5 +1,5 @@
+from collections import Iterable as IterableInstance
 import warnings

 import numpy

 from ..tokens.doc cimport Doc
@@ -26,6 +26,22 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
     return output


+def validate_examples(examples, method):
+    """Check that a batch of examples received during processing is valid.
+    This function lives here to prevent circular imports.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
+    if not isinstance(examples, IterableInstance):
+        err = Errors.E978.format(name=method, types=type(examples))
+        raise TypeError(err)
+    wrong = set([type(eg) for eg in examples if not isinstance(eg, Example)])
+    if wrong:
+        err = Errors.E978.format(name=method, types=wrong)
+        raise TypeError(err)
+
+
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
         if predicted is None:
@@ -263,12 +279,10 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 values.append([vocab.morphology.add(v) for v in value])
             else:
                 attrs.append(key)
-                try:
-                    values.append([vocab.strings.add(v) for v in value])
-                except TypeError:
-                    types= set([type(v) for v in value])
-                    raise TypeError(Errors.E969.format(field=key, types=types)) from None
+                if not all(isinstance(v, str) for v in value):
+                    types = set([type(v) for v in value])
+                    raise TypeError(Errors.E969.format(field=key, types=types)) from None
+                values.append([vocab.strings.add(v) for v in value])
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T

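A small sketch of how the new validate_examples helper behaves (hypothetical inputs, using the spacy-nightly API at this commit; it raises the reworded E978 error for anything that is not an iterable of Example objects):

import spacy
from spacy.gold import Example, validate_examples

nlp = spacy.blank("en")
examples = [Example(nlp.make_doc("A sentence."), nlp.make_doc("A sentence."))]

validate_examples(examples, "Language.update")  # passes: an iterable of Example objects
try:
    validate_examples(["not an Example"], "Language.update")
except TypeError as err:
    print(err)  # E978: The Language.update method takes a list of Example objects ...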
@ -5,7 +5,6 @@ import random
|
||||||
import itertools
|
import itertools
|
||||||
import weakref
|
import weakref
|
||||||
import functools
|
import functools
|
||||||
from collections import Iterable as IterableInstance
|
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import copy, deepcopy
|
from copy import copy, deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -19,10 +18,10 @@ from timeit import default_timer as timer
|
||||||
from .tokens.underscore import Underscore
|
from .tokens.underscore import Underscore
|
||||||
from .vocab import Vocab, create_vocab
|
from .vocab import Vocab, create_vocab
|
||||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||||
from .gold import Example
|
from .gold import Example, validate_examples
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .util import create_default_optimizer, registry
|
from .util import create_default_optimizer, registry
|
||||||
from .util import SimpleFrozenDict, combine_score_weights
|
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .lang.punctuation import TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
|
@ -37,7 +36,7 @@ from . import about
|
||||||
|
|
||||||
# This is the base config will all settings (training etc.)
|
# This is the base config will all settings (training etc.)
|
||||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||||||
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
|
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
|
||||||
|
|
||||||
|
|
||||||
class BaseDefaults:
|
class BaseDefaults:
|
||||||
|
@ -46,7 +45,7 @@ class BaseDefaults:
|
||||||
Language.Defaults.
|
Language.Defaults.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
config: Config = Config()
|
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
|
||||||
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
||||||
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
||||||
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
||||||
|
@ -135,7 +134,7 @@ class Language:
|
||||||
# of the rest.
|
# of the rest.
|
||||||
util.registry._entry_point_factories.get_all()
|
util.registry._entry_point_factories.get_all()
|
||||||
|
|
||||||
self._config = util.deep_merge_configs(self.default_config, DEFAULT_CONFIG)
|
self._config = DEFAULT_CONFIG.merge(self.default_config)
|
||||||
self._meta = dict(meta)
|
self._meta = dict(meta)
|
||||||
self._path = None
|
self._path = None
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
@ -168,9 +167,7 @@ class Language:
|
||||||
|
|
||||||
def __init_subclass__(cls, **kwargs):
|
def __init_subclass__(cls, **kwargs):
|
||||||
super().__init_subclass__(**kwargs)
|
super().__init_subclass__(**kwargs)
|
||||||
cls.default_config = util.deep_merge_configs(
|
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
|
||||||
cls.Defaults.config, DEFAULT_CONFIG
|
|
||||||
)
|
|
||||||
cls.default_config["nlp"]["lang"] = cls.lang
|
cls.default_config["nlp"]["lang"] = cls.lang
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -533,6 +530,7 @@ class Language:
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
*,
|
*,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Create a pipeline component. Mostly used internally. To create and
|
"""Create a pipeline component. Mostly used internally. To create and
|
||||||
|
@ -543,6 +541,7 @@ class Language:
|
||||||
Defaults to factory name if not set.
|
Defaults to factory name if not set.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -569,7 +568,7 @@ class Language:
|
||||||
# This is unideal, but the alternative would mean you always need to
|
# This is unideal, but the alternative would mean you always need to
|
||||||
# specify the full config settings, which is not really viable.
|
# specify the full config settings, which is not really viable.
|
||||||
if pipe_meta.default_config:
|
if pipe_meta.default_config:
|
||||||
config = util.deep_merge_configs(config, pipe_meta.default_config)
|
config = Config(pipe_meta.default_config).merge(config)
|
||||||
# We need to create a top-level key because Thinc doesn't allow resolving
|
# We need to create a top-level key because Thinc doesn't allow resolving
|
||||||
# top-level references to registered functions. Also gives nicer errors.
|
# top-level references to registered functions. Also gives nicer errors.
|
||||||
# The name allows components to know their pipe name and use it in the
|
# The name allows components to know their pipe name and use it in the
|
||||||
|
@ -583,12 +582,14 @@ class Language:
|
||||||
cfg = {factory_name: config}
|
cfg = {factory_name: config}
|
||||||
# We're calling the internal _fill here to avoid constructing the
|
# We're calling the internal _fill here to avoid constructing the
|
||||||
# registered functions twice
|
# registered functions twice
|
||||||
# TODO: customize validation to make it more readable / relate it to
|
|
||||||
# pipeline component and why it failed, explain default config
|
|
||||||
resolved, filled = registry.resolve(cfg, validate=validate)
|
resolved, filled = registry.resolve(cfg, validate=validate)
|
||||||
filled = filled[factory_name]
|
filled = Config(filled[factory_name])
|
||||||
filled["factory"] = factory_name
|
filled["factory"] = factory_name
|
||||||
filled.pop("@factories", None)
|
filled.pop("@factories", None)
|
||||||
|
# Merge the final filled config with the raw config (including non-
|
||||||
|
# interpolated variables)
|
||||||
|
if raw_config:
|
||||||
|
filled = filled.merge(raw_config)
|
||||||
self._pipe_configs[name] = filled
|
self._pipe_configs[name] = filled
|
||||||
return resolved[factory_name]
|
return resolved[factory_name]
|
||||||
|
|
||||||
|
@ -614,7 +615,10 @@ class Language:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
pipe = source.get_pipe(source_name)
|
pipe = source.get_pipe(source_name)
|
||||||
pipe_config = util.copy_config(source.config["components"][source_name])
|
# Make sure the source config is interpolated so we don't end up with
|
||||||
|
# orphaned variables in our final config
|
||||||
|
source_config = source.config.interpolate()
|
||||||
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
|
@ -629,6 +633,7 @@ class Language:
|
||||||
last: Optional[bool] = None,
|
last: Optional[bool] = None,
|
||||||
source: Optional["Language"] = None,
|
source: Optional["Language"] = None,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Add a component to the processing pipeline. Valid components are
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
|
@ -650,6 +655,7 @@ class Language:
|
||||||
component from.
|
component from.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -679,7 +685,11 @@ class Language:
|
||||||
lang_code=self.lang,
|
lang_code=self.lang,
|
||||||
)
|
)
|
||||||
pipe_component = self.create_pipe(
|
pipe_component = self.create_pipe(
|
||||||
factory_name, name=name, config=config, validate=validate,
|
factory_name,
|
||||||
|
name=name,
|
||||||
|
config=config,
|
||||||
|
raw_config=raw_config,
|
||||||
|
validate=validate,
|
||||||
)
|
)
|
||||||
pipe_index = self._get_pipe_index(before, after, first, last)
|
pipe_index = self._get_pipe_index(before, after, first, last)
|
||||||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||||||
|
@ -935,17 +945,7 @@ class Language:
|
||||||
losses = {}
|
losses = {}
|
||||||
if len(examples) == 0:
|
if len(examples) == 0:
|
||||||
return losses
|
return losses
|
||||||
if not isinstance(examples, IterableInstance):
|
validate_examples(examples, "Language.update")
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(
|
|
||||||
name="language", method="update", types=type(examples)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
|
|
||||||
if wrong_types:
|
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(name="language", method="update", types=wrong_types)
|
|
||||||
)
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
self._optimizer = create_default_optimizer()
|
self._optimizer = create_default_optimizer()
|
||||||
|
@ -962,7 +962,11 @@ class Language:
|
||||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||||
if sgd not in (None, False):
|
if sgd not in (None, False):
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name not in exclude and hasattr(proc, "model"):
|
if (
|
||||||
|
name not in exclude
|
||||||
|
and hasattr(proc, "model")
|
||||||
|
and proc.model not in (True, False, None)
|
||||||
|
):
|
||||||
proc.model.finish_update(sgd)
|
proc.model.finish_update(sgd)
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
|
@ -999,19 +1003,7 @@ class Language:
|
||||||
"""
|
"""
|
||||||
if len(examples) == 0:
|
if len(examples) == 0:
|
||||||
return
|
return
|
||||||
if not isinstance(examples, IterableInstance):
|
validate_examples(examples, "Language.rehearse")
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(
|
|
||||||
name="language", method="rehearse", types=type(examples)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
|
|
||||||
if wrong_types:
|
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(
|
|
||||||
name="language", method="rehearse", types=wrong_types
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
self._optimizer = create_default_optimizer()
|
self._optimizer = create_default_optimizer()
|
||||||
|
@ -1060,7 +1052,15 @@ class Language:
|
||||||
if get_examples is None:
|
if get_examples is None:
|
||||||
get_examples = lambda: []
|
get_examples = lambda: []
|
||||||
else: # Populate vocab
|
else: # Populate vocab
|
||||||
|
if not hasattr(get_examples, "__call__"):
|
||||||
|
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
||||||
|
raise ValueError(err)
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
|
if not isinstance(example, Example):
|
||||||
|
err = Errors.E978.format(
|
||||||
|
name="Language.begin_training", types=type(example)
|
||||||
|
)
|
||||||
|
raise ValueError(err)
|
||||||
for word in [t.text for t in example.reference]:
|
for word in [t.text for t in example.reference]:
|
||||||
_ = self.vocab[word] # noqa: F841
|
_ = self.vocab[word] # noqa: F841
|
||||||
if device >= 0: # TODO: do we need this here?
|
if device >= 0: # TODO: do we need this here?
|
||||||
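Throughout this commit, begin_training stops accepting a bare list and insists on a zero-argument callable, raising E930 otherwise. A short usage sketch of the new contract, assuming the nightly Example.from_dict API and reusing the training tuple from the test data further down:

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("ner")
    TRAIN_DATA = [("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]})]

    def get_examples():
        # begin_training calls this itself, possibly more than once
        for text, annots in TRAIN_DATA:
            yield Example.from_dict(nlp.make_doc(text), annots)

    optimizer = nlp.begin_training(get_examples)   # passing a plain list now raises E930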
|
@ -1133,17 +1133,7 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#evaluate
|
DOCS: https://spacy.io/api/language#evaluate
|
||||||
"""
|
"""
|
||||||
if not isinstance(examples, IterableInstance):
|
validate_examples(examples, "Language.evaluate")
|
||||||
err = Errors.E978.format(
|
|
||||||
name="language", method="evaluate", types=type(examples)
|
|
||||||
)
|
|
||||||
raise TypeError(err)
|
|
||||||
wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
|
|
||||||
if wrong_types:
|
|
||||||
err = Errors.E978.format(
|
|
||||||
name="language", method="evaluate", types=wrong_types
|
|
||||||
)
|
|
||||||
raise TypeError(err)
|
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
if scorer_cfg is None:
|
if scorer_cfg is None:
|
||||||
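Language.evaluate gets the same treatment: the manual type checks collapse into a validate_examples call, so evaluation data must already be Example objects. A small usage sketch on a blank pipeline, with data adapted from the test fixtures below:

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    examples = [
        Example.from_dict(nlp.make_doc("I like London and Berlin."),
                          {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
    ]
    scores = nlp.evaluate(examples)   # raw (text, dict) tuples would raise E978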
|
@ -1400,7 +1390,9 @@ class Language:
|
||||||
DOCS: https://spacy.io/api/language#from_config
|
DOCS: https://spacy.io/api/language#from_config
|
||||||
"""
|
"""
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
config = util.deep_merge_configs(config, cls.default_config)
|
config = Config(
|
||||||
|
cls.default_config, section_order=CONFIG_SECTION_ORDER
|
||||||
|
).merge(config)
|
||||||
if "nlp" not in config:
|
if "nlp" not in config:
|
||||||
raise ValueError(Errors.E985.format(config=config))
|
raise ValueError(Errors.E985.format(config=config))
|
||||||
config_lang = config["nlp"]["lang"]
|
config_lang = config["nlp"]["lang"]
|
||||||
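from_config now builds the defaults as a thinc Config with an explicit section order and merges the user config on top, replacing util.deep_merge_configs. A minimal sketch of that merge semantics with made-up section contents (CONFIG_SECTION_ORDER itself is defined elsewhere in spaCy):

    from thinc.api import Config

    defaults = Config({"nlp": {"lang": "en"}, "training": {"max_epochs": 10, "seed": 0}})
    user_cfg = Config({"training": {"max_epochs": 3}})

    # merge() deep-merges section by section; the caller's values win over the
    # defaults, and section_order controls how sections are written back out
    filled = Config(defaults, section_order=["nlp", "training"]).merge(user_cfg)
    assert filled["training"]["max_epochs"] == 3   # user value wins
    assert filled["training"]["seed"] == 0         # default preserved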
|
@ -1438,16 +1430,20 @@ class Language:
|
||||||
or lang_cls is not cls
|
or lang_cls is not cls
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
||||||
|
# Note that we don't load vectors here, instead they get loaded explicitly
|
||||||
|
# inside stuff like the spacy train function. If we loaded them here,
|
||||||
|
# then we would load them twice at runtime: once when we make from config,
|
||||||
|
# and then again when we load from disk.
|
||||||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
||||||
if after_creation is not None:
|
if after_creation is not None:
|
||||||
nlp = after_creation(nlp)
|
nlp = after_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
||||||
# Note that we don't load vectors here, instead they get loaded explicitly
|
# To create the components we need to use the final interpolated config
|
||||||
# inside stuff like the spacy train function. If we loaded them here,
|
# so all values are available (if component configs use variables).
|
||||||
# then we would load them twice at runtime: once when we make from config,
|
# Later we replace the component config with the raw config again.
|
||||||
# and then again when we load from disk.
|
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||||
pipeline = config.get("components", {})
|
pipeline = interpolated.get("components", {})
|
||||||
# If components are loaded from a source (existing models), we cache
|
# If components are loaded from a source (existing models), we cache
|
||||||
# them here so they're only loaded once
|
# them here so they're only loaded once
|
||||||
source_nlps = {}
|
source_nlps = {}
|
||||||
|
@ -1456,6 +1452,7 @@ class Language:
|
||||||
opts = ", ".join(pipeline.keys())
|
opts = ", ".join(pipeline.keys())
|
||||||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||||||
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
||||||
|
raw_config = Config(filled["components"][pipe_name])
|
||||||
if pipe_name not in disable:
|
if pipe_name not in disable:
|
||||||
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
||||||
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
||||||
|
@ -1465,7 +1462,11 @@ class Language:
|
||||||
# The pipe name (key in the config) here is the unique name
|
# The pipe name (key in the config) here is the unique name
|
||||||
# of the component, not necessarily the factory
|
# of the component, not necessarily the factory
|
||||||
nlp.add_pipe(
|
nlp.add_pipe(
|
||||||
factory, name=pipe_name, config=pipe_cfg, validate=validate,
|
factory,
|
||||||
|
name=pipe_name,
|
||||||
|
config=pipe_cfg,
|
||||||
|
validate=validate,
|
||||||
|
raw_config=raw_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model = pipe_cfg["source"]
|
model = pipe_cfg["source"]
|
||||||
|
@ -1663,7 +1664,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E092)
|
raise ValueError(Errors.E092)
|
||||||
for name, proc in nlp.pipeline:
|
for name, proc in nlp.pipeline:
|
||||||
if not hasattr(proc, "cfg"):
|
if not hasattr(proc, "cfg") or not isinstance(proc.cfg, dict):
|
||||||
continue
|
continue
|
||||||
proc.cfg.setdefault("deprecation_fixes", {})
|
proc.cfg.setdefault("deprecation_fixes", {})
|
||||||
proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name
|
proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name
|
||||||
|
|
|
@ -9,6 +9,7 @@ from .functions import merge_subtokens
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ._parser_internals import nonproj
|
from ._parser_internals import nonproj
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -147,6 +148,7 @@ cdef class DependencyParser(Parser):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/dependencyparser#score
|
DOCS: https://spacy.io/api/dependencyparser#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "DependencyParser.score")
|
||||||
def dep_getter(token, attr):
|
def dep_getter(token, attr):
|
||||||
dep = getattr(token, attr)
|
dep = getattr(token, attr)
|
||||||
dep = token.vocab.strings.as_string(dep).lower()
|
dep = token.vocab.strings.as_string(dep).lower()
|
||||||
|
|
|
@ -11,7 +11,7 @@ from ..tokens import Doc
|
||||||
from .pipe import Pipe, deserialize_config
|
from .pipe import Pipe, deserialize_config
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..gold import Example
|
from ..gold import Example, validate_examples
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -142,7 +142,7 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]] = lambda: [],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
|
@ -197,14 +197,9 @@ class EntityLinker(Pipe):
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
if not examples:
|
if not examples:
|
||||||
return losses
|
return losses
|
||||||
|
validate_examples(examples, "EntityLinker.update")
|
||||||
sentence_docs = []
|
sentence_docs = []
|
||||||
try:
|
docs = [eg.predicted for eg in examples]
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
except AttributeError:
|
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(name="EntityLinker", method="update", types=types)
|
|
||||||
) from None
|
|
||||||
if set_annotations:
|
if set_annotations:
|
||||||
# This seems simpler than other ways to get that exact output -- but
|
# This seems simpler than other ways to get that exact output -- but
|
||||||
# it does run the model twice :(
|
# it does run the model twice :(
|
||||||
|
@ -250,6 +245,7 @@ class EntityLinker(Pipe):
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def get_loss(self, examples: Iterable[Example], sentence_encodings):
|
def get_loss(self, examples: Iterable[Example], sentence_encodings):
|
||||||
|
validate_examples(examples, "EntityLinker.get_loss")
|
||||||
entity_encodings = []
|
entity_encodings = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import ensure_path, to_disk, from_disk
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_ENT_ID_SEP = "||"
|
DEFAULT_ENT_ID_SEP = "||"
|
||||||
|
@ -312,6 +313,7 @@ class EntityRuler:
|
||||||
return label
|
return label
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
validate_examples(examples, "EntityRuler.score")
|
||||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||||
|
|
||||||
def from_bytes(
|
def from_bytes(
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from typing import Optional, List, Dict, Any
|
from typing import Optional, List, Dict, Any
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
|
@ -9,6 +8,7 @@ from ..lookups import Lookups, load_lookups
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..tokens import Doc, Token
|
from ..tokens import Doc, Token
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
from ..gold import validate_examples
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -127,6 +127,7 @@ class Lemmatizer(Pipe):
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.name = name
|
||||||
self._mode = mode
|
self._mode = mode
|
||||||
self.lookups = lookups if lookups is not None else Lookups()
|
self.lookups = lookups if lookups is not None else Lookups()
|
||||||
self.overwrite = overwrite
|
self.overwrite = overwrite
|
||||||
|
@ -135,10 +136,10 @@ class Lemmatizer(Pipe):
|
||||||
elif self.mode == "rule":
|
elif self.mode == "rule":
|
||||||
self.lemmatize = self.rule_lemmatize
|
self.lemmatize = self.rule_lemmatize
|
||||||
else:
|
else:
|
||||||
try:
|
mode_attr = f"{self.mode}_lemmatize"
|
||||||
self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
|
if not hasattr(self, mode_attr):
|
||||||
except AttributeError:
|
|
||||||
raise ValueError(Errors.E1003.format(mode=mode))
|
raise ValueError(Errors.E1003.format(mode=mode))
|
||||||
|
self.lemmatize = getattr(self, mode_attr)
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
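The Lemmatizer now resolves its mode by checking for a "<mode>_lemmatize" method up front and raising E1003 for unknown modes, instead of catching AttributeError after the fact. One practical consequence is that a subclass can add a mode just by defining such a method; a hypothetical sketch, assuming Lemmatizer is importable from spacy.pipeline:

    from spacy.pipeline import Lemmatizer

    class UpperLemmatizer(Lemmatizer):
        # mode="upper" resolves to this method via f"{mode}_lemmatize"
        def upper_lemmatize(self, token):
            return [token.text.upper()]

    # With mode="upper" the constructor finds upper_lemmatize and assigns it to
    # self.lemmatize; any other unknown mode now fails early with ValueError(E1003).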
|
@ -271,6 +272,7 @@ class Lemmatizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lemmatizer#score
|
DOCS: https://spacy.io/api/lemmatizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Lemmatizer.score")
|
||||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||||
|
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
|
|
|
@ -6,15 +6,16 @@ from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..morphology cimport Morphology
|
from ..morphology cimport Morphology
|
||||||
|
|
||||||
from ..parts_of_speech import IDS as POS_IDS
|
from ..parts_of_speech import IDS as POS_IDS
|
||||||
from ..symbols import POS
|
from ..symbols import POS
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from .pipe import deserialize_config
|
from .pipe import deserialize_config
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -126,7 +127,7 @@ class Morphologizer(Tagger):
|
||||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
@ -140,6 +141,9 @@ class Morphologizer(Tagger):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#begin_training
|
DOCS: https://spacy.io/api/morphologizer#begin_training
|
||||||
"""
|
"""
|
||||||
|
if not hasattr(get_examples, "__call__"):
|
||||||
|
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
|
||||||
|
raise ValueError(err)
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
for i, token in enumerate(example.reference):
|
for i, token in enumerate(example.reference):
|
||||||
pos = token.pos_
|
pos = token.pos_
|
||||||
|
@ -192,6 +196,7 @@ class Morphologizer(Tagger):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Morphologizer.get_loss")
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = []
|
truths = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
@ -228,6 +233,7 @@ class Morphologizer(Tagger):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/morphologizer#score
|
DOCS: https://spacy.io/api/morphologizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Morphologizer.score")
|
||||||
results = {}
|
results = {}
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
||||||
|
|
|
@ -8,6 +8,7 @@ from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
|
from ..gold import validate_examples
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ._parser_internals import nonproj
|
from ._parser_internals import nonproj
|
||||||
from ..attrs import POS, ID
|
from ..attrs import POS, ID
|
||||||
|
@ -80,10 +81,11 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||||
gold_examples = nonproj.preprocess_training_data(get_examples())
|
if not hasattr(get_examples, "__call__"):
|
||||||
# for raw_text, doc_annot in gold_tuples:
|
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||||
for example in gold_examples:
|
raise ValueError(err)
|
||||||
|
for example in get_examples():
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
label = self.make_label(token)
|
label = self.make_label(token)
|
||||||
if label is not None and label not in self.labels:
|
if label is not None and label not in self.labels:
|
||||||
|
@ -175,7 +177,7 @@ class ClozeMultitask(Pipe):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||||
self.model.output_layer.begin_training(X)
|
self.model.output_layer.begin_training(X)
|
||||||
|
@ -189,6 +191,7 @@ class ClozeMultitask(Pipe):
|
||||||
return tokvecs, vectors
|
return tokvecs, vectors
|
||||||
|
|
||||||
def get_loss(self, examples, vectors, prediction):
|
def get_loss(self, examples, vectors, prediction):
|
||||||
|
validate_examples(examples, "ClozeMultitask.get_loss")
|
||||||
# The simplest way to implement this would be to vstack the
|
# The simplest way to implement this would be to vstack the
|
||||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||||
|
@ -206,18 +209,16 @@ class ClozeMultitask(Pipe):
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
try:
|
validate_examples(examples, "ClozeMultitask.rehearse")
|
||||||
predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
|
docs = [eg.predicted for eg in examples]
|
||||||
except AttributeError:
|
predictions, bp_predictions = self.model.begin_update()
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None
|
|
||||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||||
bp_predictions(d_predictions)
|
bp_predictions(d_predictions)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
self.model.finish_update(sgd)
|
self.model.finish_update(sgd)
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
return losses
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
|
@ -7,6 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -50,7 +51,7 @@ def make_ner(
|
||||||
):
|
):
|
||||||
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
||||||
identifies non-overlapping labelled spans of tokens.
|
identifies non-overlapping labelled spans of tokens.
|
||||||
|
|
||||||
The transition-based algorithm used encodes certain assumptions that are
|
The transition-based algorithm used encodes certain assumptions that are
|
||||||
effective for "traditional" named entity recognition tasks, but may not be
|
effective for "traditional" named entity recognition tasks, but may not be
|
||||||
a good fit for every span identification problem. Specifically, the loss
|
a good fit for every span identification problem. Specifically, the loss
|
||||||
|
@ -120,4 +121,5 @@ cdef class EntityRecognizer(Parser):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityrecognizer#score
|
DOCS: https://spacy.io/api/entityrecognizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "EntityRecognizer.score")
|
||||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||||
|
|
|
@ -1,2 +1,5 @@
|
||||||
cdef class Pipe:
|
cdef class Pipe:
|
||||||
|
cdef public object vocab
|
||||||
|
cdef public object model
|
||||||
cdef public str name
|
cdef public str name
|
||||||
|
cdef public object cfg
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
import srsly
|
import srsly
|
||||||
|
from thinc.api import set_dropout_rate, Model
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from ..util import create_default_optimizer
|
from ..gold import validate_examples
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -16,7 +17,6 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe
|
DOCS: https://spacy.io/api/pipe
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model, name, **cfg):
|
def __init__(self, vocab, model, name, **cfg):
|
||||||
"""Initialize a pipeline component.
|
"""Initialize a pipeline component.
|
||||||
|
|
||||||
|
@ -27,7 +27,10 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#init
|
DOCS: https://spacy.io/api/pipe#init
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
self.vocab = vocab
|
||||||
|
self.model = model
|
||||||
|
self.name = name
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
"""Apply the pipe to one document. The document is modified in place,
|
"""Apply the pipe to one document. The document is modified in place,
|
||||||
|
@ -68,7 +71,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#predict
|
DOCS: https://spacy.io/api/pipe#predict
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
|
||||||
|
|
||||||
def set_annotations(self, docs, scores):
|
def set_annotations(self, docs, scores):
|
||||||
"""Modify a batch of documents, using pre-computed scores.
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
@ -78,7 +81,43 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#set_annotations
|
DOCS: https://spacy.io/api/pipe#set_annotations
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
|
||||||
|
|
||||||
|
def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
|
||||||
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
set_annotations (bool): Whether or not to update the Example objects
|
||||||
|
with the predictions.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/pipe#update
|
||||||
|
"""
|
||||||
|
if losses is None:
|
||||||
|
losses = {}
|
||||||
|
if not hasattr(self, "model") or self.model in (None, True, False):
|
||||||
|
return losses
|
||||||
|
losses.setdefault(self.name, 0.0)
|
||||||
|
validate_examples(examples, "Pipe.update")
|
||||||
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||||
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
return
|
||||||
|
set_dropout_rate(self.model, drop)
|
||||||
|
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
|
||||||
|
loss, d_scores = self.get_loss(examples, scores)
|
||||||
|
bp_scores(d_scores)
|
||||||
|
if sgd not in (None, False):
|
||||||
|
self.model.finish_update(sgd)
|
||||||
|
losses[self.name] += loss
|
||||||
|
if set_annotations:
|
||||||
|
docs = [eg.predicted for eg in examples]
|
||||||
|
self.set_annotations(docs, scores=scores)
|
||||||
|
return losses
|
||||||
|
|
||||||
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
||||||
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||||
|
@ -107,7 +146,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#get_loss
|
DOCS: https://spacy.io/api/pipe#get_loss
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
"""Add an output label, to be predicted by the model. It's possible to
|
"""Add an output label, to be predicted by the model. It's possible to
|
||||||
|
@ -119,7 +158,7 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#add_label
|
DOCS: https://spacy.io/api/pipe#add_label
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||||
|
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
"""Create an optimizer for the pipeline component.
|
"""Create an optimizer for the pipeline component.
|
||||||
|
@ -128,9 +167,9 @@ cdef class Pipe:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipe#create_optimizer
|
DOCS: https://spacy.io/api/pipe#create_optimizer
|
||||||
"""
|
"""
|
||||||
return create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
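The base Pipe class gains concrete attribute storage in __init__ and a default update() that delegates to the model, get_loss() and set_annotations(). In practice a trainable component that subclasses Pipe now mostly needs predict, set_annotations and get_loss; a rough, purely hypothetical sketch (the class and the user_data key are invented for illustration):

    from spacy.pipeline import Pipe

    class ToyTrainablePipe(Pipe):
        """Hypothetical component relying on the inherited default update()."""

        def predict(self, docs):
            return self.model.predict(docs)

        def set_annotations(self, docs, scores):
            for doc, doc_scores in zip(docs, scores):
                doc.user_data[f"{self.name}_scores"] = doc_scores

        def get_loss(self, examples, scores):
            # A real component would compute gradients against the gold data in
            # each Example; here we return zero gradients of the right shape.
            d_scores = [self.model.ops.alloc2f(*s.shape) for s in scores]
            return 0.0, d_scores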
|
|
|
@ -7,6 +7,7 @@ from ..tokens.doc cimport Doc
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -58,7 +59,7 @@ class Sentencizer(Pipe):
|
||||||
else:
|
else:
|
||||||
self.punct_chars = set(self.default_punct_chars)
|
self.punct_chars = set(self.default_punct_chars)
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
@ -158,6 +159,7 @@ class Sentencizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#score
|
DOCS: https://spacy.io/api/sentencizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Sentencizer.score")
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
del results["sents_per_type"]
|
del results["sents_per_type"]
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -9,6 +9,7 @@ from .tagger import Tagger
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -102,6 +103,7 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
|
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "SentenceRecognizer.get_loss")
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||||
truths = []
|
truths = []
|
||||||
|
@ -121,7 +123,7 @@ class SentenceRecognizer(Tagger):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
@ -151,6 +153,7 @@ class SentenceRecognizer(Tagger):
|
||||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer#score
|
DOCS: https://spacy.io/api/sentencerecognizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "SentenceRecognizer.score")
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
del results["sents_per_type"]
|
del results["sents_per_type"]
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import List, Iterable, Optional, Dict, Tuple, Callable
|
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
|
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
|
||||||
from thinc.api import Optimizer, Config
|
from thinc.api import Optimizer, Config
|
||||||
|
@ -6,6 +6,7 @@ from thinc.util import to_numpy
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
|
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
|
||||||
|
from ..gold import validate_examples
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
@ -127,6 +128,7 @@ class SimpleNER(Pipe):
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault("ner", 0.0)
|
losses.setdefault("ner", 0.0)
|
||||||
|
validate_examples(examples, "SimpleNER.update")
|
||||||
if not any(_has_ner(eg) for eg in examples):
|
if not any(_has_ner(eg) for eg in examples):
|
||||||
return losses
|
return losses
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
|
@ -142,6 +144,7 @@ class SimpleNER(Pipe):
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
|
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
|
||||||
|
validate_examples(examples, "SimpleNER.get_loss")
|
||||||
truths = []
|
truths = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
tags = eg.get_aligned_ner()
|
tags = eg.get_aligned_ner()
|
||||||
|
@ -161,14 +164,17 @@ class SimpleNER(Pipe):
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable,
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
):
|
):
|
||||||
|
all_labels = set()
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
gold_tuples = get_examples
|
err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
|
||||||
get_examples = lambda: gold_tuples
|
raise ValueError(err)
|
||||||
for label in _get_labels(get_examples()):
|
for example in get_examples():
|
||||||
|
all_labels.update(_get_labels(example))
|
||||||
|
for label in sorted(all_labels):
|
||||||
self.add_label(label)
|
self.add_label(label)
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
n_actions = self.model.attrs["get_num_actions"](len(labels))
|
n_actions = self.model.attrs["get_num_actions"](len(labels))
|
||||||
|
@ -185,6 +191,7 @@ class SimpleNER(Pipe):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
validate_examples(examples, "SimpleNER.score")
|
||||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@ -196,10 +203,9 @@ def _has_ner(example: Example) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _get_labels(examples: List[Example]) -> List[str]:
|
def _get_labels(example: Example) -> Set[str]:
|
||||||
labels = set()
|
labels = set()
|
||||||
for eg in examples:
|
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
|
||||||
for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
|
if ner_tag != "O" and ner_tag != "-":
|
||||||
if ner_tag != "O" and ner_tag != "-":
|
labels.add(ner_tag)
|
||||||
labels.add(ner_tag)
|
return labels
|
||||||
return list(sorted(labels))
|
|
||||||
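_get_labels now operates on a single Example and returns a set, and begin_training accumulates labels while streaming over get_examples() instead of materialising a list. A standalone sketch of that accumulation (function name and stand-in tag values are assumptions; the real input comes from example.get_aligned("ENT_TYPE", as_string=True)):

    from typing import Iterable, Set

    def labels_from_ent_types(ent_types: Iterable[str]) -> Set[str]:
        # Mirrors the new _get_labels: skip "O" (outside) and "-" (missing)
        return {t for t in ent_types if t not in ("O", "-")}

    all_labels = set()
    stream = [["O", "ORG", "ORG", "O"], ["PERSON", "O", "-"]]   # stand-in data
    for ent_types in stream:
        all_labels.update(labels_from_ent_types(ent_types))
    print(sorted(all_labels))   # ['ORG', 'PERSON']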
|
|
|
@ -16,6 +16,7 @@ from ..attrs import POS, ID
|
||||||
from ..parts_of_speech import X
|
from ..parts_of_speech import X
|
||||||
from ..errors import Errors, TempErrors, Warnings
|
from ..errors import Errors, TempErrors, Warnings
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
from ..gold import validate_examples
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -187,19 +188,15 @@ class Tagger(Pipe):
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
try:
|
validate_examples(examples, "Tagger.update")
|
||||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return
|
return
|
||||||
except AttributeError:
|
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None
|
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(
|
tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
|
||||||
[eg.predicted for eg in examples])
|
|
||||||
for sc in tag_scores:
|
for sc in tag_scores:
|
||||||
if self.model.ops.xp.isnan(sc.sum()):
|
if self.model.ops.xp.isnan(sc.sum()):
|
||||||
raise ValueError("nan value in scores")
|
raise ValueError(Errors.E940)
|
||||||
loss, d_tag_scores = self.get_loss(examples, tag_scores)
|
loss, d_tag_scores = self.get_loss(examples, tag_scores)
|
||||||
bp_tag_scores(d_tag_scores)
|
bp_tag_scores(d_tag_scores)
|
||||||
if sgd not in (None, False):
|
if sgd not in (None, False):
|
||||||
|
@ -226,11 +223,8 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#rehearse
|
DOCS: https://spacy.io/api/tagger#rehearse
|
||||||
"""
|
"""
|
||||||
try:
|
validate_examples(examples, "Tagger.rehearse")
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
except AttributeError:
|
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None
|
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return
|
return
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
|
@ -256,6 +250,7 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#get_loss
|
DOCS: https://spacy.io/api/tagger#get_loss
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Tagger.get_loss")
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
|
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
|
@ -263,7 +258,7 @@ class Tagger(Pipe):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
@ -277,13 +272,12 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#begin_training
|
DOCS: https://spacy.io/api/tagger#begin_training
|
||||||
"""
|
"""
|
||||||
|
if not hasattr(get_examples, "__call__"):
|
||||||
|
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
|
||||||
|
raise ValueError(err)
|
||||||
tags = set()
|
tags = set()
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
try:
|
for token in example.y:
|
||||||
y = example.y
|
|
||||||
except AttributeError:
|
|
||||||
raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
|
|
||||||
for token in y:
|
|
||||||
tags.add(token.tag_)
|
tags.add(token.tag_)
|
||||||
for tag in sorted(tags):
|
for tag in sorted(tags):
|
||||||
self.add_label(tag)
|
self.add_label(tag)
|
||||||
|
@ -318,6 +312,7 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tagger#score
|
DOCS: https://spacy.io/api/tagger#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "Tagger.score")
|
||||||
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
||||||
|
|
||||||
def to_bytes(self, *, exclude=tuple()):
|
def to_bytes(self, *, exclude=tuple()):
|
||||||
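Tagger.update, rehearse, get_loss and score now validate their examples up front, and begin_training requires a callable, in line with the Language-level changes. An end-to-end usage sketch of training a tagger under the new API (the tag strings are illustrative):

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("tagger")
    TRAIN_DATA = [("I like green trees", {"tags": ["PRON", "VERB", "ADJ", "NOUN"]})]

    def get_examples():
        return [Example.from_dict(nlp.make_doc(t), ann) for t, ann in TRAIN_DATA]

    optimizer = nlp.begin_training(get_examples)
    for _ in range(5):
        losses = {}
        nlp.update(get_examples(), sgd=optimizer, losses=losses)
        # losses["tagger"] accumulates the tagger's loss for each call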
|
|
|
@ -5,7 +5,7 @@ import numpy
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..gold import Example
|
from ..gold import Example, validate_examples
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -209,15 +209,10 @@ class TextCategorizer(Pipe):
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
try:
|
validate_examples(examples, "TextCategorizer.update")
|
||||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return losses
|
return losses
|
||||||
except AttributeError:
|
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
raise TypeError(
|
|
||||||
Errors.E978.format(name="TextCategorizer", method="update", types=types)
|
|
||||||
) from None
|
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
|
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
|
||||||
loss, d_scores = self.get_loss(examples, scores)
|
loss, d_scores = self.get_loss(examples, scores)
|
||||||
|
@ -252,19 +247,12 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#rehearse
|
DOCS: https://spacy.io/api/textcategorizer#rehearse
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return losses
|
return losses
|
||||||
try:
|
validate_examples(examples, "TextCategorizer.rehearse")
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
except AttributeError:
|
|
||||||
types = set([type(eg) for eg in examples])
|
|
||||||
err = Errors.E978.format(
|
|
||||||
name="TextCategorizer", method="rehearse", types=types
|
|
||||||
)
|
|
||||||
raise TypeError(err) from None
|
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return losses
|
return losses
|
||||||
|
@ -303,6 +291,7 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#get_loss
|
DOCS: https://spacy.io/api/textcategorizer#get_loss
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "TextCategorizer.get_loss")
|
||||||
truths, not_missing = self._examples_to_truth(examples)
|
truths, not_missing = self._examples_to_truth(examples)
|
||||||
not_missing = self.model.ops.asarray(not_missing)
|
not_missing = self.model.ops.asarray(not_missing)
|
||||||
d_scores = (scores - truths) / scores.shape[0]
|
d_scores = (scores - truths) / scores.shape[0]
|
||||||
|
@ -338,7 +327,7 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]] = lambda: [],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
|
@ -356,21 +345,20 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#begin_training
|
DOCS: https://spacy.io/api/textcategorizer#begin_training
|
||||||
"""
|
"""
|
||||||
# TODO: begin_training is not guaranteed to see all data / labels ?
|
if not hasattr(get_examples, "__call__"):
|
||||||
examples = list(get_examples())
|
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
|
||||||
for example in examples:
|
raise ValueError(err)
|
||||||
try:
|
subbatch = [] # Select a subbatch of examples to initialize the model
|
||||||
y = example.y
|
for example in get_examples():
|
||||||
except AttributeError:
|
if len(subbatch) < 2:
|
||||||
err = Errors.E978.format(
|
subbatch.append(example)
|
||||||
name="TextCategorizer", method="update", types=type(example)
|
for cat in example.y.cats:
|
||||||
)
|
|
||||||
raise TypeError(err) from None
|
|
||||||
for cat in y.cats:
|
|
||||||
self.add_label(cat)
|
self.add_label(cat)
|
||||||
self.require_labels()
|
self.require_labels()
|
||||||
docs = [Doc(self.vocab, words=["hello"])]
|
docs = [eg.reference for eg in subbatch]
|
||||||
truths, _ = self._examples_to_truth(examples)
|
if not docs: # need at least one doc
|
||||||
|
docs = [Doc(self.vocab, words=["hello"])]
|
||||||
|
truths, _ = self._examples_to_truth(subbatch)
|
||||||
self.set_output(len(self.labels))
|
self.set_output(len(self.labels))
|
||||||
self.model.initialize(X=docs, Y=truths)
|
self.model.initialize(X=docs, Y=truths)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
@ -392,6 +380,7 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer#score
|
DOCS: https://spacy.io/api/textcategorizer#score
|
||||||
"""
|
"""
|
||||||
|
validate_examples(examples, "TextCategorizer.score")
|
||||||
return Scorer.score_cats(
|
return Scorer.score_cats(
|
||||||
examples,
|
examples,
|
||||||
"cats",
|
"cats",
|
||||||
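TextCategorizer.begin_training no longer materialises every example: it streams over get_examples(), registers every label it sees, and keeps only a tiny subbatch to infer the model's output shapes. The pattern in isolation, with made-up function and variable names:

    def collect_labels_and_subbatch(get_examples, n_init=2):
        labels = set()
        subbatch = []                      # only a couple of examples kept for init
        for example in get_examples():
            if len(subbatch) < n_init:
                subbatch.append(example)
            labels.update(example.y.cats)  # every example still contributes labels
        return labels, subbatch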
|
|
|
@ -2,7 +2,7 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List,
|
||||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from ..gold import Example
|
from ..gold import Example, validate_examples
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
|
@ -166,9 +166,8 @@ class Tok2Vec(Pipe):
|
||||||
"""
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
validate_examples(examples, "Tok2Vec.update")
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
if isinstance(docs, Doc):
|
|
||||||
docs = [docs]
|
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
tokvecs, bp_tokvecs = self.model.begin_update(docs)
|
tokvecs, bp_tokvecs = self.model.begin_update(docs)
|
||||||
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
|
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
|
||||||
|
@ -194,7 +193,8 @@ class Tok2Vec(Pipe):
|
||||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||||
for listener in self.listeners[:-1]:
|
for listener in self.listeners[:-1]:
|
||||||
listener.receive(batch_id, tokvecs, accumulate_gradient)
|
listener.receive(batch_id, tokvecs, accumulate_gradient)
|
||||||
self.listeners[-1].receive(batch_id, tokvecs, backprop)
|
if self.listeners:
|
||||||
|
self.listeners[-1].receive(batch_id, tokvecs, backprop)
|
||||||
if set_annotations:
|
if set_annotations:
|
||||||
self.set_annotations(docs, tokvecs)
|
self.set_annotations(docs, tokvecs)
|
||||||
return losses
|
return losses
|
||||||
|
@ -204,7 +204,7 @@ class Tok2Vec(Pipe):
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]] = lambda: [],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
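Tok2Vec.update now guards the listener dispatch, so a tok2vec with no registered listeners no longer hits an IndexError on listeners[-1]: every listener except the last receives a gradient-accumulating callback, and the last one receives the real backprop. The dispatch logic in isolation:

    def send_to_listeners(listeners, batch_id, tokvecs, accumulate_gradient, backprop):
        # every listener except the last only accumulates gradients ...
        for listener in listeners[:-1]:
            listener.receive(batch_id, tokvecs, accumulate_gradient)
        # ... and the final one triggers the actual backward pass, if any exist
        if listeners:
            listeners[-1].receive(batch_id, tokvecs, backprop)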
|
|
|
@ -8,11 +8,8 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser(Pipe):
|
cdef class Parser(Pipe):
|
||||||
cdef readonly Vocab vocab
|
|
||||||
cdef public object model
|
|
||||||
cdef public object _rehearsal_model
|
cdef public object _rehearsal_model
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef readonly object cfg
|
|
||||||
cdef public object _multitasks
|
cdef public object _multitasks
|
||||||
|
|
||||||
cdef void _parseC(self, StateC** states,
|
cdef void _parseC(self, StateC** states,
|
||||||
|
|
|
@ -8,22 +8,21 @@ from libc.string cimport memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
from thinc.api import set_dropout_rate
|
||||||
|
import numpy.random
|
||||||
|
import numpy
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ._parser_internals.stateclass cimport StateClass
|
from ._parser_internals.stateclass cimport StateClass
|
||||||
from ..ml.parser_model cimport alloc_activations, free_activations
|
from ..ml.parser_model cimport alloc_activations, free_activations
|
||||||
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
||||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
from ..gold import validate_examples
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import create_default_optimizer
|
|
||||||
|
|
||||||
from thinc.api import set_dropout_rate
|
|
||||||
import numpy.random
|
|
||||||
import numpy
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser(Pipe):
|
cdef class Parser(Pipe):
|
||||||
|
@ -266,6 +265,7 @@ cdef class Parser(Pipe):
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
|
validate_examples(examples, "Parser.update")
|
||||||
for multitask in self._multitasks:
|
for multitask in self._multitasks:
|
||||||
multitask.update(examples, drop=drop, sgd=sgd)
|
multitask.update(examples, drop=drop, sgd=sgd)
|
||||||
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
||||||
|
@ -329,7 +329,7 @@ cdef class Parser(Pipe):
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return None
|
return None
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
|
validate_examples(examples, "Parser.rehearse")
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||||
|
@ -398,21 +398,18 @@ cdef class Parser(Pipe):
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
return d_scores
|
return d_scores
|
||||||
|
|
||||||
def create_optimizer(self):
|
|
||||||
return create_default_optimizer()
|
|
||||||
|
|
||||||
def set_output(self, nO):
|
def set_output(self, nO):
|
||||||
self.model.attrs["resize_output"](self.model, nO)
|
self.model.attrs["resize_output"](self.model, nO)
|
||||||
|
|
||||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
||||||
|
if not hasattr(get_examples, "__call__"):
|
||||||
|
err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
|
||||||
|
raise ValueError(err)
|
||||||
self.cfg.update(kwargs)
|
self.cfg.update(kwargs)
|
||||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||||
warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
|
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
||||||
if not hasattr(get_examples, '__call__'):
|
|
||||||
gold_tuples = get_examples
|
|
||||||
get_examples = lambda: gold_tuples
|
|
||||||
actions = self.moves.get_actions(
|
actions = self.moves.get_actions(
|
||||||
examples=get_examples(),
|
examples=get_examples(),
|
||||||
min_freq=self.cfg['min_action_freq'],
|
min_freq=self.cfg['min_action_freq'],
|
||||||
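The parser's lexeme-norm message (W033) is downgraded from warnings.warn to util.logger.debug, so it only surfaces when spaCy's logger is configured. A sketch of turning it back on, assuming util.logger is the module-level logger the new code calls:

    import logging
    from spacy import util

    logging.basicConfig(level=logging.DEBUG)
    util.logger.setLevel(logging.DEBUG)
    # Subsequent parser/NER begin_training calls will now emit W033 via
    # util.logger.debug(...) instead of a UserWarning.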
|
|
|
@@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))

@@ -41,7 +41,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
     doc.ents = list(doc.ents)
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):

@@ -75,7 +75,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training([])
+    ner1.begin_training(lambda: [])
     ner2 = EntityRecognizer(Vocab(), model, **config)

     # the second model needs to be resized before we can call from_bytes
@@ -1,17 +1,17 @@
 import pytest
-
 from spacy import util
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.gold import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
+import logging

 from ..util import make_tempdir


 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
     ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),

@@ -56,6 +56,7 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
     assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
     entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
     example = Example.from_dict(doc, {"entities": entity_annots})

@@ -332,19 +333,21 @@ def test_overfitting_IO():
     assert ents2[0].label_ == "LOC"


-def test_ner_warns_no_lookups():
+def test_ner_warns_no_lookups(caplog):
     nlp = English()
     assert nlp.lang in util.LEXEME_NORM_LANGS
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
-    with pytest.warns(UserWarning):
+    with caplog.at_level(logging.DEBUG):
         nlp.begin_training()
+        assert "W033" in caplog.text
+    caplog.clear()
     nlp.vocab.lookups.add_table("lexeme_norm")
     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
-    with pytest.warns(None) as record:
+    with caplog.at_level(logging.DEBUG):
         nlp.begin_training()
-    assert not record.list
+        assert "W033" not in caplog.text


 @Language.factory("blocker")
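Because the missing lexeme-norm message is now routed through spaCy's logger rather than `warnings.warn`, the test above listens on `caplog` instead of `pytest.warns`. A user who still wants to see W033 can switch on debug logging for the `"spacy"` logger (the logger name comes from the `spacy/util.py` change later in this commit); a minimal sketch:

```python
import logging

# W033 (training a parser or NER without lexeme normalization tables) is now
# emitted at DEBUG level on the "spacy" logger instead of as a UserWarning.
logging.getLogger("spacy").setLevel(logging.DEBUG)
```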
@@ -28,7 +28,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):
@@ -136,7 +136,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])


 def test_kb_empty(nlp):

@@ -145,7 +145,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])


 def test_candidate_generation(nlp):

@@ -249,7 +249,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
     el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    el_pipe.begin_training()
+    el_pipe.begin_training(lambda: [])
     el_pipe.incl_context = False
     el_pipe.incl_prior = True
@@ -54,7 +54,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training()
+    optimizer = textcat.begin_training(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]

@@ -104,7 +104,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     cats = doc.cats
     # note that by default, exclusive_classes = false so we need a bigger error margin
-    assert cats["POSITIVE"] > 0.9
+    assert cats["POSITIVE"] > 0.8
     assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)

     # Also test the results are still the same after IO

@@ -113,7 +113,7 @@ def test_overfitting_IO():
     nlp2 = util.load_model_from_path(tmp_dir)
     doc2 = nlp2(test_text)
     cats2 = doc2.cats
-    assert cats2["POSITIVE"] > 0.9
+    assert cats2["POSITIVE"] > 0.8
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)

     # Test scoring
@@ -25,7 +25,6 @@ def test_issue2070():
     assert len(doc) == 11


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()

@@ -135,7 +134,6 @@ def test_issue2464(en_vocab):
     assert len(matches) == 3


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2482():
     """Test we can serialize and deserialize a blank NER or parser model."""
     nlp = Italian()

@@ -20,7 +20,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    tagger.begin_training()
+    tagger.begin_training(lambda: [])
     doc = nlp("hello world")
     assert doc.is_tagged
     docs = nlp.pipe(["hello", "world"])

@@ -136,7 +136,6 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2800():
     """Test issue that arises when too many labels are added to NER model.
     Used to cause segfault.

@@ -90,7 +90,6 @@ def test_issue3199():
     assert list(doc[0:3].noun_chunks) == []


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3209():
     """Test issue that occurred in spaCy nightly where NER labels were being
     mapped to classes incorrectly after loading the model, when the labels

@@ -91,7 +91,6 @@ def test_issue_3526_3(en_vocab):
     assert new_ruler.overwrite is not ruler.overwrite


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue_3526_4(en_vocab):
     nlp = Language(vocab=en_vocab)
     patterns = [{"label": "ORG", "pattern": "Apple"}]

@@ -252,7 +251,6 @@ def test_issue3803():
     assert [t.like_num for t in doc] == [True, True, True, True, True, True]


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
     config = {

@@ -270,7 +268,6 @@ def test_issue3830_no_subtok():
     assert "subtok" not in parser.labels


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
     config = {

@@ -333,7 +330,6 @@ def test_issue3879(en_vocab):
     assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3880():
     """Test that `nlp.pipe()` works when an empty string ends the batch.

@@ -81,7 +81,6 @@ def test_issue4030():
     assert doc.cats["inoffensive"] == 0.0


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4042():
     """Test that serialization of an EntityRuler before NER works fine."""
     nlp = English()

@@ -110,7 +109,6 @@ def test_issue4042():
     assert doc2.ents[0].label_ == "MY_ORG"


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4042_bug2():
     """
     Test that serialization of an NER works fine when new labels were added.

@@ -242,7 +240,6 @@ def test_issue4190():
     assert result_1b == result_2


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4267():
     """ Test that running an entity_ruler after ner gives consistent results"""
     nlp = English()

@@ -303,7 +300,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1

@@ -324,7 +321,6 @@ def test_issue4313():
     entity_scores[(start, end, label)] += score


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4348():
     """Test that training the tagger with empty data, doesn't throw errors"""
     nlp = English()

@@ -179,7 +179,6 @@ def test_issue4707():
     assert "entity_ruler" in new_nlp.pipe_names


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4725_1():
     """ Ensure the pickling of the NER goes well"""
     vocab = Vocab(vectors_name="test_vocab_add_vector")

@@ -198,7 +197,6 @@ def test_issue4725_1():
     assert ner2.cfg["update_with_oracle_cut_size"] == 111


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue4725_2():
     # ensures that this runs correctly and doesn't hang or crash because of the global vectors
     # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
@@ -1,8 +1,7 @@
-import pytest
 from spacy.lang.en import English
+import pytest


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue5152():
     # Test that the comparison between a Span and a Token, goes well
     # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)

@@ -14,6 +13,8 @@ def test_issue5152():
     span_2 = text[0:3]  # Talk about being
     span_3 = text_var[0:3]  # Talk of being
     token = y[0]  # Let
-    assert span.similarity(token) == 0.0
+    with pytest.warns(UserWarning):
+        assert span.similarity(token) == 0.0
     assert span.similarity(span_2) == 1.0
-    assert span_2.similarity(span_3) < 1.0
+    with pytest.warns(UserWarning):
+        assert span_2.similarity(span_3) < 1.0
@@ -62,7 +62,7 @@ def tagger():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    tagger.begin_training(pipeline=nlp.pipeline)
+    tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
     return tagger


@@ -81,7 +81,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    entity_linker.begin_training(pipeline=nlp.pipeline)
+    entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
     return entity_linker
@@ -4,7 +4,7 @@ import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.language import Language
-from spacy.util import registry, deep_merge_configs, load_model_from_config
+from spacy.util import registry, load_model_from_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder

@@ -194,37 +194,6 @@ def test_serialize_parser():
     assert upper.get_dim("nI") == 66


-def test_deep_merge_configs():
-    config = {"a": "hello", "b": {"c": "d"}}
-    defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
-    merged = deep_merge_configs(config, defaults)
-    assert len(merged) == 2
-    assert merged["a"] == "hello"
-    assert merged["b"] == {"c": "d", "f": "g"}
-    config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
-    defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
-    merged = deep_merge_configs(config, defaults)
-    assert len(merged) == 3
-    assert merged["a"] == "hello"
-    assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
-    assert merged["c"] == 100
-    config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
-    defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
-    merged = deep_merge_configs(config, defaults)
-    assert len(merged) == 3
-    assert merged["a"] == "hello"
-    assert merged["b"] == {"@test": "x", "foo": 1}
-    assert merged["c"] == 100
-    # Test that leaving out the factory just adds to existing
-    config = {"a": "hello", "b": {"foo": 1}, "c": 100}
-    defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
-    merged = deep_merge_configs(config, defaults)
-    assert len(merged) == 3
-    assert merged["a"] == "hello"
-    assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
-    assert merged["c"] == 100
-
-
 def test_config_nlp_roundtrip():
     """Test that a config prduced by the nlp object passes training config
     validation."""

@@ -311,3 +280,22 @@ def test_config_overrides():
     nlp = spacy.load(d)
     assert isinstance(nlp, English)
     assert nlp.pipe_names == ["tok2vec", "tagger"]
+
+
+def test_config_interpolation():
+    config = Config().from_str(nlp_config_string, interpolate=False)
+    assert config["training"]["train_corpus"]["path"] == "${paths:train}"
+    interpolated = config.interpolate()
+    assert interpolated["training"]["train_corpus"]["path"] == ""
+    nlp = English.from_config(config)
+    assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
+    # Ensure that variables are preserved in nlp config
+    width = "${components.tok2vec.model:width}"
+    assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
+    assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
+    interpolated2 = nlp.config.interpolate()
+    assert interpolated2["training"]["train_corpus"]["path"] == ""
+    assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
+    nlp2 = English.from_config(interpolated)
+    assert nlp2.config["training"]["train_corpus"]["path"] == ""
+    assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
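The new `test_config_interpolation` above exercises the variable syntax used throughout the training config: values such as `${paths:train}` stay unresolved until `interpolate()` is called. A small illustrative sketch using thinc's `Config`, which spaCy's config system builds on (the section and key names here are placeholders, not taken from this commit):

```python
from thinc.api import Config

cfg_str = """
[paths]
train = "corpus/train.spacy"

[training]
train_path = ${paths:train}
"""
config = Config().from_str(cfg_str, interpolate=False)
assert config["training"]["train_path"] == "${paths:train}"        # variable preserved
filled = config.interpolate()
assert filled["training"]["train_path"] == "corpus/train.spacy"    # variable resolved
```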
@@ -1,11 +1,14 @@
 import pytest

 from spacy.gold import docs_to_json, biluo_tags_from_offsets
 from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, validate
 from spacy.cli.pretrain import make_docs
+from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
+from spacy.cli.init_config import RecommendationSchema
 from spacy.cli._util import validate_project_commands, parse_config_overrides
+from spacy.util import get_lang_class
+import srsly


 def test_cli_converters_conllu2json():

@@ -319,3 +322,20 @@ def test_parse_config_overrides(args, expected):
 def test_parse_config_overrides_invalid(args):
     with pytest.raises(SystemExit):
         parse_config_overrides(args)
+
+
+@pytest.mark.parametrize("lang", ["en", "nl"])
+@pytest.mark.parametrize(
+    "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
+)
+@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
+def test_init_config(lang, pipeline, optimize):
+    # TODO: add more tests and also check for GPU with transformers
+    init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)
+
+
+def test_model_recommendations():
+    recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
+    for lang, data in recommendations.items():
+        assert get_lang_class(lang)
+        assert RecommendationSchema(**data)
@@ -154,6 +154,7 @@ def test_example_from_dict_some_ner(en_vocab):
     assert ner_tags == ["U-LOC", None, None, None]


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_json2docs_no_ner(en_vocab):
     data = [
         {

@@ -506,6 +507,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:

@@ -586,7 +588,7 @@ def test_tuple_format_implicit():
     ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
     (
         "Spotify steps up Asia expansion",
-        {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+        {"entities": [(0, 7, "ORG"), (17, 21, "LOC")]},
     ),
     ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
 ]

@@ -601,7 +603,7 @@ def test_tuple_format_implicit_invalid():
     ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
     (
         "Spotify steps up Asia expansion",
-        {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+        {"entities": [(0, 7, "ORG"), (17, 21, "LOC")]},
     ),
     ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
 ]
@@ -46,6 +46,7 @@ def test_Example_from_dict_with_tags(pred_words, annots):
     assert aligned_tags == ["NN" for _ in predicted]


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_aligned_tags():
     pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]
     gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]

@@ -198,8 +199,8 @@ def test_Example_from_dict_with_entities(annots):
 def test_Example_from_dict_with_entities_invalid(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    example = Example.from_dict(predicted, annots)
-    # TODO: shouldn't this throw some sort of warning ?
+    with pytest.warns(UserWarning):
+        example = Example.from_dict(predicted, annots)
     assert len(list(example.reference.ents)) == 0
@@ -24,6 +24,7 @@ from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
+from .gold import validate_examples


 cdef class Tokenizer:

@@ -712,6 +713,7 @@ cdef class Tokenizer:
         return tokens

     def score(self, examples, **kwargs):
+        validate_examples(examples, "Tokenizer.score")
         return Scorer.score_tokenization(examples)

     def to_disk(self, path, **kwargs):
spacy/util.py

@@ -24,6 +24,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
+import logging

 try:
     import cupy.random

@@ -54,10 +55,19 @@ if TYPE_CHECKING:
     from .vocab import Vocab  # noqa: F401


-_PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

+# Default order of sections in the config.cfg. Not all sections needs to exist,
+# and additional sections are added at the end, in alphabetical order.
+# fmt: off
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+# fmt: on
+
+
+logging.basicConfig()
+logger = logging.getLogger("spacy")
+

 class registry(thinc.registry):
     languages = catalogue.create("spacy", "languages", entry_points=True)

@@ -109,11 +119,6 @@ class SimpleFrozenDict(dict):
         raise NotImplementedError(self.error)


-def set_env_log(value: bool) -> None:
-    global _PRINT_ENV
-    _PRINT_ENV = value
-
-
 def lang_class_is_loaded(lang: str) -> bool:
     """Check whether a Language class is already loaded. Language classes are
     loaded lazily, to avoid expensive setup code associated with the language

@@ -264,9 +269,7 @@ def load_model_from_path(
     if not meta:
         meta = get_model_meta(model_path)
     config_path = model_path / "config.cfg"
-    if not config_path.exists() or not config_path.is_file():
-        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
-    config = Config().from_disk(config_path, overrides=dict_to_dot(config))
+    config = load_config(config_path, overrides=dict_to_dot(config))
     nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
     return nlp.from_disk(model_path, exclude=disable)

@@ -317,6 +320,29 @@ def load_model_from_init_py(
     )


+def load_config(
+    path: Union[str, Path],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    interpolate: bool = False,
+) -> Config:
+    """Load a config file. Takes care of path validation and section order."""
+    config_path = ensure_path(path)
+    if not config_path.exists() or not config_path.is_file():
+        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+    return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
+        config_path, overrides=overrides, interpolate=interpolate
+    )
+
+
+def load_config_from_str(
+    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
+):
+    """Load a full config from a string."""
+    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
+        text, overrides=overrides, interpolate=interpolate,
+    )
+
+
 def get_installed_models() -> List[str]:
     """List all model packages currently installed in the environment.
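A hypothetical call to the helpers added above, to show how the pieces fit together; the file path and the dotted override key are placeholders, not values from this commit:

```python
from spacy.util import load_config

# Loads a config.cfg with path validation and the canonical section order,
# applying a dotted-key override before (optional) interpolation.
config = load_config(
    "config.cfg",
    overrides={"paths.train": "./train.spacy"},
    interpolate=True,
)
```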
@@ -602,27 +628,6 @@ def get_async(stream, numpy_array):
     return array


-def env_opt(name: str, default: Optional[Any] = None) -> Optional[Any]:
-    if type(default) is float:
-        type_convert = float
-    else:
-        type_convert = int
-    if "SPACY_" + name.upper() in os.environ:
-        value = type_convert(os.environ["SPACY_" + name.upper()])
-        if _PRINT_ENV:
-            print(name, "=", repr(value), "via", "$SPACY_" + name.upper())
-        return value
-    elif name in os.environ:
-        value = type_convert(os.environ[name])
-        if _PRINT_ENV:
-            print(name, "=", repr(value), "via", "$" + name)
-        return value
-    else:
-        if _PRINT_ENV:
-            print(name, "=", repr(default), "by default")
-        return default
-
-
 def read_regex(path: Union[str, Path]) -> Pattern:
     path = ensure_path(path)
     with path.open(encoding="utf8") as file_:

@@ -923,45 +928,6 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
         raise ValueError(Errors.E961.format(config=config)) from None


-def deep_merge_configs(
-    config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
-) -> Config:
-    """Deep merge two configs, a base config and its defaults. Ignores
-    references to registered functions to avoid filling in
-
-    config (Dict[str, Any]): The config.
-    destination (Dict[str, Any]): The config defaults.
-    RETURNS (Dict[str, Any]): The merged config.
-    """
-    config = copy_config(config)
-    merged = _deep_merge_configs(config, defaults)
-    return Config(merged)
-
-
-def _deep_merge_configs(
-    config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
-) -> Union[Dict[str, Any], Config]:
-    for key, value in defaults.items():
-        if isinstance(value, dict):
-            node = config.setdefault(key, {})
-            if not isinstance(node, dict):
-                continue
-            promises = [key for key in value if key.startswith("@")]
-            promise = promises[0] if promises else None
-            # We only update the block from defaults if it refers to the same
-            # registered function
-            if (
-                promise
-                and any(k.startswith("@") for k in node)
-                and (promise in node and node[promise] != value[promise])
-            ):
-                continue
-            defaults = _deep_merge_configs(node, value)
-        elif key not in config:
-            config[key] = value
-    return config
-
-
 def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
     """Convert dot notation to a dict. For example: {"token.pos": True,
     "token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.

@@ -1067,24 +1033,7 @@ class DummyTokenizer:


 def create_default_optimizer() -> Optimizer:
-    # TODO: Do we still want to allow env_opt?
-    learn_rate = env_opt("learn_rate", 0.001)
-    beta1 = env_opt("optimizer_B1", 0.9)
-    beta2 = env_opt("optimizer_B2", 0.999)
-    eps = env_opt("optimizer_eps", 1e-8)
-    L2 = env_opt("L2_penalty", 1e-6)
-    grad_clip = env_opt("grad_norm_clip", 10.0)
-    L2_is_weight_decay = env_opt("L2_is_weight_decay", False)
-    optimizer = Adam(
-        learn_rate,
-        L2=L2,
-        beta1=beta1,
-        beta2=beta2,
-        eps=eps,
-        grad_clip=grad_clip,
-        L2_is_weight_decay=L2_is_weight_decay,
-    )
-    return optimizer
+    return Adam()


 def minibatch(items, size):
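With `env_opt` and the `SPACY_*` environment-variable plumbing removed, `create_default_optimizer` simply returns a stock `Adam()`; anything beyond the defaults is expected to be set explicitly, for example through the training config or by constructing the optimizer yourself. A hedged sketch of the explicit route (the hyperparameter values are placeholders, and the keyword names come from the removed code above):

```python
from thinc.api import Adam

# Instead of SPACY_LEARN_RATE-style environment variables, pass the
# hyperparameters directly when you need something other than the defaults.
optimizer = Adam(learn_rate=0.001, L2=1e-6, grad_clip=10.0)
```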
@@ -274,7 +274,7 @@ architectures into your training config.
 | `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
 | `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |

-### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
+### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}

 > #### Example Config
 >
@@ -16,9 +16,11 @@ menu:
   - ['Project', 'project']
 ---

-For a list of available commands, type `spacy --help`.
-
-<!-- TODO: add notes on autocompletion etc. -->
+spaCy's CLI provides a range of helpful commands for downloading and training
+models, converting data and debugging your config, data and installation. For a
+list of available commands, you can type `python -m spacy --help`. You can also
+add the `--help` flag to any command or subcommand to see the description,
+available arguments and usage.

 ## Download {#download}

@@ -41,13 +43,13 @@ the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
 $ python -m spacy download [model] [--direct] [pip args]
 ```

 | Argument | Type | Description |
-| ------------------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | positional | Model name, e.g. `en_core_web_sm`.. |
+| `model` | positional | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). |
 | `--direct`, `-d` | flag | Force direct download of exact model version. |
-| pip args <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
-| `--help`, `-h` | flag | Show help message and available arguments. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| pip args <Tag variant="new">2.1</Tag> | option / flag | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
 | **CREATES** | directory | The installed model package in your `site-packages` directory. |

 ## Info {#info}

@@ -101,39 +103,62 @@ files and model directories.

 ### init config {#init-config new="3"}

-Initialize and export a [`config.cfg` file](/usage/training#config) for training
-and update it with all default values, if possible. Config files used for
-training should always be complete and not contain any hidden defaults or
-missing values, so this command helps you create your final config. It takes
-**one** of the following options:
-
-- `--base`: Base **config** to auto-fill, e.g. created using the
-  [training quickstart](/usage/training#quickstart) widget.
-- `--lang`: Base **language** code to use for blank config.
-- `--model`: Base **model** to copy config from.
+Initialize and save a [`config.cfg` file](/usage/training#config) using the
+**recommended settings** for your use case. It works just like the
+[quickstart widget](/usage/training#quickstart), only that it also auto-fills
+all default values and exports a [training](/usage/training#config)-ready
+config. The settings you specify will impact the suggested model architectures
+and pipeline setup, as well as the hyperparameters. You can also adjust and
+customize those settings in your config file later.

 > ```bash
-> ### with base config {wrap="true"}
-> $ python -m spacy init config config.cfg --base base.cfg
-> ```
->
-> ```bash
-> ### blank language {wrap="true"}
-> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
+> ### Example {wrap="true"}
+> $ python -m spacy init config config.cfg --lang en --pipeline ner,textcat --optimize accuracy
 > ```

 ```bash
-$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
+$ python -m spacy init config [output_file] [--lang] [--pipeline]
+[--optimize] [--cpu]
 ```

 | Argument | Type | Description |
-| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
+| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
-| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
+| `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. |
-| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
+| `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. |
-| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
+| `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. |
-| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
+| `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. |
-| **CREATES** | config | Complete and auto-filled config file for training. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | file | The config file for training. |
+
+### init fill-config {#init-fill-config new="3"}
+
+Auto-fill a partial [`config.cfg` file](/usage/training#config) file with **all
+default values**, e.g. a config generated with the
+[quickstart widget](/usage/training#quickstart). Config files used for training
+should always be complete and not contain any hidden defaults or missing values,
+so this command helps you create your final training config. In order to find
+the available settings and defaults, all functions referenced in the config will
+be created, and their signatures are used to find the defaults. If your config
+contains a problem that can't be resolved automatically, spaCy will show you a
+validation error with more details.
+
+> ```bash
+> ### Example {wrap="true"}
+> $ python -m spacy init fill-config base.cfg config.cfg
+> ```
+
+```bash
+$ python -m spacy init fill-config [base_path] [output_file] [--diff]
+```
+
+| Argument | Type | Description |
+| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
+| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). |
+| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
+| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | file | Complete and auto-filled config file for training. |

 ### init model {#init-model new="2"}

@@ -166,6 +191,7 @@ $ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
 | `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
 | **CREATES** | model | A spaCy model containing the vocab and vectors. |

 ## Convert {#convert}

@@ -234,34 +260,33 @@ $ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fi

 <Accordion title="Example output" spaced>

-<!-- TODO: update examples with validation error of final config -->
-
 ```
 ✘ Config validation error

-training -> use_gpu field required
-training -> omit_extra_lookups field required
-training -> batch_by field required
-training -> raw_text field required
-training -> tag_map field required
-training -> evaluation_batch_size extra fields not permitted
-training -> vectors extra fields not permitted
-training -> width extra fields not permitted
+training -> dropout field required
+training -> optimizer field required
+training -> optimize extra fields not permitted

-{'gold_preproc': False, 'max_length': 3000, 'limit': 0, 'orth_variant_level': 0.0, 'dropout': 0.1, 'patience': 6000, 'max_epochs': 0, 'max_steps': 100000, 'eval_frequency': 400, 'seed': 0, 'accumulate_gradient': 4, 'width': 768, 'use_pytorch_for_gpu_memory': True, 'scores': ['speed', 'tags_acc', 'uas', 'las', 'ents_f'], 'score_weights': {'las': 0.4, 'ents_f': 0.4, 'tags_acc': 0.2}, 'init_tok2vec': None, 'vectors': None, 'discard_oversize': True, 'evaluation_batch_size': 16, 'batch_size': {'@schedules': 'compounding.v1', 'start': 800, 'stop': 800, 'compound': 1.001}, 'optimizer': {'@optimizers': 'Adam.v1', 'beta1': 0.9, 'beta2': 0.999, 'L2_is_weight_decay': True, 'L2': 0.01, 'grad_clip': 1.0, 'use_averages': False, 'eps': 1e-08, 'learn_rate': {'@schedules': 'warmup_linear.v1', 'warmup_steps': 250, 'total_steps': 20000, 'initial_rate': 5e-05}}}
+{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
+
+If your config contains missing values, you can run the 'init fill-config'
+command to fill in all the defaults, if possible:
+
+python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
 ```

 </Accordion>

 | Argument | Type | Default | Description |
-| --------------------- | ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
-| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
+| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
-| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
+| `--code_path`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
+| `--auto_fill`, `-F` | option | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. |
-| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. |
+| `--output_path`, `-o` | option | Output path where the filled config can be stored. Use '-' for standard output. |
-| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. |
+| `--diff`, `-D` | option | `Show a visual diff if config was auto-filled. |
-| `--help`, `-h` | flag | `False` | Show help message and available arguments. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
-| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
+| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
+| **PRINTS** | stdout | Config validation errors, if available. |

 ### debug data {#debug-data}

@@ -428,15 +453,16 @@ will not be available.

 </Accordion>

 | Argument | Type | Description |
-| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
 | `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
 | `--verbose`, `-V` | flag | Print additional information and explanations. |
 | `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
 | `--help`, `-h` | flag | Show help message and available arguments. |
-| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
+| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
+| **PRINTS** | stdout | Debugging information. |

 ### debug profile {#debug-profile}

@@ -456,11 +482,13 @@ The `profile` command is now available as a subcommand of `spacy debug`.
 $ python -m spacy debug profile [model] [inputs] [--n-texts]
 ```

 | Argument | Type | Description |
-| ----------------- | ----------------------------------------------------------------- | ------------------------------------------------------- |
+| ----------------- | ---------- | ----------------------------------------------------------------- |
 | `model` | positional | A loadable spaCy model. |
 | `inputs` | positional | Optional path to input file, or `-` for standard input. |
 | `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. |
|
| `--n-texts`, `-n` | option | Maximum number of texts to use if available. Defaults to `10000`. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **PRINTS** | stdout | Profiling information for the model. |
|
||||||
|
|
||||||
### debug model {#debug-model}
|
### debug model {#debug-model}
|
||||||
|
|
||||||
|
@ -568,20 +596,21 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Argument | Type | Description | Default |
|
| Argument | Type | Description |
|
||||||
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- | ------- |
|
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- |
|
||||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | |
|
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||||
| `component` | positional | Name of the pipeline component of which the model should be analyzed. | |
|
| `component` | positional | Name of the pipeline component of which the model should be analyzed. |
|
||||||
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. | |
|
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. |
|
||||||
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. | `False` |
|
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. |
|
||||||
| `--parameters`, `-PAR` | option | Show parameters of each layer. | `False` |
|
| `--parameters`, `-PAR` | option | Show parameters of each layer. |
|
||||||
| `--gradients`, `-GRAD` | option | Show gradients of each layer. | `False` |
|
| `--gradients`, `-GRAD` | option | Show gradients of each layer. |
|
||||||
| `--attributes`, `-ATTR` | option | Show attributes of each layer. | `False` |
|
| `--attributes`, `-ATTR` | option | Show attributes of each layer. |
|
||||||
| `--print-step0`, `-P0` | option | Print model before training. | `False` |
|
| `--print-step0`, `-P0` | option | Print model before training. |
|
||||||
| `--print-step1`, `-P1` | option | Print model after initialization. | `False` |
|
| `--print-step1`, `-P1` | option | Print model after initialization. |
|
||||||
| `--print-step2`, `-P2` | option | Print model after training. | `False` |
|
| `--print-step2`, `-P2` | option | Print model after training. |
|
||||||
| `--print-step3`, `-P3` | option | Print final predictions. | `False` |
|
| `--print-step3`, `-P3` | option | Print final predictions. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. | |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **PRINTS** | stdout | Debugging information. |
|
||||||
|
|
||||||
## Train {#train}
|
## Train {#train}
|
||||||
|
|
||||||
|
@ -611,15 +640,15 @@ in the section `[paths]`.
|
||||||
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
|
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||||
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
|
| `--output`, `-o` | option | Directory to store model in. Will be created if it doesn't exist. |
|
||||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||||
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. |
|
||||||
| **CREATES** | model | The final model and the best model. |
|
| **CREATES** | model | The final model and the best model. |
|
||||||
|
|
||||||
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
## Pretrain {#pretrain new="2.1" tag="experimental"}
|
||||||
|
|
||||||
|
@ -649,17 +678,17 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
|
||||||
[--code] [--resume-path] [--epoch-resume] [overrides]
|
[--code] [--resume-path] [--epoch-resume] [overrides]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
|
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. |
|
||||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||||
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
|
||||||
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
|
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
|
||||||
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
|
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
|
||||||
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
|
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
|
||||||
|
|
||||||
## Evaluate {#evaluate new="2"}
|
## Evaluate {#evaluate new="2"}
|
||||||
|
|
||||||
|
@ -687,6 +716,7 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]
|
||||||
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
|
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
|
||||||
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
|
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
|
||||||
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
|
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
|
| **CREATES** | `stdout`, JSON, HTML | Evaluation results and optional metrics and visualizations. |
|
||||||
|
|
||||||
## Package {#package}
|
## Package {#package}
|
||||||
|
@ -826,6 +856,7 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
|
||||||
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
|
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
|
||||||
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
|
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **EXECUTES** | script | The command defined in the `project.yml`. |
|
||||||
|
|
||||||
### project dvc {#project-dvc}
|
### project dvc {#project-dvc}
|
||||||
|
|
||||||
|
@ -859,10 +890,11 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
||||||
> python -m spacy project dvc all
|
> python -m spacy project dvc all
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------- | ---------- | --------------------------------------------------------------------------------- |
|
| ----------------- | ---------- | --------------------------------------------------------------------------------------------- |
|
||||||
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
|
||||||
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
|
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
|
||||||
| `--force`, `-F` | flag | Force-updating config file. |
|
| `--force`, `-F` | flag | Force-updating config file. |
|
||||||
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
|
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **CREATES** | file | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||||
|
|
|
@ -20,8 +20,9 @@ Config files define the training process and model pipeline and can be passed to
|
||||||
[`spacy train`](/api/cli#train). They use
|
[`spacy train`](/api/cli#train). They use
|
||||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||||
hood. For details on how to use training configs, see the
|
hood. For details on how to use training configs, see the
|
||||||
[usage documentation](/usage/training#config). To get started with a blank
|
[usage documentation](/usage/training#config). To get started with the
|
||||||
config or fill a partial config with all defaults, you can use the
|
recommended settings for your use case, check out the
|
||||||
|
[quickstart widget](/usage/training#quickstart) or run the
|
||||||
[`init config`](/api/cli#init-config) command.
|
[`init config`](/api/cli#init-config) command.
|
||||||
|
|
||||||
> #### What does the @ mean?
|
> #### What does the @ mean?
|
||||||
|
@ -182,10 +183,10 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The main data format used in spaCy v3.0 is a **binary format** created by
|
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||||
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
|
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
||||||
`Doc` objects. This means that you can train spaCy models using the same format
|
objects. This means that you can train spaCy models using the same format it
|
||||||
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
|
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
||||||
in storage**, especially when packing multiple documents together.
|
storage**, especially when packing multiple documents together.
|
||||||
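As a quick illustration, here is a minimal sketch of writing and reading such a binary file with [`DocBin`](/api/docbin). The texts and the output path are examples only:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
texts = ["This is a sentence.", "Here is another one."]

# Pack annotated Doc objects into a DocBin and serialize to the binary format
doc_bin = DocBin()
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
doc_bin.to_disk("./sample.spacy")

# Reading the file back restores the Doc objects via the shared vocab
docs = list(DocBin().from_disk("./sample.spacy").get_docs(nlp.vocab))
```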
|
|
||||||
Typically, the extension for these binary files is `.spacy`, and they are used
|
Typically, the extension for these binary files is `.spacy`, and they are used
|
||||||
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
||||||
|
|
|
@ -142,14 +142,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
||||||
|
|
||||||
## DependencyParser.begin_training {#begin_training tag="method"}
|
## DependencyParser.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = nlp.add_pipe("parser")
|
> parser = nlp.add_pipe("parser")
|
||||||
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
|
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -142,14 +142,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
||||||
|
|
||||||
## EntityLinker.begin_training {#begin_training tag="method"}
|
## EntityLinker.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = nlp.add_pipe("entity_linker", last=True)
|
> entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||||
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
|
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -131,14 +131,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
||||||
|
|
||||||
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = nlp.add_pipe("ner")
|
> ner = nlp.add_pipe("ner")
|
||||||
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
|
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -200,12 +200,28 @@ more efficient than processing texts one-by-one.
|
||||||
|
|
||||||
## Language.begin_training {#begin_training tag="method"}
|
## Language.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the pipeline for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples can either be the full training data or a representative sample. They
|
||||||
|
are used to **initialize the models** of trainable pipeline components and are
|
||||||
|
passed to each component's [`begin_training`](/api/pipe#begin_training) method, if
|
||||||
|
available. Initialization includes validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
The `Language.begin_training` method now takes a **function** that is called with no
|
||||||
|
arguments and returns a sequence of [`Example`](/api/example) objects instead of
|
||||||
|
tuples of `Doc` and `GoldParse` objects.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
> get_examples = lambda: examples
|
||||||
> optimizer = nlp.begin_training(get_examples)
|
> optimizer = nlp.begin_training(get_examples)
|
||||||
> ```
|
> ```
|
||||||
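For reference, one way to construct the `get_examples` callback from raw annotations is sketched below. The training texts, entity offsets and the `Example` import path are illustrative and may need adjusting for your spaCy version:

```python
import spacy
from spacy.training import Example  # in some v3 nightlies this may live in a different module

nlp = spacy.blank("en")
nlp.add_pipe("ner")

# Hypothetical raw training data: (text, annotations) pairs
TRAIN_DATA = [
    ("I like London.", {"entities": [(7, 13, "GPE")]}),
    ("Berlin is nice.", {"entities": [(0, 6, "GPE")]}),
]

def get_examples():
    # Return an iterable of Example objects built from the raw annotations
    return [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in TRAIN_DATA
    ]

optimizer = nlp.begin_training(get_examples)
```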
|
|
||||||
|
@ -276,7 +292,7 @@ and custom registered functions if needed. See the
|
||||||
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Language.rehearse {#rehearse tag="method,experimental"}
|
## Language.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
current model to make predictions similar to an initial model, to try to address
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
@ -302,6 +318,13 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
|
|
||||||
Evaluate a model's pipeline components.
|
Evaluate a model's pipeline components.
|
||||||
|
|
||||||
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
The `Language.evaluate` method now takes a batch of [`Example`](/api/example)
|
||||||
|
objects instead of tuples of `Doc` and `GoldParse` objects.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
|
|
@ -121,15 +121,21 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
|
||||||
|
|
||||||
## Morphologizer.begin_training {#begin_training tag="method"}
|
## Morphologizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = nlp.add_pipe("morphologizer")
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> nlp.pipeline.append(morphologizer)
|
> nlp.pipeline.append(morphologizer)
|
||||||
> optimizer = morphologizer.begin_training(pipeline=nlp.pipeline)
|
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -9,8 +9,8 @@ components like the [`EntityRecognizer`](/api/entityrecognizer) or
|
||||||
[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
|
[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
|
||||||
interface that components should follow to function as trainable components in a
|
interface that components should follow to function as trainable components in a
|
||||||
spaCy pipeline. See the docs on
|
spaCy pipeline. See the docs on
|
||||||
[writing trainable components](/usage/processing-pipelines#trainable) for how to
|
[writing trainable components](/usage/processing-pipelines#trainable-components)
|
||||||
use the `Pipe` base class to implement custom components.
|
for how to use the `Pipe` base class to implement custom components.
|
||||||
|
|
||||||
> #### Why is Pipe implemented in Cython?
|
> #### Why is Pipe implemented in Cython?
|
||||||
>
|
>
|
||||||
|
@ -45,18 +45,12 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#create_pipe).
|
||||||
|
|
||||||
<Infobox variant="danger">
|
| Name | Type | Description |
|
||||||
|
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
This method needs to be overwritten with your own custom `__init__` method.
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
</Infobox>
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
|
| `**cfg` | | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
|
||||||
| Name | Type | Description |
|
|
||||||
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
|
||||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
|
||||||
| `**cfg` | | Additional config parameters and settings. |
|
|
||||||
|
|
||||||
## Pipe.\_\_call\_\_ {#call tag="method"}
|
## Pipe.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -106,14 +100,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
|
||||||
|
|
||||||
## Pipe.begin_training {#begin_training tag="method"}
|
## Pipe.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> pipe = nlp.add_pipe("your_custom_pipe")
|
> pipe = nlp.add_pipe("your_custom_pipe")
|
||||||
> optimizer = pipe.begin_training(pipeline=nlp.pipeline)
|
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -176,12 +176,6 @@ method.
|
||||||
Learn from a batch of [`Example`](/api/example) objects containing the
|
Learn from a batch of [`Example`](/api/example) objects containing the
|
||||||
predictions and gold-standard annotations, and update the component's model.
|
predictions and gold-standard annotations, and update the component's model.
|
||||||
|
|
||||||
<Infobox variant="danger">
|
|
||||||
|
|
||||||
This method needs to be overwritten with your own custom `update` method.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
@ -200,7 +194,7 @@ This method needs to be overwritten with your own custom `update` method.
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Pipe.rehearse {#rehearse tag="method,experimental"}
|
## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
current model to make predictions similar to an initial model, to try to address
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
@ -378,6 +372,15 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Pipe` | The pipe. |
|
| **RETURNS** | `Pipe` | The pipe. |
|
||||||
|
|
||||||
|
## Attributes {#attributes}
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary that's passed in on initialization. |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component. |
|
||||||
|
| `name` | str | The name of the component instance in the pipeline. Can be used in the losses. |
|
||||||
|
| `cfg` | dict | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. |
|
||||||
|
|
||||||
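For illustration, assuming a custom component registered under the (hypothetical) name `"your_custom_pipe"` has been added to the pipeline, these attributes can be inspected like this:

```python
# "your_custom_pipe" is a hypothetical registered trainable component
pipe = nlp.add_pipe("your_custom_pipe")
print(pipe.name)                # instance name, usable as a key in the losses
print(pipe.vocab is nlp.vocab)  # True: the shared vocabulary passed in on initialization
print(pipe.cfg)                 # additional keyword arguments passed to __init__
```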
## Serialization fields {#serialization-fields}
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
During serialization, spaCy will export several data fields used to restore
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
|
|
@ -116,14 +116,20 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
|
||||||
|
|
||||||
## SentenceRecognizer.begin_training {#begin_training tag="method"}
|
## SentenceRecognizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> senter = nlp.add_pipe("senter")
|
> senter = nlp.add_pipe("senter")
|
||||||
> optimizer = senter.begin_training(pipeline=nlp.pipeline)
|
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -193,7 +199,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental"}
|
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
current model to make predictions similar to an initial model, to try to address
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
|
|
@ -114,14 +114,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
||||||
|
|
||||||
## Tagger.begin_training {#begin_training tag="method"}
|
## Tagger.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = nlp.add_pipe("tagger")
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
|
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -191,7 +197,7 @@ Delegates to [`predict`](/api/tagger#predict) and
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Tagger.rehearse {#rehearse tag="method,experimental"}
|
## Tagger.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
current model to make predictions similar to an initial model, to try to address
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
|
|
@ -122,14 +122,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
||||||
|
|
||||||
## TextCategorizer.begin_training {#begin_training tag="method"}
|
## TextCategorizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = nlp.add_pipe("textcat")
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
|
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -199,7 +205,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## TextCategorizer.rehearse {#rehearse tag="method,experimental"}
|
## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||||
|
|
||||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
current model to make predictions similar to an initial model, to try to address
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
|
|
@ -125,14 +125,20 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
|
||||||
|
|
||||||
## Tok2Vec.begin_training {#begin_training tag="method"}
|
## Tok2Vec.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tok2vec = nlp.add_pipe("tok2vec")
|
> tok2vec = nlp.add_pipe("tok2vec")
|
||||||
> optimizer = tok2vec.begin_training(pipeline=nlp.pipeline)
|
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -159,14 +159,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
|
||||||
|
|
||||||
## Transformer.begin_training {#begin_training tag="method"}
|
## Transformer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Returns an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
|
function that returns an iterable of [`Example`](/api/example) objects. The data
|
||||||
|
examples are used to **initialize the model** of the component and can either be
|
||||||
|
the full training data or a representative sample. Initialization includes
|
||||||
|
validating the network,
|
||||||
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
|
setting up the label scheme based on the data.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> trf = nlp.add_pipe("transformer")
|
> trf = nlp.add_pipe("transformer")
|
||||||
> optimizer = trf.begin_training(pipeline=nlp.pipeline)
|
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -371,54 +377,64 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
## TransformerData {#transformerdata tag="dataclass"}
|
## TransformerData {#transformerdata tag="dataclass"}
|
||||||
|
|
||||||
Transformer tokens and outputs for one `Doc` object.
|
Transformer tokens and outputs for one `Doc` object. The transformer models
|
||||||
|
return tensors that refer to a whole padded batch of documents. These tensors
|
||||||
|
are wrapped into the
|
||||||
|
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object. The
|
||||||
|
`FullTransformerBatch` then splits out the per-document data, which is handled
|
||||||
|
by this class. Instances of this class
|
||||||
|
are typically assigned to the [`Doc._.trf_data`](/api/transformer#custom-attributes)
|
||||||
|
extension attribute.
|
||||||
|
|
||||||
<!-- TODO: finish API docs, also mention "width" is property -->
|
| Name | Type | Description |
|
||||||
|
| --------- | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| Name | Type | Description |
|
| `tokens` | `Dict` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. |
|
||||||
| --------- | -------------------------------------------------- | ----------- |
|
| `tensors` | `List[FloatsXd]` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. |
|
||||||
| `tokens` | `Dict` | |
|
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. |
|
||||||
| `tensors` | `List[FloatsXd]` | |
|
| `width` | int | The width of the last hidden layer. |
|
||||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
|
|
||||||
| `width` | int | |
|
|
||||||
|
|
||||||
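For illustration, assuming a transformer-based pipeline such as `en_core_web_trf` (via `spacy-transformers`) is installed, the per-document data can be inspected through the extension attribute:

```python
import spacy

nlp = spacy.load("en_core_web_trf")  # assumes the trained transformer pipeline is installed
doc = nlp("Apple shares rose sharply.")

trf_data = doc._.trf_data
print(trf_data.tensors[-1].shape)  # final hidden states for this Doc's slice of the batch
print(trf_data.align.lengths)      # number of wordpieces aligned to each token
print(trf_data.width)              # width of the last hidden layer
```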
### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
|
### TransformerData.empty {#transformerdata-empty tag="classmethod"}
|
||||||
|
|
||||||
<!-- TODO: finish API docs -->
|
Create an empty `TransformerData` container.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------- | ----------- |
|
| ----------- | ----------------- | -------------- |
|
||||||
| **RETURNS** | `TransformerData` | |
|
| **RETURNS** | `TransformerData` | The container. |
|
||||||
|
|
||||||
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
|
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
|
||||||
|
|
||||||
<!-- TODO: write, also mention doc_data is property -->
|
Holds a batch of input and output objects for a transformer model. The data can
|
||||||
|
then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
|
||||||
|
objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- |
|
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `spans` | `List[List[Span]]` | |
|
| `spans` | `List[List[Span]]` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. |
|
||||||
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
|
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | The output of the tokenizer. |
|
||||||
| `tensors` | `List[torch.Tensor]` | |
|
| `tensors` | `List[torch.Tensor]` | The output of the transformer model. |
|
||||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
|
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. |
|
||||||
| `doc_data` | `List[TransformerData]` | |
|
| `doc_data` | `List[TransformerData]` | The outputs, split per `Doc` object. |
|
||||||
|
|
||||||
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
|
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
|
||||||
|
|
||||||
<!-- TODO: write -->
|
Return a new `FullTransformerBatch` from a split batch of activations, using the
|
||||||
|
current object's spans, tokens and alignment. This is used during the backward
|
||||||
|
pass, in order to construct the gradients to pass back into the transformer
|
||||||
|
model.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------------- | ----------- |
|
| ----------- | ---------------------- | ------------------------------- |
|
||||||
| `arrays` | `List[List[Floats3d]]` | |
|
| `arrays` | `List[List[Floats3d]]` | The split batch of activations. |
|
||||||
| **RETURNS** | `FullTransformerBatch` | |
|
| **RETURNS** | `FullTransformerBatch` | The transformer batch. |
|
||||||
|
|
||||||
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
|
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
|
||||||
|
|
||||||
Split a `TransformerData` object that represents a batch into a list with one
|
Split a `TransformerData` object that represents a batch into a list with one
|
||||||
`TransformerData` per `Doc`.
|
`TransformerData` per `Doc`.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------------- | ----------- |
|
| ----------- | ----------------------- | ---------------- |
|
||||||
| **RETURNS** | `List[TransformerData]` | |
|
| **RETURNS** | `List[TransformerData]` | The split batch. |
|
||||||
|
|
||||||
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||||
|
|
||||||
|
|
|
@ -45,9 +45,9 @@ three components:
|
||||||
2. **Genre:** Type of text the model is trained on, e.g. `web` or `news`.
|
2. **Genre:** Type of text the model is trained on, e.g. `web` or `news`.
|
||||||
3. **Size:** Model size indicator, `sm`, `md` or `lg`.
|
3. **Size:** Model size indicator, `sm`, `md` or `lg`.
|
||||||
|
|
||||||
For example, `en_core_web_sm` is a small English model trained on written web
|
For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English
|
||||||
text (blogs, news, comments), that includes vocabulary, vectors, syntax and
|
model trained on written web text (blogs, news, comments) that includes
|
||||||
entities.
|
vocabulary, vectors, syntax and entities.
|
||||||
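Assuming the package is installed, the model is loaded by that full name:

```python
import spacy

# Load the small English web model by its package name
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
```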
|
|
||||||
### Model versioning {#model-versioning}
|
### Model versioning {#model-versioning}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ menu:
|
||||||
- ['Processing Text', 'processing']
|
- ['Processing Text', 'processing']
|
||||||
- ['How Pipelines Work', 'pipelines']
|
- ['How Pipelines Work', 'pipelines']
|
||||||
- ['Custom Components', 'custom-components']
|
- ['Custom Components', 'custom-components']
|
||||||
# - ['Trainable Components', 'trainable-components']
|
|
||||||
- ['Extension Attributes', 'custom-components-attributes']
|
- ['Extension Attributes', 'custom-components-attributes']
|
||||||
- ['Plugins & Wrappers', 'plugins']
|
- ['Plugins & Wrappers', 'plugins']
|
||||||
---
|
---
|
||||||
|
@@ -885,15 +884,117 @@ available, falls back to looking up the regular factory name.

 </Infobox>

-<!-- TODO:
-## Trainable components {#trainable-components new="3"}
+### Trainable components {#trainable-components new="3"}

 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline.
-
---->
+plug fully custom machine learning components into your pipeline. You'll need
+the following:
+
+1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
+   can be a model using [layers](https://thinc.ai/docs/api-layers) implemented
+   in Thinc, or a [wrapped model](https://thinc.ai/docs/usage-frameworks)
+   implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The
+   model must take a list of [`Doc`](/api/doc) objects as input and can have any
+   type of output.
+2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
+   two methods: [`Pipe.predict`](/api/pipe#predict) and
+   [`Pipe.set_annotations`](/api/pipe#set_annotations).
+3. **Component factory:** A component factory registered with
+   [`@Language.factory`](/api/language#factory) that takes the `nlp` object and
+   component `name` and optional settings provided by the config and returns an
+   instance of your trainable component.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline import Pipe
+> from spacy.language import Language
+>
+> class TrainableComponent(Pipe):
+>     def predict(self, docs):
+>         ...
+>
+>     def set_annotations(self, docs, scores):
+>         ...
+>
+> @Language.factory("my_trainable_component")
+> def make_component(nlp, name, model):
+>     return TrainableComponent(nlp.vocab, model, name=name)
+> ```
+
+| Name | Description |
+| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| [`predict`](/api/pipe#predict) | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores. |
+| [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`. |
+
+By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the
+[`Model`](https://thinc.ai/docs/api-model) and the name of the component
+instance in the pipeline, which you can use as a key in the losses. All other
+keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will
+also be serialized with the component.
+
+<Accordion title="Why components should be passed a Model instance, not create it" spaced>
+
+spaCy's [config system](/usage/training#config) resolves the config describing
+the pipeline components and models **bottom-up**. This means that it will
+_first_ create a `Model` from a [registered architecture](/api/architectures),
+validate its arguments and _then_ pass the object forward to the component. This
+means that the config can express very complex, nested trees of objects – but
+the objects don't have to pass the model settings all the way down to the
+components. It also makes the components more **modular** and lets you swap
+different architectures in your config, and re-use model definitions.
+
+```ini
+### config.cfg (excerpt)
+[components]
+
+[components.textcat]
+factory = "textcat"
+labels = []
+
+# This function is created and then passed to the "textcat" component as
+# the argument "model"
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+pretrained_vectors = null
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+dropout = null
+
+[components.other_textcat]
+factory = "textcat"
+# This references the [components.textcat.model] block above
+model = ${components.textcat.model}
+labels = []
+```
+
+Your trainable pipeline component factories should therefore always take a
+`model` argument instead of instantiating the
+[`Model`](https://thinc.ai/docs/api-model) inside the component. To register
+custom architectures, you can use the
+[`@spacy.registry.architectures`](/api/top-level#registry) decorator. Also see
+the [training guide](/usage/training#config) for details.
+
+</Accordion>
+
+For some use cases, it makes sense to also overwrite additional methods to
+customize how the model is updated from examples, how it's initialized, how the
+loss is calculated and to add evaluation scores to the training output.
+
+| Name | Description |
+| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
+| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
+| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+
+<!-- TODO: add more details, examples and maybe an example project -->

 ## Extension attributes {#custom-components-attributes new="2"}

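As a rough illustration of the methods in the table above, the following sketch fills in `get_loss` and `begin_training` for a hypothetical component. The squared-error loss and the `_examples_to_truth` helper are assumptions made purely for illustration, not spaCy's own implementation:

```python
from spacy.pipeline import Pipe

class TrainableComponent(Pipe):
    # ... predict() and set_annotations() as in the example above ...

    def get_loss(self, examples, scores):
        # Return (loss, gradient) for the batch. A squared-error loss is
        # used here only as an example.
        truths = self._examples_to_truth(examples)  # hypothetical helper
        d_scores = scores - truths
        return float((d_scores ** 2).sum()), d_scores

    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
        # Initialize the model's weights and return an optimizer, creating
        # one with Pipe.create_optimizer if none is provided.
        self.model.initialize()
        return sgd if sgd is not None else self.create_optimizer()
```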
@@ -37,27 +37,37 @@ The recommended way to train your spaCy models is via the
 single [`config.cfg`](#config) **configuration file** that includes all settings
 and hyperparameters. You can optionally [overwrite](#config-overrides)
 settings on the command line, and load in a Python file to register
-[custom functions](#custom-code) and architectures.
+[custom functions](#custom-code) and architectures. This quickstart widget helps
+you generate a starter config with the **recommended settings** for your
+specific use case. It's also available in spaCy as the
+[`init config`](/api/cli#init-config) command.

-> #### Instructions
+> #### Instructions: widget
 >
 > 1. Select your requirements and settings.
 > 2. Use the buttons at the bottom to save the result to your clipboard or a
 >    file `base_config.cfg`.
-> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
+> 3. Run [`init fill-config`](/api/cli#init-fill-config) to create a full
+>    config.
 > 4. Run [`train`](/api/cli#train) with your config and data.
+>
+> #### Instructions: CLI
+>
+> 1. Run the [`init config`](/api/cli#init-config) command and specify your
+>    requirements and settings as CLI arguments.
+> 2. Run [`train`](/api/cli#train) with the exported config and data.

 import QuickstartTraining from 'widgets/quickstart-training.js'

 <QuickstartTraining download="base_config.cfg" />

 After you've saved the starter config to a file `base_config.cfg`, you can use
-the [`init config`](/api/cli#init-config) command to fill in the remaining
-defaults. Training configs should always be **complete and without hidden
-defaults**, to keep your experiments reproducible.
+the [`init fill-config`](/api/cli#init-fill-config) command to fill in the
+remaining defaults. Training configs should always be **complete and without
+hidden defaults**, to keep your experiments reproducible.

 ```bash
-$ python -m spacy init config config.cfg --base base_config.cfg
+$ python -m spacy init fill-config base_config.cfg config.cfg
 ```

 > #### Tip: Debug your data
@@ -70,10 +80,13 @@ $ python -m spacy init config config.cfg --base base_config.cfg
 > $ python -m spacy debug data config.cfg --verbose
 > ```

-You can now add your data and run [`train`](/api/cli#train) with your config.
-See the [`convert`](/api/cli#convert) command for details on how to convert your
-data to spaCy's binary `.spacy` format. You can either include the data paths in
-the `[paths]` section of your config, or pass them in via the command line.
+Instead of exporting your starter config from the quickstart widget and
+auto-filling it, you can also use the [`init config`](/api/cli#init-config)
+command and specify your requirements and settings as CLI arguments. You can now
+add your data and run [`train`](/api/cli#train) with your config. See the
+[`convert`](/api/cli#convert) command for details on how to convert your data to
+spaCy's binary `.spacy` format. You can either include the data paths in the
+`[paths]` section of your config, or pass them in via the command line.

 ```bash
 $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
@@ -601,7 +614,7 @@ settings in the block will be passed to the function as keyword arguments. Keep
 in mind that the config shouldn't have any hidden defaults and all arguments on
 the functions need to be represented in the config. If your function defines
 **default argument values**, spaCy is able to auto-fill your config when you run
-[`init config`](/api/cli#init-config).
+[`init fill-config`](/api/cli#init-fill-config).

 ```ini
 ### config.cfg (excerpt)
@@ -687,13 +700,13 @@ give you everything you need to train fully custom models with

 </Infobox>

-<!-- TODO: maybe add something about why the Example class is great and its benefits, and how it's passed around, holds the alignment etc -->
-
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
-gold-standard annotations. Here's an example of a simple `Example` for
-part-of-speech tags:
+gold-standard annotations. It also includes the **alignment** between those two
+documents if they differ in tokenization. The `Example` class ensures that spaCy
+can rely on one **standardized format** that's passed through the pipeline.
+Here's an example of a simple `Example` for part-of-speech tags:

 ```python
 words = ["I", "like", "stuff"]
@@ -744,7 +757,8 @@ example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O"

 As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class.
 It can be constructed in a very similar way, from a `Doc` and a dictionary of
-annotations:
+annotations. For more details, see the
+[migration guide](/usage/v3#migrating-training).

 ```diff
 - gold = GoldParse(doc, entities=entities)
@@ -163,8 +163,9 @@ resolved, the function is created and passed into the model as an argument.
 Remember that the `config.cfg` used for training should contain **no missing
 values** and requires all settings to be defined. You don't want any hidden
 defaults creeping in and changing your results! spaCy will tell you if settings
-are missing, and you can run [`spacy init config`](/api/cli#init-config) with to
-automatically fill in all defaults.
+are missing, and you can run
+[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
+all defaults.

 </Infobox>

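To illustrate how default argument values feed the auto-fill step, here is a hedged sketch of a registered architecture; the registry name `"CustomLinear.v1"` and the layer sizes are made up for this example:

```python
from thinc.api import Linear, Model
import spacy

@spacy.registry.architectures("CustomLinear.v1")
def create_custom_linear(nO: int = 2, nI: int = 64) -> Model:
    # Because nO and nI have defaults, `spacy init fill-config` can complete a
    # config block that only sets @architectures = "CustomLinear.v1".
    return Linear(nO=nO, nI=nI)
```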
@@ -14,12 +14,49 @@ menu:

 ### New training workflow and config system {#features-training}

+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:** [Training models](/usage/training)
+- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config),
+  [`Config`](https://thinc.ai/docs/api-config#config)
+- **CLI:** [`train`](/api/cli#train), [`pretrain`](/api/cli#pretrain),
+  [`evaluate`](/api/cli#evaluate)
+- **API:** [Config format](/api/data-formats#config),
+  [`registry`](/api/top-level#registry)
+
+</Infobox>
+
 ### Transformer-based pipelines {#features-transformers}

+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:** [Transformers](/usage/transformers),
+  [Training models](/usage/training)
+- **API:** [`Transformer`](/api/transformer),
+  [`TransformerData`](/api/transformer#transformerdata),
+  [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
+- **Architectures:** [TransformerModel](/api/architectures#TransformerModel),
+  [Tok2VecListener](/api/architectures#transformers-Tok2VecListener),
+  [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
+- **Models:** [`en_core_bert_sm`](/models/en)
+- **Implementation:**
+  [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
+
+</Infobox>
+
 ### Custom models using any framework {#feautres-custom-models}

 ### Manage end-to-end workflows with projects {#features-projects}

+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:** [spaCy projects](/usage/projects),
+  [Training models](/usage/training)
+- **CLI:** [`project`](/api/cli#project), [`train`](/api/cli#train)
+- **Templates:** [`projects`](https://github.com/explosion/projects)
+
+</Infobox>
+
 ### New built-in pipeline components {#features-pipeline-components}

 | Name | Description |
@@ -30,14 +67,48 @@ menu:
 | [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
 | [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |

+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:** [Processing pipelines](/usage/processing-pipelines)
+- **API:** [Built-in pipeline components](/api#architecture-pipeline)
+- **Implementation:**
+  [`spacy/pipeline`](https://github.com/explosion/spaCy/tree/develop/spacy/pipeline)
+
+</Infobox>
+
 ### New and improved pipeline component APIs {#features-components}

 - `Language.factory`, `Language.component`
 - `Language.analyze_pipes`
 - Adding components from other models

+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:** [Custom components](/usage/processing-pipelines#custom_components),
+  [Defining components during training](/usage/training#config-components)
+- **API:** [`Language`](/api/language)
+- **Implementation:**
+  [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py)
+
+</Infobox>
+
 ### Type hints and type-based data validation {#features-types}

+> #### Example
+>
+> ```python
+> from spacy.language import Language
+> from pydantic import StrictBool
+>
+> @Language.factory("my_component")
+> def create_my_component(
+>     nlp: Language,
+>     name: str,
+>     custom: StrictBool
+> ):
+>     ...
+> ```
+
 spaCy v3.0 officially drops support for Python 2 and now requires **Python
 3.6+**. This also means that the code base can take full advantage of
 [type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
@@ -54,13 +125,37 @@ validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
 lets you register **custom functions with typed arguments**, reference them
 in your config and see validation errors if the argument values don't match.

-### CLI
+<Infobox title="Details & Documentation" emoji="📖" list>

-| Name | Description |
-| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
-| [`init config`](/api/cli#init-config)   | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
-| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
-| [`project`](/api/cli#project)           | Subcommand for cloning and running [spaCy projects](/usage/projects). |
+- **Usage:**
+  [Component type hints and validation](/usage/processing-pipelines#type-hints),
+  [Training with custom code](/usage/training#custom-code)
+- **Thinc:**
+  [Type checking in Thinc](https://thinc.ai/docs/usage-type-checking),
+  [Thinc's config system](https://thinc.ai/docs/usage-config)
+
+</Infobox>
+
+### New methods, attributes and commands
+
+The following methods, attributes and commands are new in spaCy v3.0.
+
+| Name | Description |
+| ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
+| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
+| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
+| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
+| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
+| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
+| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
+| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
+| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
+| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
+| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. |
+| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. |
+| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
+| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |

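A few of these additions can be shown in one short, hedged snippet; it assumes a small pretrained pipeline such as `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.analyze_pipes(pretty=True)  # summarize components and their interdependencies

with nlp.select_pipes(disable=["parser", "ner"]):
    doc = nlp("Only the remaining components run inside this block.")

print(nlp("hello")[0].lex.orth_)  # Token.lex gives access to the underlying Lexeme
```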
 ## Backwards Incompatibilities {#incompat}

@@ -70,12 +165,21 @@ usability. The following section lists the relevant changes to the user-facing
 API. For specific examples of how to rewrite your code, check out the
 [migration guide](#migrating).

-### Compatibility {#incompat-compat}
+<Infobox variant="warning">

-- spaCy now requires **Python 3.6+**.
+Note that spaCy v3.0 now requires **Python 3.6+**.
+
+</Infobox>

 ### API changes {#incompat-api}

+- Model symlinks, the `link` command and shortcut names are now deprecated.
+  There can be many [different models](/models) and not just one "English
+  model", so you should always use the full model name like
+  [`en_core_web_sm`](/models/en) explicitly.
+- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
+  only take a `config.cfg` file containing the full
+  [training config](/usage/training#config).
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
@@ -87,6 +191,20 @@ API. For specific examples of how to rewrite your code, check out the
 - The `Language.disable_pipes` contextmanager has been replaced by
   [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
   disable or enable components.
+- The [`Language.update`](/api/language#update),
+  [`Language.evaluate`](/api/language#evaluate) and
+  [`Pipe.update`](/api/pipe#update) methods now all take batches of
+  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
+  raw text and a dictionary of annotations.
+  [`Language.begin_training`](/api/language#begin_training) and
+  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
+  returns a sequence of `Example` objects to initialize the model instead of a
+  list of tuples.
+- [`Matcher.add`](/api/matcher#add),
+  [`PhraseMatcher.add`](/api/phrasematcher#add) and
+  [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
+  of patterns as the second argument (instead of a variable number of
+  arguments). The `on_match` callback becomes an optional keyword argument.

 ### Removed or renamed API {#incompat-removed}

@@ -96,6 +214,7 @@ API. For specific examples of how to rewrite your code, check out the
 | `GoldParse` | [`Example`](/api/example) |
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
 | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
+| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
 | `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |

 The following deprecated methods, attributes and arguments were removed in v3.0.
@@ -121,7 +240,7 @@ on them.
 Model symlinks and shortcuts like `en` are now officially deprecated. There are
 [many different models](/models) with different capabilities and not just one
 "English model". In order to download and load a model, you should always use
-its full name – for instance, `en_core_web_sm`.
+its full name – for instance, [`en_core_web_sm`](/models/en#en_core_web_sm).

 ```diff
 - python -m spacy download en
@@ -224,6 +343,51 @@ and you typically shouldn't have to use it in your code.
 + parser = nlp.add_pipe("parser")
 ```

+If you need to add a component from an existing pretrained model, you can now
+use the `source` argument on [`nlp.add_pipe`](/api/language#add_pipe). This will
+check that the component is compatible, and take care of porting over all
+config. During training, you can also reference existing pretrained components
+in your [config](/usage/training#config-components) and decide whether or not
+they should be updated with more data.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.ner]
+> source = "en_core_web_sm"
+> component = "ner"
+> ```
+
+```diff
+source_nlp = spacy.load("en_core_web_sm")
+nlp = spacy.blank("en")
+- ner = source_nlp.get_pipe("ner")
+- nlp.add_pipe(ner)
++ nlp.add_pipe("ner", source=source_nlp)
+```
+
+### Adding match patterns {#migrating-matcher}
+
+The [`Matcher.add`](/api/matcher#add),
+[`PhraseMatcher.add`](/api/phrasematcher#add) and
+[`DependencyMatcher.add`](/api/dependencymatcher#add) methods now only accept a
+**list of patterns** as the second argument (instead of a variable number of
+arguments). The `on_match` callback becomes an optional keyword argument.
+
+```diff
+matcher = Matcher(nlp.vocab)
+patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
+- matcher.add("GoogleNow", on_match, *patterns)
++ matcher.add("GoogleNow", patterns, on_match=on_match)
+```
+
+```diff
+matcher = PhraseMatcher(nlp.vocab)
+patterns = [nlp("health care reform"), nlp("healthcare reform")]
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
+```
+
|
### Training models {#migrating-training}
|
||||||
|
|
||||||
To train your models, you should now pretty much always use the
|
To train your models, you should now pretty much always use the
|
||||||
|
@ -233,15 +397,20 @@ use a [flexible config file](/usage/training#config) that describes all training
|
||||||
settings and hyperparameters, as well as your pipeline, model components and
|
settings and hyperparameters, as well as your pipeline, model components and
|
||||||
architectures to use. The `--code` argument lets you pass in code containing
|
architectures to use. The `--code` argument lets you pass in code containing
|
||||||
[custom registered functions](/usage/training#custom-code) that you can
|
[custom registered functions](/usage/training#custom-code) that you can
|
||||||
reference in your config.
|
reference in your config. To get started, check out the
|
||||||
|
[quickstart widget](/usage/training#quickstart).
|
||||||
|
|
||||||
#### Binary .spacy training data format {#migrating-training-format}
|
#### Binary .spacy training data format {#migrating-training-format}
|
||||||
|
|
||||||
spaCy now uses a new
|
spaCy v3.0 uses a new
|
||||||
[binary training data format](/api/data-formats#binary-training), which is much
|
[binary training data format](/api/data-formats#binary-training) created by
|
||||||
smaller and consists of `Doc` objects, serialized via the
|
serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
|
||||||
[`DocBin`](/api/docbin). You can convert your existing JSON-formatted data using
|
objects. This means that you can train spaCy models using the same format it
|
||||||
the [`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files:
|
outputs: annotated `Doc` objects. The binary format is extremely **efficient in
|
||||||
|
storage**, especially when packing multiple documents together.
|
||||||
|
|
||||||
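As a hedged sketch of what producing such a file can look like from Python (the example text and entity offsets mirror the migration examples below; `DocBin.to_disk` and the file name `train.spacy` are assumptions based on the v3 API):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
doc = nlp.make_doc("Mark Zuckerberg is the CEO of Facebook")
doc.ents = [doc.char_span(0, 15, label="PERSON"), doc.char_span(30, 38, label="ORG")]
doc_bin.add(doc)
doc_bin.to_disk("./train.spacy")  # the same format `spacy convert` produces
```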
+You can convert your existing JSON-formatted data using the
+[`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files:

 ```bash
 $ python -m spacy convert ./training.json ./output
@@ -273,13 +442,72 @@ workflows, from data preprocessing to training and packaging your model.

 </Project>

-#### Migrating training scripts to CLI command and config {#migrating-training-scripts}
-
-<!-- TODO: write -->
-
 #### Training via the Python API {#migrating-training-python}

-<!-- TODO: this should explain the GoldParse -> Example stuff -->
+For most use cases, you **shouldn't** have to write your own training scripts
+anymore. Instead, you can use [`spacy train`](/api/cli#train) with a
+[config file](/usage/training#config) and custom
+[registered functions](/usage/training#custom-code) if needed. You can even
+register callbacks that can modify the `nlp` object at different stages of its
+lifecycle to fully customize it before training.
+
+If you do decide to use the [internal training API](/usage/training#api) from
+Python, you should only need a few small modifications to convert your scripts
+from spaCy v2.x to v3.x. The [`Example.from_dict`](/api/example#from_dict)
+classmethod takes a reference `Doc` and a
+[dictionary of annotations](/api/data-formats#dict-input), similar to the
+"simple training style" in spaCy v2.x:
+
+```diff
+### Migrating Doc and GoldParse
+doc = nlp.make_doc("Mark Zuckerberg is the CEO of Facebook")
+entities = [(0, 15, "PERSON"), (30, 38, "ORG")]
+- gold = GoldParse(doc, entities=entities)
++ example = Example.from_dict(doc, {"entities": entities})
+```
+
+```diff
+### Migrating simple training style
+text = "Mark Zuckerberg is the CEO of Facebook"
+annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]}
++ doc = nlp.make_doc(text)
++ example = Example.from_dict(doc, annotations)
+```
+
+The [`Language.update`](/api/language#update),
+[`Language.evaluate`](/api/language#evaluate) and
+[`Pipe.update`](/api/pipe#update) methods now all take batches of
+[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
+raw text and a dictionary of annotations.
+
+```python
+### Training loop {highlight="11"}
+TRAIN_DATA = [
+    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+    ("I like London.", {"entities": [(7, 13, "LOC")]}),
+]
+nlp.begin_training()
+for i in range(20):
+    random.shuffle(TRAIN_DATA)
+    for batch in minibatch(TRAIN_DATA):
+        examples = []
+        for text, annots in batch:
+            examples.append(Example.from_dict(nlp.make_doc(text), annots))
+        nlp.update(examples)
+```
+
+[`Language.begin_training`](/api/language#begin_training) and
+[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
+returns a sequence of `Example` objects to initialize the model instead of a
+list of tuples. The data examples are used to **initialize the models** of
+trainable pipeline components, which includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme.
+
+```diff
+- nlp.begin_training(examples)
++ nlp.begin_training(lambda: examples)
+```
|
#### Packaging models {#migrating-training-packaging}
|
||||||
|
|
||||||
|
|
5
website/package-lock.json
generated
5
website/package-lock.json
generated
|
@ -17455,6 +17455,11 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"jinja-to-js": {
|
||||||
|
"version": "3.2.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/jinja-to-js/-/jinja-to-js-3.2.3.tgz",
|
||||||
|
"integrity": "sha512-ktEBxQG17fYaFcHThB719+EbePBx+AkkORQMyuP0UuLPS2zx8uJXP5CsItXjUUwMHFPj3hCRkyqEYzLbeklYgQ=="
|
||||||
|
},
|
||||||
"jpeg-js": {
|
"jpeg-js": {
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",
|
||||||
|
|
|
@@ -41,6 +41,7 @@
     "gatsby-transformer-sharp": "^2.1.13",
     "html-to-react": "^1.3.4",
     "intersection-observer": "^0.5.1",
+    "jinja-to-js": "^3.2.3",
     "node-sass": "^4.11.0",
     "parse-numeric-range": "0.0.2",
     "prismjs": "^1.15.0",
@@ -52,20 +53,22 @@
     "remark-react": "^5.0.1"
   },
   "scripts": {
-    "build": "gatsby build",
-    "dev": "gatsby develop",
+    "build": "npm run python:setup && gatsby build",
+    "dev": "npm run python:setup && gatsby develop",
     "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
     "lint": "eslint **",
     "clear": "rm -rf .cache",
-    "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
+    "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
+    "python:install": "pip install -r setup/requirements.txt",
+    "python:setup": "cd setup && ./setup.sh"
   },
   "devDependencies": {
+    "@sindresorhus/slugify": "^0.8.0",
     "browser-monads": "^1.0.0",
     "md-attr-parser": "^1.2.1",
     "prettier": "^1.16.4",
     "raw-loader": "^1.0.0",
-    "unist-util-visit": "^1.4.0",
-    "@sindresorhus/slugify": "^0.8.0"
+    "unist-util-visit": "^1.4.0"
   },
   "repository": {
     "type": "git",
1261
website/setup/jinja_to_js.py
Normal file
1261
website/setup/jinja_to_js.py
Normal file
File diff suppressed because it is too large
Load Diff
3
website/setup/requirements.txt
Normal file
3
website/setup/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
# These are used to compile the training quickstart config
|
||||||
|
jinja2
|
||||||
|
typer
|
1
website/setup/setup.sh
Executable file
1
website/setup/setup.sh
Executable file
|
@ -0,0 +1 @@
|
||||||
|
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json
|
|
@@ -23,6 +23,7 @@ import { ReactComponent as MoonIcon } from '../images/icons/moon.svg'
 import { ReactComponent as ClipboardIcon } from '../images/icons/clipboard.svg'
 import { ReactComponent as NetworkIcon } from '../images/icons/network.svg'
 import { ReactComponent as DownloadIcon } from '../images/icons/download.svg'
+import { ReactComponent as PackageIcon } from '../images/icons/package.svg'

 import classes from '../styles/icon.module.sass'

@@ -49,6 +50,7 @@ const icons = {
     clipboard: ClipboardIcon,
     network: NetworkIcon,
     download: DownloadIcon,
+    package: PackageIcon,
 }

 export default function Icon({ name, width = 20, height, inline = false, variant, className }) {
@@ -5,8 +5,17 @@ import classNames from 'classnames'
 import Icon from './icon'
 import classes from '../styles/infobox.module.sass'

-export default function Infobox({ title, emoji, id, variant = 'default', className, children }) {
+export default function Infobox({
+    title,
+    emoji,
+    id,
+    variant = 'default',
+    list = false,
+    className,
+    children,
+}) {
     const infoboxClassNames = classNames(classes.root, className, {
+        [classes.list]: !!list,
         [classes.warning]: variant === 'warning',
         [classes.danger]: variant === 'danger',
     })
@@ -8,13 +8,21 @@ import Icon from './icon'
 import classes from '../styles/link.module.sass'
 import { isString } from './util'

-const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io)/gi
+const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io|explosion.ai|course.spacy.io)/gi

 const Whitespace = ({ children }) => (
     // Ensure that links are always wrapped in spaces
     <> {children} </>
 )

+function getIcon(dest) {
+    if (/(github.com)/.test(dest)) return 'code'
+    if (/^\/?api\/architectures#/.test(dest)) return 'network'
+    if (/^\/?api/.test(dest)) return 'docs'
+    if (/^\/?models\/(.+)/.test(dest)) return 'package'
+    return null
+}
+
 export default function Link({
     children,
     to,
@@ -30,22 +38,19 @@ export default function Link({
 }) {
     const dest = to || href
     const external = forceExternal || /(http(s?)):\/\//gi.test(dest)
-    const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest)
-    const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest)
-    const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest)
-    const withIcon = isApi || isArch || isSource
+    const icon = getIcon(dest)
+    const withIcon = !hidden && !hideIcon && !!icon
     const sourceWithText = withIcon && isString(children)
     const linkClassNames = classNames(classes.root, className, {
         [classes.hidden]: hidden,
-        [classes.nowrap]: (withIcon && !sourceWithText) || isArch,
+        [classes.nowrap]: (withIcon && !sourceWithText) || icon === 'network',
         [classes.withIcon]: withIcon,
     })
     const Wrapper = ws ? Whitespace : Fragment
-    const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null
     const content = (
         <>
             {sourceWithText ? <span className={classes.sourceText}>{children}</span> : children}
-            {icon && <Icon name={icon} width={16} inline className={classes.icon} />}
+            {withIcon && <Icon name={icon} width={16} inline className={classes.icon} />}
         </>
     )
@@ -15,24 +15,18 @@ function getNewChecked(optionId, checkedForId, multiple) {
     return [...checkedForId, optionId]
 }

-function getRawContent(ref) {
-    if (ref.current && ref.current.childNodes) {
-        // Select all currently visible nodes (spans and text nodes)
-        const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
-        return result.map(el => el.textContent).join('\n')
-    }
-    return ''
-}
-
 const Quickstart = ({
     data = [],
     title,
     description,
     copy = true,
     download,
+    rawContent = null,
     id = 'quickstart',
     setters = {},
     hidePrompts,
+    small,
+    codeLang,
     children,
 }) => {
     const contentRef = useRef()
@@ -46,6 +40,16 @@ const Quickstart = ({
     const [copySuccess, setCopySuccess] = useState(false)
     const [otherState, setOtherState] = useState({})
     const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
+    const getRawContent = ref => {
+        if (rawContent !== null) return rawContent
+        if (ref.current && ref.current.childNodes) {
+            // Select all currently visible nodes (spans and text nodes)
+            const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
+            return result.map(el => el.textContent).join('\n')
+        }
+        return ''
+    }

     const onClickCopy = () => {
         copyAreaRef.current.value = getRawContent(contentRef)
         copyToClipboard(copyAreaRef, setCopySuccess)
@@ -210,7 +214,14 @@ const Quickstart = ({
             }
         )}
         <pre className={classes.code}>
-            <code className={classes.results} data-quickstart-results="" ref={contentRef}>
+            <code
+                className={classNames(classes.results, {
+                    [classes.small]: !!small,
+                    [`language-${codeLang}`]: !!codeLang,
+                })}
+                data-quickstart-results=""
+                ref={contentRef}
+            >
                 {children}
             </code>
@@ -41,6 +41,6 @@ Search.propTypes = {
         apiKey: PropTypes.string.isRequired,
         indexName: PropTypes.string.isRequired,
     }).isRequired,
-    id: PropTypes.string.isRequired,
-    placeholder: PropTypes.string.isRequired,
+    id: PropTypes.string,
+    placeholder: PropTypes.string,
 }
website/src/images/icons/package.svg (new file)
@@ -0,0 +1,5 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+  <path fill="none" d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"></path>
+  <polyline fill="none" points="3.27 6.96 12 12.01 20.73 6.96"></polyline>
+  <line fill="none" x1="12" y1="22.08" x2="12" y2="12"></line>
+</svg>
@@ -14,6 +14,21 @@
     font-size: inherit
     line-height: inherit

+    ul li
+        padding-left: 0.75em
+
+.list ul li
+    font-size: var(--font-size-sm)
+    list-style: none
+    padding: 0
+    margin: 0 0 0.35rem 0
+
+    &:before
+        all: initial
+
+    a, a span
+        border-bottom: 0 !important
+
 .title
     font-weight: bold
     color: var(--color-theme)
@@ -124,6 +124,16 @@
     & > span
         display: block

+.small
+    font-size: var(--font-size-code)
+    line-height: 1.65
+    white-space: pre-wrap
+    max-height: 400px
+    overflow-y: auto
+
+    & > span
+        display: inline
+
 .hide-prompts .prompt:before
     content: initial !important
website/src/widgets/quickstart-training-generator.js (new file, 12 lines; diff not shown because one or more lines are too long)
@@ -1,13 +1,19 @@
 import React, { useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'
+import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'

-import { Quickstart, QS } from '../components/quickstart'
+import { Quickstart } from '../components/quickstart'
+import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
+import { isString, htmlToReact } from '../components/util'

 const DEFAULT_LANG = 'en'
+const DEFAULT_HARDWARE = 'gpu'
+const DEFAULT_OPT = 'efficiency'
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
-const COMMENT = `# This is an auto-generated partial config for training a model.
-# To use it for training, auto-fill it with all default values.
-# python -m spacy init config config.cfg --base base_config.cfg`
+const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg`

 const DATA = [
     {
         id: 'lang',
@@ -25,9 +31,8 @@ const DATA = [
         id: 'hardware',
         title: 'Hardware',
         options: [
-            { id: 'cpu-only', title: 'CPU only' },
-            { id: 'cpu', title: 'CPU preferred' },
-            { id: 'gpu', title: 'GPU', checked: true },
+            { id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
+            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
         ],
     },
     {
@@ -35,28 +40,45 @@ const DATA = [
         title: 'Optimize for',
         help: '...',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: true },
-            { id: 'accuracy', title: 'accuracy' },
+            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
+            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
         ],
     },
-    {
-        id: 'config',
-        title: 'Configuration',
-        options: [
-            {
-                id: 'independent',
-                title: 'independent components',
-                help: "Make components independent and don't share weights",
-            },
-        ],
-        multiple: true,
-    },
 ]

+function stringify(value) {
+    if (isString(value) && value.startsWith('${')) return value
+    const string = JSON.stringify(value)
+    if (Array.isArray(value)) return string.replace(/,/g, ', ')
+    return string
+}
+
 export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [pipeline, setPipeline] = useState([])
-    const setters = { lang: setLang, components: setPipeline }
+    const [components, setComponents] = useState([])
+    const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
+    const [[optimize], setOptimize] = useState([DEFAULT_OPT])
+    const setters = {
+        lang: setLang,
+        components: setComponents,
+        hardware: setHardware,
+        optimize: setOptimize,
+    }
+    const reco = GENERATOR_DATA[lang] || {}
+    const content = generator({
+        lang,
+        components,
+        optimize,
+        hardware,
+        transformer_data: reco.transformer,
+        word_vectors: reco.word_vectors,
+    })
+    const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
+    const rawContent = `${COMMENT}\n${rawStr}`
+    const displayContent = highlightCode('ini', rawContent)
+        .split('\n')
+        .map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
+        .join('\n')
     return (
         <StaticQuery
             query={query}
@@ -66,47 +88,19 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
                     id: code,
                     title: name,
                 }))
-                const recommendedTrf = Object.assign(
-                    {},
-                    ...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
-                )
                 return (
                     <Quickstart
                         download={download}
+                        rawContent={content}
                         data={DATA}
                         title={title}
                         id={id}
                         setters={setters}
                         hidePrompts
+                        small
+                        codeLang="ini"
                     >
-                        <QS comment>{COMMENT}</QS>
-                        <span>[paths]</span>
-                        <span>train = ""</span>
-                        <span>dev = ""</span>
-                        <br />
-                        <span>[nlp]</span>
-                        <span>lang = "{lang}"</span>
-                        <span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
-                        <br />
-                        <span>[components]</span>
-                        <br />
-                        <span>[components.transformer]</span>
-                        <QS optimize="efficiency">name = "{recommendedTrf[lang].sm}"</QS>
-                        <QS optimize="accuracy">name = "{recommendedTrf[lang].lg}"</QS>
-                        {!!pipeline.length && <br />}
-                        {pipeline.map((pipe, i) => (
-                            <>
-                                {i !== 0 && <br />}
-                                <span>[components.{pipe}]</span>
-                                <span>factory = "{pipe}"</span>
-                                <QS config="independent">
-                                    <br />
-                                    [components.parser.model.tok2vec]
-                                    <br />
-                                    @architectures = "spacy.Tok2Vec.v1"
-                                </QS>
-                            </>
-                        ))}
+                        {htmlToReact(displayContent)}
                     </Quickstart>
                 )
             }}