diff --git a/MANIFEST.in b/MANIFEST.in
index 6502ff607..ef42138f1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,9 @@
recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg
+recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
include LICENSE
include README.md
include pyproject.toml
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
+recursive-include spacy/cli *.json
recursive-include licenses *
diff --git a/pyproject.toml b/pyproject.toml
index d4aa25943..1b4972bd5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc>=8.0.0a23,<8.0.0a30",
+ "thinc>=8.0.0a27,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"
diff --git a/requirements.txt b/requirements.txt
index 4bb62742d..b4901a692 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a23,<8.0.0a30
+thinc>=8.0.0a27,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
@@ -26,3 +26,4 @@ pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
+jinja2
diff --git a/setup.cfg b/setup.cfg
index f9da1adb9..a34c34e23 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=8.0.0a23,<8.0.0a30
+ thinc>=8.0.0a27,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=8.0.0a23,<8.0.0a30
+ thinc>=8.0.0a27,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 73e828936..d07ee5674 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -14,7 +14,7 @@ from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
-from .util import registry # noqa: F401
+from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
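
The `logger` exposed here is a standard `logging.Logger`, so callers can tune spaCy's verbosity directly; the updated `train` CLI further below does exactly this in place of the removed `util.set_env_log`. A minimal sketch (not part of the diff):

```python
# Sketch: control spaCy's log output via the newly exported logger.
import logging
import spacy

spacy.logger.setLevel(logging.DEBUG)  # verbose, as `spacy train --verbose` now does
spacy.logger.setLevel(logging.ERROR)  # quiet default used by the train CLI
```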
diff --git a/spacy/about.py b/spacy/about.py
index eb4d2128c..5ed46bbe4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a6"
+__version__ = "3.0.0a7"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index bc47ffdef..2b21e2f2b 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,7 +15,7 @@ from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
-from .init_config import init_config # noqa: F401
+from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 93ec9f31e..5613fa317 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -179,13 +179,13 @@ def show_validation_error(
file_path: Optional[Union[str, Path]] = None,
*,
title: str = "Config validation error",
- hint_init: bool = True,
+ hint_fill: bool = True,
):
"""Helper to show custom config validation errors on the CLI.
file_path (str / Path): Optional file path of config file, used in hints.
title (str): Title of the custom formatted error.
- hint_init (bool): Show hint about filling config.
+ hint_fill (bool): Show hint about filling config.
"""
try:
yield
@@ -195,14 +195,14 @@ def show_validation_error(
# helper for this in Thinc
err_text = str(e).replace("Config validation error", "").strip()
print(err_text)
- if hint_init and "field required" in err_text:
+ if hint_fill and "field required" in err_text:
config_path = file_path if file_path is not None else "config.cfg"
msg.text(
"If your config contains missing values, you can run the 'init "
- "config' command to fill in all the defaults, if possible:",
+ "fill-config' command to fill in all the defaults, if possible:",
spaced=True,
)
- print(f"{COMMAND} init config {config_path} --base {config_path}\n")
+ print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
sys.exit(1)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 6c8c85e30..27cf033c4 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -5,7 +5,6 @@ import sys
import srsly
from wasabi import Printer, MESSAGES, msg, diff_strings
import typer
-from thinc.api import Config
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, get_sourced_components
@@ -49,11 +48,8 @@ def debug_config_cli(
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
- config = Config().from_disk(config_path, overrides=overrides)
- try:
- nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
- except ValueError as e:
- msg.fail(str(e), exits=1)
+ config = util.load_config(config_path, overrides=overrides)
+ nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
if auto_fill:
orig_config = config.to_str()
filled_config = nlp.config.to_str()
@@ -134,7 +130,7 @@ def debug_data(
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path):
- cfg = Config().from_disk(config_path, overrides=config_overrides)
+ cfg = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(cfg)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index cc6cb98ea..604a5676a 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,7 +1,7 @@
from typing import Dict, Any, Optional
from pathlib import Path
from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer
@@ -49,16 +49,12 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
- cfg = Config().from_disk(config_path, overrides=config_overrides)
- try:
- nlp, config = util.load_model_from_config(cfg)
- except ValueError as e:
- msg.fail(str(e), exits=1)
+ config = util.load_config(config_path, overrides=config_overrides)
+    nlp, config = util.load_model_from_config(config)
seed = config["pretraining"]["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
-
pipe = nlp.get_pipe(component)
if hasattr(pipe, "model"):
model = pipe.model
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index cf77fecfd..cf8f513fc 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -60,7 +60,6 @@ def evaluate(
fix_random_seed()
if use_gpu >= 0:
require_gpu(use_gpu)
- util.set_env_log(False)
data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output)
displacy_path = util.ensure_path(displacy_path)
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 01664ee40..7d80eb289 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -1,81 +1,178 @@
-from typing import Optional, List
+from typing import Optional, List, Tuple
+from enum import Enum
from pathlib import Path
+from wasabi import Printer, diff_strings
from thinc.api import Config
-from wasabi import msg
+from pydantic import BaseModel
+import srsly
+import re
-from ..util import load_model_from_config, get_lang_class, load_model
-from ._util import init_cli, Arg, Opt, show_validation_error
+from .. import util
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+
+
+TEMPLATE_ROOT = Path(__file__).parent / "templates"
+TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
+RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
+
+
+class Optimizations(str, Enum):
+ efficiency = "efficiency"
+ accuracy = "accuracy"
+
+
+class RecommendationsTrfItem(BaseModel):
+ name: str
+ size_factor: int
+
+
+class RecommendationsTrf(BaseModel):
+ efficiency: RecommendationsTrfItem
+ accuracy: RecommendationsTrfItem
+
+
+class RecommendationSchema(BaseModel):
+ word_vectors: Optional[str] = None
+ transformer: Optional[RecommendationsTrf] = None
@init_cli.command("config")
def init_config_cli(
# fmt: off
- output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
- base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
- model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
- lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
- pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
+ output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+ lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
+ pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
+ optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+ cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
# fmt: on
):
- """Generate a starter config.cfg for training."""
- validate_cli_args(base_path, model, lang)
- is_stdout = str(output_path) == "-"
- pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
- cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
- if is_stdout:
- print(cfg.to_str())
+ """
+ Generate a starter config.cfg for training. Based on your requirements
+ specified via the CLI arguments, this command generates a config with the
+    optimal settings for your use case. This includes the choice of architecture,
+ pretrained weights and related hyperparameters.
+ """
+ if isinstance(optimize, Optimizations): # instance of enum from the CLI
+ optimize = optimize.value
+ pipeline = [p.strip() for p in pipeline.split(",")]
+ init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
+
+
+@init_cli.command("fill-config")
+def init_fill_config_cli(
+ # fmt: off
+ base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
+ output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+ diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
+ # fmt: on
+):
+ """
+ Fill partial config.cfg with default values. Will add all missing settings
+ from the default config and will create all objects, check the registered
+ functions for their default values and update the base config. This command
+ can be used with a config generated via the training quickstart widget:
+ https://nightly.spacy.io/usage/training#quickstart
+ """
+ fill_config(output_file, base_path, diff=diff)
+
+
+def fill_config(
+ output_file: Path, base_path: Path, *, diff: bool = False
+) -> Tuple[Config, Config]:
+ is_stdout = str(output_file) == "-"
+ msg = Printer(no_print=is_stdout)
+ with show_validation_error(hint_fill=False):
+ config = util.load_config(base_path)
+ nlp, _ = util.load_model_from_config(config, auto_fill=True)
+ before = config.to_str()
+ after = nlp.config.to_str()
+ if before == after:
+ msg.warn("Nothing to auto-fill: base config is already complete")
else:
- cfg.to_disk(output_path)
- msg.good("Saved config", output_path)
+ msg.good("Auto-filled config with all values")
+ if diff and not is_stdout:
+ if before == after:
+ msg.warn("No diff to show: nothing was auto-filled")
+ else:
+ msg.divider("START CONFIG DIFF")
+ print("")
+ print(diff_strings(before, after))
+ msg.divider("END CONFIG DIFF")
+ print("")
+ save_config(nlp.config, output_file, is_stdout=is_stdout)
def init_config(
- output_path: Path,
- config_path: Optional[Path],
- model: Optional[str],
- lang: Optional[str],
- pipeline: Optional[List[str]],
- silent: bool = False,
-) -> Config:
- if config_path is not None:
- msg.info("Generating config from base config", show=not silent)
- with show_validation_error(config_path, hint_init=False):
- config = Config().from_disk(config_path)
- try:
- nlp, _ = load_model_from_config(config, auto_fill=True)
- except ValueError as e:
- msg.fail(str(e), exits=1)
- return nlp.config
- if model is not None:
- ext = f" with pipeline {pipeline}" if pipeline else ""
- msg.info(f"Generating config from model {model}{ext}", show=not silent)
- nlp = load_model(model)
- for existing_pipe_name in nlp.pipe_names:
- if existing_pipe_name not in pipeline:
- nlp.remove_pipe(existing_pipe_name)
- for pipe_name in pipeline:
- if pipe_name not in nlp.pipe_names:
- nlp.add_pipe(pipe_name)
- return nlp.config
- if lang is not None:
- ext = f" with pipeline {pipeline}" if pipeline else ""
- msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
- nlp = get_lang_class(lang)()
- for pipe_name in pipeline:
- nlp.add_pipe(pipe_name)
- return nlp.config
-
-
-def validate_cli_args(
- config_path: Optional[Path], model: Optional[str], lang: Optional[str]
+ output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
) -> None:
- args = {"--base": config_path, "--model": model, "--lang": lang}
- if sum(arg is not None for arg in args.values()) != 1:
- existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
+ is_stdout = str(output_file) == "-"
+ msg = Printer(no_print=is_stdout)
+ try:
+ from jinja2 import Template
+ except ImportError:
+ msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
+ recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
+ lang_defaults = util.get_lang_class(lang).Defaults
+ has_letters = lang_defaults.writing_system.get("has_letters", True)
+ # Filter out duplicates since tok2vec and transformer are added by template
+ pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
+ reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
+ with TEMPLATE_PATH.open("r") as f:
+ template = Template(f.read())
+ variables = {
+ "lang": lang,
+ "components": pipeline,
+ "optimize": optimize,
+ "hardware": "cpu" if cpu else "gpu",
+ "transformer_data": reco["transformer"],
+ "word_vectors": reco["word_vectors"],
+ "has_letters": has_letters,
+ }
+ base_template = template.render(variables).strip()
+ # Giving up on getting the newlines right in jinja for now
+ base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
+ # Access variables declared in templates
+ template_vars = template.make_module(variables)
+ use_case = {
+ "Language": lang,
+ "Pipeline": ", ".join(pipeline),
+ "Optimize for": optimize,
+ "Hardware": variables["hardware"].upper(),
+ "Transformer": template_vars.transformer.get("name", False),
+ }
+ msg.info("Generated template specific for your use case")
+ for label, value in use_case.items():
+ msg.text(f"- {label}: {value}")
+ use_transformer = bool(template_vars.use_transformer)
+ if use_transformer:
+ require_spacy_transformers(msg)
+ with show_validation_error(hint_fill=False):
+ config = util.load_config_from_str(base_template)
+ nlp, _ = util.load_model_from_config(config, auto_fill=True)
+ if use_transformer:
+ nlp.config.pop("pretraining", {}) # TODO: solve this better
+ msg.good("Auto-filled config with all values")
+ save_config(nlp.config, output_file, is_stdout=is_stdout)
+
+
+def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:
+ msg = Printer(no_print=is_stdout)
+ if is_stdout:
+ print(config.to_str())
+ else:
+ config.to_disk(output_file, interpolate=False)
+ msg.good("Saved config", output_file)
+ msg.text("You can now add your data and train your model:")
+ variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
+ print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
+
+
+def require_spacy_transformers(msg: Printer) -> None:
+ try:
+ import spacy_transformers # noqa: F401
+ except ImportError:
msg.fail(
- "The init config command expects only one of the following arguments: "
- "--base (base config to fill and update), --lang (language code to "
- "use for blank config) or --model (base model to copy config from).",
- f"Got: {existing if existing else 'no arguments'}",
+ "Using a transformer-based pipeline requires spacy-transformers "
+ "to be installed.",
exits=1,
)
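
Both quickstart commands are thin wrappers around importable functions, so the same behaviour is available from Python. A minimal sketch under that assumption (paths are placeholders, jinja2 must be installed):

```python
# Sketch: drive the new quickstart helpers from Python, mirroring
# `spacy init config` and `spacy init fill-config`.
from pathlib import Path
from spacy.cli.init_config import init_config, fill_config

# Starter config for an English tagger/parser/NER pipeline, optimized for
# efficiency on CPU (the CLI defaults).
init_config(
    Path("config.cfg"),
    lang="en",
    pipeline=["tagger", "parser", "ner"],
    optimize="efficiency",
    cpu=True,
)

# Fill a partial config (e.g. from the quickstart widget) with all defaults
# and show a diff of what was added.
fill_config(Path("config_filled.cfg"), Path("partial.cfg"), diff=True)
```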
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index ce0eb27a0..82950f402 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -5,7 +5,7 @@ import time
import re
from collections import Counter
from pathlib import Path
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
from wasabi import msg
@@ -88,7 +88,7 @@ def pretrain(
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
- config = Config().from_disk(config_path, overrides=config_overrides)
+ config = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(config)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
new file mode 100644
index 000000000..4f5a2226e
--- /dev/null
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -0,0 +1,237 @@
+{# This is a template for training configs used for the quickstart widget in
+the docs and the init config command. It encodes various best practices and
+can help generate the best possible configuration, given a user's requirements. #}
+{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
+{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
+[paths]
+train = ""
+dev = ""
+
+[system]
+use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+
+[nlp]
+lang = "{{ lang }}"
+{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
+pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+{# TRANSFORMER PIPELINE #}
+{%- if use_transformer -%}
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "{{ transformer["name"] }}"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "strided_spans.v1"
+window = 128
+stride = 96
+
+{% if "tagger" in components %}
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
+{% if "parser" in components -%}
+[components.parser]
+factory = "parser"
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.parser.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
+{% if "ner" in components -%}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 64
+maxout_pieces = 2
+use_upper = false
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.ner.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
+{# NON-TRANSFORMER PIPELINE #}
+{% else -%}
+
+{%- if hardware == "gpu" -%}
+# There are no recommended transformer weights available for language '{{ lang }}'
+# yet, so the pipeline described here is not transformer-based.
+{%- endif %}
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+rows = {{ 2000 if optimize == "efficiency" else 7000 }}
+also_embed_subwords = {{ true if has_letters else false }}
+also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = {{ 96 if optimize == "efficiency" else 256 }}
+depth = {{ 4 if optimize == "efficiency" else 8 }}
+window_size = 1
+maxout_pieces = 3
+
+{% if "tagger" in components %}
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "parser" in components -%}
+[components.parser]
+factory = "parser"
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = true
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "ner" in components %}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{% endif %}
+{% endif %}
+
+{% for pipe in components %}
+{% if pipe not in ["tagger", "parser", "ner"] %}
+{# Other components defined by the user: we just assume they're factories #}
+[components.{{ pipe }}]
+factory = "{{ pipe }}"
+{% endif %}
+{% endfor %}
+
+[training]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
+{% if use_transformer -%}
+accumulate_gradient = {{ transformer["size_factor"] }}
+{% endif %}
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.train_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+max_length = {{ 500 if hardware == "gpu" else 0 }}
+
+[training.dev_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:dev}
+max_length = 0
+
+{% if use_transformer %}
+[training.batcher]
+@batchers = "batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+{%- else %}
+[training.batcher]
+@batchers = "batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+{% endif %}
+
+[training.score_weights]
+{%- if "tagger" in components %}
+tag_acc = {{ (1.0 / components|length)|round(2) }}
+{%- endif -%}
+{%- if "parser" in components %}
+dep_uas = 0.0
+dep_las = {{ (1.0 / components|length)|round(2) }}
+sents_f = 0.0
+{%- endif %}
+{%- if "ner" in components %}
+ents_f = {{ (1.0 / components|length)|round(2) }}
+ents_p = 0.0
+ents_r = 0.0
+{%- endif -%}
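
The `init config` command renders this template with jinja2 before auto-filling it. A minimal sketch of that rendering step, assuming the template path added in this PR and a CPU-only English setup with no transformer recommendation:

```python
# Sketch: render the quickstart template the same way init_config() does.
from pathlib import Path
from jinja2 import Template

template_path = Path("spacy/cli/templates/quickstart_training.jinja")
template = Template(template_path.read_text())
variables = {
    "lang": "en",
    "components": ["ner"],
    "optimize": "efficiency",
    "hardware": "cpu",
    "transformer_data": None,        # no transformer recommendation for this run
    "word_vectors": "en_vectors_web_lg",
    "has_letters": True,
}
config_text = template.render(variables).strip()
print(config_text)  # [paths]/[nlp]/[components]/[training] sections for a CPU NER pipeline
```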
diff --git a/spacy/cli/templates/quickstart_training_recommendations.json b/spacy/cli/templates/quickstart_training_recommendations.json
new file mode 100644
index 000000000..8a3acc438
--- /dev/null
+++ b/spacy/cli/templates/quickstart_training_recommendations.json
@@ -0,0 +1,13 @@
+{
+ "en": {
+ "word_vectors": "en_vectors_web_lg",
+ "transformer": {
+ "efficiency": { "name": "roberta-base", "size_factor": 3 },
+ "accuracy": { "name": "roberta-base", "size_factor": 3 }
+ }
+ },
+ "de": {
+ "word_vectors": null,
+ "transformer": null
+ }
+}
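
These per-language entries are loaded with `srsly` and validated through the pydantic `RecommendationSchema` defined in `init_config.py`; languages without an entry fall back to an all-`None` recommendation. A short sketch:

```python
# Sketch: how init_config() consumes the recommendations file.
import srsly
from spacy.cli.init_config import RECOMMENDATIONS_PATH, RecommendationSchema

recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
en = RecommendationSchema(**recommendations.get("en", {})).dict()
fr = RecommendationSchema(**recommendations.get("fr", {})).dict()  # no entry yet
assert en["transformer"]["efficiency"]["name"] == "roberta-base"
assert fr == {"word_vectors": None, "transformer": None}
```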
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 32d22d1bc..375e64ffd 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -9,6 +9,7 @@ from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
from thinc.api import Config, Optimizer
import random
import typer
+import logging
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, get_sourced_components
@@ -17,7 +18,6 @@ from .. import util
from ..gold.example import Example
from ..errors import Errors
-
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
@@ -48,7 +48,7 @@ def train_cli(
used to register custom functions and architectures that can then be
referenced in the config.
"""
- util.set_env_log(verbose)
+ util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
verify_cli_args(config_path, output_path)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
@@ -75,7 +75,7 @@ def train(
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path):
- config = Config().from_disk(config_path, overrides=config_overrides)
+ config = util.load_config(config_path, overrides=config_overrides)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions
@@ -102,9 +102,9 @@ def train(
if resume_components:
with nlp.select_pipes(enable=resume_components):
msg.info(f"Resuming training for: {resume_components}")
- nlp.resume_training()
+ nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
- nlp.begin_training(lambda: train_corpus(nlp))
+ nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
if tag_map:
# Replace tag map with provided mapping
@@ -295,7 +295,11 @@ def train_while_improving(
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
# TODO: refactor this so we don't have to run it separately in here
for name, proc in nlp.pipeline:
- if name not in exclude and hasattr(proc, "model"):
+ if (
+ name not in exclude
+ and hasattr(proc, "model")
+ and proc.model not in (True, False, None)
+ ):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
diff --git a/spacy/errors.py b/spacy/errors.py
index 8e9a8d4b4..26c0dba29 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -55,12 +55,6 @@ class Warnings:
"loaded. (Shape: {shape})")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
- W022 = ("Training a new part-of-speech tagger using a model with no "
- "lemmatization rules or data. This means that the trained model "
- "may not be able to lemmatize correctly. If this is intentional "
- "or the language you're using doesn't have lemmatization data, "
- "you can ignore this warning. If this is surprising, make sure you "
- "have the spacy-lookups-data package installed.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses.")
@@ -482,6 +476,15 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
+ E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+ "Expected function that returns an iterable of Example objects but "
+ "got: {obj}")
+ E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
+ "'{name}'. If the component is trainable and you want to use this "
+ "method, make sure it's overwritten on the subclass. If your "
+ "component isn't trainable, add a method that does nothing or "
+ "don't use the Pipe base class.")
+ E940 = ("Found NaN values in scores.")
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
"model from a shortcut, which is deprecated as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
@@ -578,8 +581,7 @@ class Errors:
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue.")
- E978 = ("The '{method}' method of {name} takes a list of Example objects, "
- "but found {types} instead.")
+ E978 = ("The {name} method takes a list of Example objects, but got: {types}")
E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")
diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index 142c6b3a7..4d71eae09 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,5 +1,5 @@
from .corpus import Corpus # noqa: F401
-from .example import Example # noqa: F401
+from .example import Example, validate_examples # noqa: F401
from .align import Alignment # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 745d52e0e..774c3b840 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -62,7 +62,7 @@ class Corpus:
if str(path) in seen:
continue
seen.add(str(path))
- if path.parts[-1].startswith("."):
+ if path.parts and path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 6093d2346..3344704bf 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -1,5 +1,5 @@
+from collections.abc import Iterable as IterableInstance
import warnings
-
import numpy
from ..tokens.doc cimport Doc
@@ -26,6 +26,22 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
return output
+def validate_examples(examples, method):
+ """Check that a batch of examples received during processing is valid.
+ This function lives here to prevent circular imports.
+
+    examples (Iterable[Example]): A batch of examples.
+ method (str): The method name to show in error messages.
+ """
+ if not isinstance(examples, IterableInstance):
+ err = Errors.E978.format(name=method, types=type(examples))
+ raise TypeError(err)
+ wrong = set([type(eg) for eg in examples if not isinstance(eg, Example)])
+ if wrong:
+ err = Errors.E978.format(name=method, types=wrong)
+ raise TypeError(err)
+
+
cdef class Example:
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
if predicted is None:
@@ -263,12 +279,10 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.morphology.add(v) for v in value])
else:
attrs.append(key)
- try:
- values.append([vocab.strings.add(v) for v in value])
- except TypeError:
- types= set([type(v) for v in value])
+ if not all(isinstance(v, str) for v in value):
+ types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
-
+ values.append([vocab.strings.add(v) for v in value])
array = numpy.asarray(values, dtype="uint64")
return attrs, array.T
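
`validate_examples` centralizes the `E978` checks that were previously duplicated across `Language` and the pipeline components (see the changes below). A minimal sketch of its behaviour:

```python
# Sketch: shared validation for batches of Example objects.
import spacy
from spacy.gold import Example, validate_examples

nlp = spacy.blank("en")
doc = nlp.make_doc("Berlin is a city")
example = Example.from_dict(doc, {"entities": ["U-LOC", "O", "O", "O"]})

validate_examples([example], "Language.update")  # valid batch, returns None
try:
    validate_examples(["not an Example"], "Language.update")
except TypeError as err:
    print(err)  # E978: The Language.update method takes a list of Example objects ...
```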
diff --git a/spacy/language.py b/spacy/language.py
index 85aac15ef..b67c55e3b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -5,7 +5,6 @@ import random
import itertools
import weakref
import functools
-from collections import Iterable as IterableInstance
from contextlib import contextmanager
from copy import copy, deepcopy
from pathlib import Path
@@ -19,10 +18,10 @@ from timeit import default_timer as timer
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .gold import Example
+from .gold import Example, validate_examples
from .scorer import Scorer
from .util import create_default_optimizer, registry
-from .util import SimpleFrozenDict, combine_score_weights
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@@ -37,7 +36,7 @@ from . import about
# This is the base config will all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
-DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
+DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
class BaseDefaults:
@@ -46,7 +45,7 @@ class BaseDefaults:
Language.Defaults.
"""
- config: Config = Config()
+ config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
@@ -135,7 +134,7 @@ class Language:
# of the rest.
util.registry._entry_point_factories.get_all()
- self._config = util.deep_merge_configs(self.default_config, DEFAULT_CONFIG)
+ self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
self._path = None
self._optimizer = None
@@ -168,9 +167,7 @@ class Language:
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
- cls.default_config = util.deep_merge_configs(
- cls.Defaults.config, DEFAULT_CONFIG
- )
+ cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
cls.default_config["nlp"]["lang"] = cls.lang
@property
@@ -533,6 +530,7 @@ class Language:
name: Optional[str] = None,
*,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+ raw_config: Optional[Config] = None,
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Create a pipeline component. Mostly used internally. To create and
@@ -543,6 +541,7 @@ class Language:
Defaults to factory name if not set.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
+ raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -569,7 +568,7 @@ class Language:
# This is unideal, but the alternative would mean you always need to
# specify the full config settings, which is not really viable.
if pipe_meta.default_config:
- config = util.deep_merge_configs(config, pipe_meta.default_config)
+ config = Config(pipe_meta.default_config).merge(config)
# We need to create a top-level key because Thinc doesn't allow resolving
# top-level references to registered functions. Also gives nicer errors.
# The name allows components to know their pipe name and use it in the
@@ -583,12 +582,14 @@ class Language:
cfg = {factory_name: config}
# We're calling the internal _fill here to avoid constructing the
# registered functions twice
- # TODO: customize validation to make it more readable / relate it to
- # pipeline component and why it failed, explain default config
resolved, filled = registry.resolve(cfg, validate=validate)
- filled = filled[factory_name]
+ filled = Config(filled[factory_name])
filled["factory"] = factory_name
filled.pop("@factories", None)
+ # Merge the final filled config with the raw config (including non-
+ # interpolated variables)
+ if raw_config:
+ filled = filled.merge(raw_config)
self._pipe_configs[name] = filled
return resolved[factory_name]
@@ -614,7 +615,10 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
- pipe_config = util.copy_config(source.config["components"][source_name])
+ # Make sure the source config is interpolated so we don't end up with
+ # orphaned variables in our final config
+ source_config = source.config.interpolate()
+ pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
return pipe, pipe_config["factory"]
@@ -629,6 +633,7 @@ class Language:
last: Optional[bool] = None,
source: Optional["Language"] = None,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
+ raw_config: Optional[Config] = None,
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Add a component to the processing pipeline. Valid components are
@@ -650,6 +655,7 @@ class Language:
component from.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
+ raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -679,7 +685,11 @@ class Language:
lang_code=self.lang,
)
pipe_component = self.create_pipe(
- factory_name, name=name, config=config, validate=validate,
+ factory_name,
+ name=name,
+ config=config,
+ raw_config=raw_config,
+ validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
@@ -935,17 +945,7 @@ class Language:
losses = {}
if len(examples) == 0:
return losses
- if not isinstance(examples, IterableInstance):
- raise TypeError(
- Errors.E978.format(
- name="language", method="update", types=type(examples)
- )
- )
- wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
- if wrong_types:
- raise TypeError(
- Errors.E978.format(name="language", method="update", types=wrong_types)
- )
+ validate_examples(examples, "Language.update")
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer()
@@ -962,7 +962,11 @@ class Language:
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd not in (None, False):
for name, proc in self.pipeline:
- if name not in exclude and hasattr(proc, "model"):
+ if (
+ name not in exclude
+ and hasattr(proc, "model")
+ and proc.model not in (True, False, None)
+ ):
proc.model.finish_update(sgd)
return losses
@@ -999,19 +1003,7 @@ class Language:
"""
if len(examples) == 0:
return
- if not isinstance(examples, IterableInstance):
- raise TypeError(
- Errors.E978.format(
- name="language", method="rehearse", types=type(examples)
- )
- )
- wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
- if wrong_types:
- raise TypeError(
- Errors.E978.format(
- name="language", method="rehearse", types=wrong_types
- )
- )
+ validate_examples(examples, "Language.rehearse")
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer()
@@ -1060,7 +1052,15 @@ class Language:
if get_examples is None:
get_examples = lambda: []
else: # Populate vocab
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="Language", obj=type(get_examples))
+ raise ValueError(err)
for example in get_examples():
+ if not isinstance(example, Example):
+ err = Errors.E978.format(
+ name="Language.begin_training", types=type(example)
+ )
+ raise ValueError(err)
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
if device >= 0: # TODO: do we need this here?
@@ -1133,17 +1133,7 @@ class Language:
DOCS: https://spacy.io/api/language#evaluate
"""
- if not isinstance(examples, IterableInstance):
- err = Errors.E978.format(
- name="language", method="evaluate", types=type(examples)
- )
- raise TypeError(err)
- wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
- if wrong_types:
- err = Errors.E978.format(
- name="language", method="evaluate", types=wrong_types
- )
- raise TypeError(err)
+ validate_examples(examples, "Language.evaluate")
if component_cfg is None:
component_cfg = {}
if scorer_cfg is None:
@@ -1400,7 +1390,9 @@ class Language:
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
- config = util.deep_merge_configs(config, cls.default_config)
+ config = Config(
+ cls.default_config, section_order=CONFIG_SECTION_ORDER
+ ).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"]
@@ -1438,16 +1430,20 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
+ # Note that we don't load vectors here, instead they get loaded explicitly
+ # inside stuff like the spacy train function. If we loaded them here,
+ # then we would load them twice at runtime: once when we make from config,
+ # and then again when we load from disk.
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
- # Note that we don't load vectors here, instead they get loaded explicitly
- # inside stuff like the spacy train function. If we loaded them here,
- # then we would load them twice at runtime: once when we make from config,
- # and then again when we load from disk.
- pipeline = config.get("components", {})
+ # To create the components we need to use the final interpolated config
+ # so all values are available (if component configs use variables).
+ # Later we replace the component config with the raw config again.
+ interpolated = filled.interpolate() if not filled.is_interpolated else filled
+ pipeline = interpolated.get("components", {})
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
@@ -1456,6 +1452,7 @@ class Language:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name])
+ raw_config = Config(filled["components"][pipe_name])
if pipe_name not in disable:
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
@@ -1465,7 +1462,11 @@ class Language:
# The pipe name (key in the config) here is the unique name
# of the component, not necessarily the factory
nlp.add_pipe(
- factory, name=pipe_name, config=pipe_cfg, validate=validate,
+ factory,
+ name=pipe_name,
+ config=pipe_cfg,
+ validate=validate,
+ raw_config=raw_config,
)
else:
model = pipe_cfg["source"]
@@ -1663,7 +1664,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None:
else:
raise ValueError(Errors.E092)
for name, proc in nlp.pipeline:
- if not hasattr(proc, "cfg"):
+ if not hasattr(proc, "cfg") or not isinstance(proc.cfg, dict):
continue
proc.cfg.setdefault("deprecation_fixes", {})
proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 801229af5..76f58df58 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -9,6 +9,7 @@ from .functions import merge_subtokens
from ..language import Language
from ._parser_internals import nonproj
from ..scorer import Scorer
+from ..gold import validate_examples
default_model_config = """
@@ -147,6 +148,7 @@ cdef class DependencyParser(Parser):
DOCS: https://spacy.io/api/dependencyparser#score
"""
+ validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 080273f57..35bf2906e 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -11,7 +11,7 @@ from ..tokens import Doc
from .pipe import Pipe, deserialize_config
from ..language import Language
from ..vocab import Vocab
-from ..gold import Example
+from ..gold import Example, validate_examples
from ..errors import Errors, Warnings
from .. import util
@@ -142,7 +142,7 @@ class EntityLinker(Pipe):
def begin_training(
self,
- get_examples: Callable[[], Iterable[Example]] = lambda: [],
+ get_examples: Callable[[], Iterable[Example]],
*,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
@@ -197,14 +197,9 @@ class EntityLinker(Pipe):
losses.setdefault(self.name, 0.0)
if not examples:
return losses
+ validate_examples(examples, "EntityLinker.update")
sentence_docs = []
- try:
- docs = [eg.predicted for eg in examples]
- except AttributeError:
- types = set([type(eg) for eg in examples])
- raise TypeError(
- Errors.E978.format(name="EntityLinker", method="update", types=types)
- ) from None
+ docs = [eg.predicted for eg in examples]
if set_annotations:
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(
@@ -250,6 +245,7 @@ class EntityLinker(Pipe):
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings):
+ validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index bef97ec46..4f4e0fdd5 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -9,6 +9,7 @@ from ..util import ensure_path, to_disk, from_disk
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import Scorer
+from ..gold import validate_examples
DEFAULT_ENT_ID_SEP = "||"
@@ -312,6 +313,7 @@ class EntityRuler:
return label
def score(self, examples, **kwargs):
+ validate_examples(examples, "EntityRuler.score")
return Scorer.score_spans(examples, "ents", **kwargs)
def from_bytes(
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index f2028772f..6cea65fec 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,5 +1,4 @@
from typing import Optional, List, Dict, Any
-
from thinc.api import Model
from .pipe import Pipe
@@ -9,6 +8,7 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
+from ..gold import validate_examples
from .. import util
@@ -127,6 +127,7 @@ class Lemmatizer(Pipe):
"""
self.vocab = vocab
self.model = model
+ self.name = name
self._mode = mode
self.lookups = lookups if lookups is not None else Lookups()
self.overwrite = overwrite
@@ -135,10 +136,10 @@ class Lemmatizer(Pipe):
elif self.mode == "rule":
self.lemmatize = self.rule_lemmatize
else:
- try:
- self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
- except AttributeError:
+ mode_attr = f"{self.mode}_lemmatize"
+ if not hasattr(self, mode_attr):
raise ValueError(Errors.E1003.format(mode=mode))
+ self.lemmatize = getattr(self, mode_attr)
self.cache = {}
@property
@@ -271,6 +272,7 @@ class Lemmatizer(Pipe):
DOCS: https://spacy.io/api/lemmatizer#score
"""
+ validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(self, path, *, exclude=tuple()):
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index efc494181..329a05f90 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -6,15 +6,16 @@ from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
+
from ..parts_of_speech import IDS as POS_IDS
from ..symbols import POS
-
from ..language import Language
from ..errors import Errors
from .pipe import deserialize_config
from .tagger import Tagger
from .. import util
from ..scorer import Scorer
+from ..gold import validate_examples
default_model_config = """
@@ -126,7 +127,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
- def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+ def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
@@ -140,6 +141,9 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#begin_training
"""
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
+ raise ValueError(err)
for example in get_examples():
for i, token in enumerate(example.reference):
pos = token.pos_
@@ -192,6 +196,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#get_loss
"""
+ validate_examples(examples, "Morphologizer.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = []
for eg in examples:
@@ -228,6 +233,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#score
"""
+ validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 84ed19b0d..3ef85c821 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -8,6 +8,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe
from .tagger import Tagger
+from ..gold import validate_examples
from ..language import Language
from ._parser_internals import nonproj
from ..attrs import POS, ID
@@ -80,10 +81,11 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
- def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
- gold_examples = nonproj.preprocess_training_data(get_examples())
- # for raw_text, doc_annot in gold_tuples:
- for example in gold_examples:
+ def begin_training(self, get_examples, pipeline=None, sgd=None):
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
+ raise ValueError(err)
+ for example in get_examples():
for token in example.y:
label = self.make_label(token)
if label is not None and label not in self.labels:
@@ -175,7 +177,7 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
- def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
+ def begin_training(self, get_examples, pipeline=None, sgd=None):
self.model.initialize()
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
@@ -189,6 +191,7 @@ class ClozeMultitask(Pipe):
return tokvecs, vectors
def get_loss(self, examples, vectors, prediction):
+ validate_examples(examples, "ClozeMultitask.get_loss")
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
@@ -206,18 +209,16 @@ class ClozeMultitask(Pipe):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
set_dropout_rate(self.model, drop)
- try:
- predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
- except AttributeError:
- types = set([type(eg) for eg in examples])
- raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None
+ validate_examples(examples, "ClozeMultitask.rehearse")
+ docs = [eg.predicted for eg in examples]
+        predictions, bp_predictions = self.model.begin_update(docs)
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
if sgd is not None:
self.model.finish_update(sgd)
-
if losses is not None:
losses[self.name] += loss
+ return losses
def add_label(self, label):
raise NotImplementedError
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index a3bc3d920..631b5ae72 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -7,6 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
from ..scorer import Scorer
+from ..gold import validate_examples
default_model_config = """
@@ -50,7 +51,7 @@ def make_ner(
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
-
+
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
@@ -120,4 +121,5 @@ cdef class EntityRecognizer(Parser):
DOCS: https://spacy.io/api/entityrecognizer#score
"""
+ validate_examples(examples, "EntityRecognizer.score")
return Scorer.score_spans(examples, "ents", **kwargs)
diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd
index bb97f79d0..bca94d528 100644
--- a/spacy/pipeline/pipe.pxd
+++ b/spacy/pipeline/pipe.pxd
@@ -1,2 +1,5 @@
cdef class Pipe:
+ cdef public object vocab
+ cdef public object model
cdef public str name
+ cdef public object cfg
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 96a8b5944..51251dacc 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,9 +1,10 @@
# cython: infer_types=True, profile=True
import srsly
+from thinc.api import set_dropout_rate, Model
from ..tokens.doc cimport Doc
-from ..util import create_default_optimizer
+from ..gold import validate_examples
from ..errors import Errors
from .. import util
@@ -16,7 +17,6 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe
"""
-
def __init__(self, vocab, model, name, **cfg):
"""Initialize a pipeline component.
@@ -27,7 +27,10 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#init
"""
- raise NotImplementedError
+ self.vocab = vocab
+ self.model = model
+ self.name = name
+ self.cfg = dict(cfg)
def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is modified in place,
@@ -68,7 +71,7 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#predict
"""
- raise NotImplementedError
+ raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
def set_annotations(self, docs, scores):
"""Modify a batch of documents, using pre-computed scores.
@@ -78,7 +81,43 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#set_annotations
"""
- raise NotImplementedError
+ raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
+
+ def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
+ """Learn from a batch of documents and gold-standard information,
+ updating the pipe's model. Delegates to predict and get_loss.
+
+ examples (Iterable[Example]): A batch of Example objects.
+ drop (float): The dropout rate.
+ set_annotations (bool): Whether or not to update the Example objects
+ with the predictions.
+ sgd (thinc.api.Optimizer): The optimizer.
+ losses (Dict[str, float]): Optional record of the loss during training.
+ Updated using the component name as the key.
+ RETURNS (Dict[str, float]): The updated losses dictionary.
+
+ DOCS: https://spacy.io/api/pipe#update
+ """
+ if losses is None:
+ losses = {}
+ if not hasattr(self, "model") or self.model in (None, True, False):
+ return losses
+ losses.setdefault(self.name, 0.0)
+ validate_examples(examples, "Pipe.update")
+ if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+ # Handle cases where there are no tokens in any docs.
+            return losses
+ set_dropout_rate(self.model, drop)
+ scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
+ loss, d_scores = self.get_loss(examples, scores)
+ bp_scores(d_scores)
+ if sgd not in (None, False):
+ self.model.finish_update(sgd)
+ losses[self.name] += loss
+ if set_annotations:
+ docs = [eg.predicted for eg in examples]
+ self.set_annotations(docs, scores=scores)
+ return losses
def rehearse(self, examples, *, sgd=None, losses=None, **config):
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
@@ -107,7 +146,7 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#get_loss
"""
- raise NotImplementedError
+ raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
def add_label(self, label):
"""Add an output label, to be predicted by the model. It's possible to
@@ -119,7 +158,7 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#add_label
"""
- raise NotImplementedError
+ raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
def create_optimizer(self):
"""Create an optimizer for the pipeline component.
@@ -128,9 +167,9 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#create_optimizer
"""
- return create_default_optimizer()
+ return util.create_default_optimizer()
- def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+ def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
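The base `Pipe` class now stores `vocab`, `model`, `name` and `cfg` and ships a generic `update` that delegates to `begin_update` and `get_loss`, with `validate_examples` centralizing the input checks that the per-component `try`/`except` blocks used to do. A minimal sketch (illustrative names, not part of this diff) of a custom component that leans on those defaults:

```python
from spacy.pipeline.pipe import Pipe


class RelationScorer(Pipe):
    # __init__ is inherited: it stores vocab, model, name and cfg.

    def predict(self, docs):
        # Apply the model to a batch of Doc objects and return the raw scores.
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        # Write the predictions back onto the docs (left abstract in this sketch).
        ...

    def get_loss(self, examples, scores):
        # Return (loss, d_scores) so the inherited update() can backpropagate.
        ...
```

Because `update` is now generic, a component like this gets dropout handling, loss bookkeeping and the optional `set_annotations` pass for free, and only has to implement the hooks that still raise `NotImplementedError` above.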
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index be4351212..46d599497 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -7,6 +7,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe
from ..language import Language
from ..scorer import Scorer
+from ..gold import validate_examples
from .. import util
@@ -58,7 +59,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
- def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
+ def begin_training(self, get_examples, pipeline=None, sgd=None):
pass
def __call__(self, doc):
@@ -158,6 +159,7 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#score
"""
+ validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
del results["sents_per_type"]
return results
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index cf7479c29..e82225d27 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -9,6 +9,7 @@ from .tagger import Tagger
from ..language import Language
from ..errors import Errors
from ..scorer import Scorer
+from ..gold import validate_examples
from .. import util
@@ -102,6 +103,7 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
"""
+ validate_examples(examples, "SentenceRecognizer.get_loss")
labels = self.labels
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
truths = []
@@ -121,7 +123,7 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
- def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+ def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
@@ -151,6 +153,7 @@ class SentenceRecognizer(Tagger):
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
+ validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
del results["sents_per_type"]
return results
diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py
index 4965b2e13..5f3addbd7 100644
--- a/spacy/pipeline/simple_ner.py
+++ b/spacy/pipeline/simple_ner.py
@@ -1,4 +1,4 @@
-from typing import List, Iterable, Optional, Dict, Tuple, Callable
+from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
from thinc.api import Optimizer, Config
@@ -6,6 +6,7 @@ from thinc.util import to_numpy
from ..errors import Errors
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
+from ..gold import validate_examples
from ..tokens import Doc
from ..language import Language
from ..vocab import Vocab
@@ -127,6 +128,7 @@ class SimpleNER(Pipe):
if losses is None:
losses = {}
losses.setdefault("ner", 0.0)
+ validate_examples(examples, "SimpleNER.update")
if not any(_has_ner(eg) for eg in examples):
return losses
docs = [eg.predicted for eg in examples]
@@ -142,6 +144,7 @@ class SimpleNER(Pipe):
return losses
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
+ validate_examples(examples, "SimpleNER.get_loss")
truths = []
for eg in examples:
tags = eg.get_aligned_ner()
@@ -161,14 +164,17 @@ class SimpleNER(Pipe):
def begin_training(
self,
- get_examples: Callable,
+ get_examples: Callable[[], Iterable[Example]],
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
):
+ all_labels = set()
if not hasattr(get_examples, "__call__"):
- gold_tuples = get_examples
- get_examples = lambda: gold_tuples
- for label in _get_labels(get_examples()):
+ err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
+ raise ValueError(err)
+ for example in get_examples():
+ all_labels.update(_get_labels(example))
+ for label in sorted(all_labels):
self.add_label(label)
labels = self.labels
n_actions = self.model.attrs["get_num_actions"](len(labels))
@@ -185,6 +191,7 @@ class SimpleNER(Pipe):
pass
def score(self, examples, **kwargs):
+ validate_examples(examples, "SimpleNER.score")
return Scorer.score_spans(examples, "ents", **kwargs)
@@ -196,10 +203,9 @@ def _has_ner(example: Example) -> bool:
return False
-def _get_labels(examples: List[Example]) -> List[str]:
+def _get_labels(example: Example) -> Set[str]:
labels = set()
- for eg in examples:
- for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
- if ner_tag != "O" and ner_tag != "-":
- labels.add(ner_tag)
- return list(sorted(labels))
+ for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
+ if ner_tag != "O" and ner_tag != "-":
+ labels.add(ner_tag)
+ return labels
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 937290d5f..9070329e8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -16,6 +16,7 @@ from ..attrs import POS, ID
from ..parts_of_speech import X
from ..errors import Errors, TempErrors, Warnings
from ..scorer import Scorer
+from ..gold import validate_examples
from .. import util
@@ -187,19 +188,15 @@ class Tagger(Pipe):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
- try:
- if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
- # Handle cases where there are no tokens in any docs.
- return
- except AttributeError:
- types = set([type(eg) for eg in examples])
- raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None
+ validate_examples(examples, "Tagger.update")
+ if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+ # Handle cases where there are no tokens in any docs.
+ return
set_dropout_rate(self.model, drop)
- tag_scores, bp_tag_scores = self.model.begin_update(
- [eg.predicted for eg in examples])
+ tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
for sc in tag_scores:
if self.model.ops.xp.isnan(sc.sum()):
- raise ValueError("nan value in scores")
+ raise ValueError(Errors.E940)
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores)
if sgd not in (None, False):
@@ -226,11 +223,8 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#rehearse
"""
- try:
- docs = [eg.predicted for eg in examples]
- except AttributeError:
- types = set([type(eg) for eg in examples])
- raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None
+ validate_examples(examples, "Tagger.rehearse")
+ docs = [eg.predicted for eg in examples]
if self._rehearsal_model is None:
return
if not any(len(doc) for doc in docs):
@@ -256,6 +250,7 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#get_loss
"""
+ validate_examples(examples, "Tagger.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
d_scores, loss = loss_func(scores, truths)
@@ -263,7 +258,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
- def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+ def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
@@ -277,13 +272,12 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#begin_training
"""
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="Tagger", obj=type(get_examples))
+ raise ValueError(err)
tags = set()
for example in get_examples():
- try:
- y = example.y
- except AttributeError:
- raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
- for token in y:
+ for token in example.y:
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
@@ -318,6 +312,7 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger#score
"""
+ validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
def to_bytes(self, *, exclude=tuple()):
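Like the other trainable components in this diff, `Tagger.begin_training` now requires `get_examples` to be a callable and raises E930 otherwise; the old fallback of wrapping a raw list is gone. A hedged sketch of the new calling convention, with illustrative training data:

```python
import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
doc = nlp.make_doc("I like cats")
train_examples = [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]

# New convention: pass a callable that returns an iterable of Example objects.
optimizer = tagger.begin_training(lambda: train_examples)

# Old convention, now a ValueError (E930):
# tagger.begin_training(train_examples)
```

The test updates further down switch to the same `lambda: []` pattern.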
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 7b9cc1e24..ce4f286e5 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -5,7 +5,7 @@ import numpy
from .pipe import Pipe
from ..language import Language
-from ..gold import Example
+from ..gold import Example, validate_examples
from ..errors import Errors
from ..scorer import Scorer
from .. import util
@@ -209,15 +209,10 @@ class TextCategorizer(Pipe):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
- try:
- if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
- # Handle cases where there are no tokens in any docs.
- return losses
- except AttributeError:
- types = set([type(eg) for eg in examples])
- raise TypeError(
- Errors.E978.format(name="TextCategorizer", method="update", types=types)
- ) from None
+ validate_examples(examples, "TextCategorizer.update")
+ if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+ # Handle cases where there are no tokens in any docs.
+ return losses
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_loss(examples, scores)
@@ -252,19 +247,12 @@ class TextCategorizer(Pipe):
DOCS: https://spacy.io/api/textcategorizer#rehearse
"""
-
if losses is not None:
losses.setdefault(self.name, 0.0)
if self._rehearsal_model is None:
return losses
- try:
- docs = [eg.predicted for eg in examples]
- except AttributeError:
- types = set([type(eg) for eg in examples])
- err = Errors.E978.format(
- name="TextCategorizer", method="rehearse", types=types
- )
- raise TypeError(err) from None
+ validate_examples(examples, "TextCategorizer.rehearse")
+ docs = [eg.predicted for eg in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return losses
@@ -303,6 +291,7 @@ class TextCategorizer(Pipe):
DOCS: https://spacy.io/api/textcategorizer#get_loss
"""
+ validate_examples(examples, "TextCategorizer.get_loss")
truths, not_missing = self._examples_to_truth(examples)
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores - truths) / scores.shape[0]
@@ -338,7 +327,7 @@ class TextCategorizer(Pipe):
def begin_training(
self,
- get_examples: Callable[[], Iterable[Example]] = lambda: [],
+ get_examples: Callable[[], Iterable[Example]],
*,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
@@ -356,21 +345,20 @@ class TextCategorizer(Pipe):
DOCS: https://spacy.io/api/textcategorizer#begin_training
"""
- # TODO: begin_training is not guaranteed to see all data / labels ?
- examples = list(get_examples())
- for example in examples:
- try:
- y = example.y
- except AttributeError:
- err = Errors.E978.format(
- name="TextCategorizer", method="update", types=type(example)
- )
- raise TypeError(err) from None
- for cat in y.cats:
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
+ raise ValueError(err)
+ subbatch = [] # Select a subbatch of examples to initialize the model
+ for example in get_examples():
+ if len(subbatch) < 2:
+ subbatch.append(example)
+ for cat in example.y.cats:
self.add_label(cat)
self.require_labels()
- docs = [Doc(self.vocab, words=["hello"])]
- truths, _ = self._examples_to_truth(examples)
+ docs = [eg.reference for eg in subbatch]
+ if not docs: # need at least one doc
+ docs = [Doc(self.vocab, words=["hello"])]
+ truths, _ = self._examples_to_truth(subbatch)
self.set_output(len(self.labels))
self.model.initialize(X=docs, Y=truths)
if sgd is None:
@@ -392,6 +380,7 @@ class TextCategorizer(Pipe):
DOCS: https://spacy.io/api/textcategorizer#score
"""
+ validate_examples(examples, "TextCategorizer.score")
return Scorer.score_cats(
examples,
"cats",
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index c9f0a99e9..f2d138cf7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -2,7 +2,7 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List,
from thinc.api import Model, set_dropout_rate, Optimizer, Config
from .pipe import Pipe
-from ..gold import Example
+from ..gold import Example, validate_examples
from ..tokens import Doc
from ..vocab import Vocab
from ..language import Language
@@ -166,9 +166,8 @@ class Tok2Vec(Pipe):
"""
if losses is None:
losses = {}
+ validate_examples(examples, "Tok2Vec.update")
docs = [eg.predicted for eg in examples]
- if isinstance(docs, Doc):
- docs = [docs]
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
@@ -194,7 +193,8 @@ class Tok2Vec(Pipe):
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
- self.listeners[-1].receive(batch_id, tokvecs, backprop)
+ if self.listeners:
+ self.listeners[-1].receive(batch_id, tokvecs, backprop)
if set_annotations:
self.set_annotations(docs, tokvecs)
return losses
@@ -204,7 +204,7 @@ class Tok2Vec(Pipe):
def begin_training(
self,
- get_examples: Callable[[], Iterable[Example]] = lambda: [],
+ get_examples: Callable[[], Iterable[Example]],
*,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
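The reworked `TextCategorizer.begin_training` above streams examples instead of materializing the whole dataset: labels are added from each `example.y.cats`, and the model is initialized on a small subbatch of reference docs, falling back to a dummy doc if the stream is empty. `Tok2Vec.begin_training` gets the same no-default signature. A hedged usage sketch with illustrative data:

```python
import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
train_examples = [
    Example.from_dict(nlp.make_doc("so happy"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("so sad"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
# POSITIVE/NEGATIVE are discovered while streaming the examples; only a
# couple of them are used to initialize the model.
optimizer = textcat.begin_training(lambda: train_examples)
```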
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index e594a3098..67bc01f97 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -8,11 +8,8 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
cdef class Parser(Pipe):
- cdef readonly Vocab vocab
- cdef public object model
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
- cdef readonly object cfg
cdef public object _multitasks
cdef void _parseC(self, StateC** states,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9829e764d..2eadfa6aa 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -8,22 +8,21 @@ from libc.string cimport memset
from libc.stdlib cimport calloc, free
import srsly
+from thinc.api import set_dropout_rate
+import numpy.random
+import numpy
+import warnings
from ._parser_internals.stateclass cimport StateClass
from ..ml.parser_model cimport alloc_activations, free_activations
from ..ml.parser_model cimport predict_states, arg_max_if_valid
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes
-
from ..tokens.doc cimport Doc
+
+from ..gold import validate_examples
from ..errors import Errors, Warnings
from .. import util
-from ..util import create_default_optimizer
-
-from thinc.api import set_dropout_rate
-import numpy.random
-import numpy
-import warnings
cdef class Parser(Pipe):
@@ -266,6 +265,7 @@ cdef class Parser(Pipe):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.)
+ validate_examples(examples, "Parser.update")
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
@@ -329,7 +329,7 @@ cdef class Parser(Pipe):
if self._rehearsal_model is None:
return None
losses.setdefault(self.name, 0.)
-
+ validate_examples(examples, "Parser.rehearse")
docs = [eg.predicted for eg in examples]
states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
@@ -398,21 +398,18 @@ cdef class Parser(Pipe):
losses[self.name] += (d_scores**2).sum()
return d_scores
- def create_optimizer(self):
- return create_default_optimizer()
-
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+ if not hasattr(get_examples, "__call__"):
+ err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
+ raise ValueError(err)
self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
langs = ", ".join(util.LEXEME_NORM_LANGS)
- warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
- if not hasattr(get_examples, '__call__'):
- gold_tuples = get_examples
- get_examples = lambda: gold_tuples
+ util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
actions = self.moves.get_actions(
examples=get_examples(),
min_freq=self.cfg['min_action_freq'],
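A side effect of the change above: the missing-lexeme-norms message (W033) is no longer raised as a `UserWarning` but logged at DEBUG level on the `spacy` logger configured in `spacy/util.py` further down. A minimal sketch for making it visible again:

```python
import logging

# W033 now goes through logging.getLogger("spacy").debug(...) instead of
# warnings.warn(...), so raise the logger's level to see it during training.
logging.getLogger("spacy").setLevel(logging.DEBUG)
```

The updated `test_ner_warns_no_lookups` below asserts on this via pytest's `caplog` fixture.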
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 5fb5f0914..d6e345336 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
- ner.begin_training([])
+ ner.begin_training(lambda: [])
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
@@ -41,7 +41,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
- ner.begin_training([])
+ ner.begin_training(lambda: [])
ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
doc.ents = list(doc.ents)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 88dfabdc8..fce5f679f 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
- parser.begin_training([], **parser.cfg)
+ parser.begin_training(lambda: [], **parser.cfg)
sgd = Adam(0.001)
for i in range(5):
@@ -75,7 +75,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
- ner1.begin_training([])
+ ner1.begin_training(lambda: [])
ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 0ffe74273..c7a1ed0d2 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -1,17 +1,17 @@
import pytest
-
from spacy import util
from spacy.lang.en import English
-
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.gold import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab
+import logging
from ..util import make_tempdir
+
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
@@ -56,6 +56,7 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
example = Example.from_dict(doc, {"entities": entity_annots})
@@ -332,19 +333,21 @@ def test_overfitting_IO():
assert ents2[0].label_ == "LOC"
-def test_ner_warns_no_lookups():
+def test_ner_warns_no_lookups(caplog):
nlp = English()
assert nlp.lang in util.LEXEME_NORM_LANGS
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
- with pytest.warns(UserWarning):
+ with caplog.at_level(logging.DEBUG):
nlp.begin_training()
+ assert "W033" in caplog.text
+ caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
- with pytest.warns(None) as record:
+ with caplog.at_level(logging.DEBUG):
nlp.begin_training()
- assert not record.list
+ assert "W033" not in caplog.text
@Language.factory("blocker")
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 939181419..594498b0b 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -28,7 +28,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
- parser.begin_training([], **parser.cfg)
+ parser.begin_training(lambda: [], **parser.cfg)
sgd = Adam(0.001)
for i in range(10):
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index bb93cf118..b3fb6d0fc 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -136,7 +136,7 @@ def test_kb_undefined(nlp):
"""Test that the EL can't train without defining a KB"""
entity_linker = nlp.add_pipe("entity_linker", config={})
with pytest.raises(ValueError):
- entity_linker.begin_training()
+ entity_linker.begin_training(lambda: [])
def test_kb_empty(nlp):
@@ -145,7 +145,7 @@ def test_kb_empty(nlp):
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
- entity_linker.begin_training()
+ entity_linker.begin_training(lambda: [])
def test_candidate_generation(nlp):
@@ -249,7 +249,7 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
- el_pipe.begin_training()
+ el_pipe.begin_training(lambda: [])
el_pipe.incl_context = False
el_pipe.incl_prior = True
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 17add7391..66c27b233 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -54,7 +54,7 @@ def test_textcat_learns_multilabel():
textcat = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
textcat.add_label(letter)
- optimizer = textcat.begin_training()
+ optimizer = textcat.begin_training(lambda: [])
for i in range(30):
losses = {}
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -104,7 +104,7 @@ def test_overfitting_IO():
doc = nlp(test_text)
cats = doc.cats
# note that by default, exclusive_classes = false so we need a bigger error margin
- assert cats["POSITIVE"] > 0.9
+ assert cats["POSITIVE"] > 0.8
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Also test the results are still the same after IO
@@ -113,7 +113,7 @@ def test_overfitting_IO():
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
cats2 = doc2.cats
- assert cats2["POSITIVE"] > 0.9
+ assert cats2["POSITIVE"] > 0.8
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index a09c6f4fb..259ca9b0c 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -25,7 +25,6 @@ def test_issue2070():
assert len(doc) == 11
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@@ -135,7 +134,6 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index cf43e1a17..3882df0a6 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -20,7 +20,7 @@ def test_issue2564():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
- tagger.begin_training()
+ tagger.begin_training(lambda: [])
doc = nlp("hello world")
assert doc.is_tagged
docs = nlp.pipe(["hello", "world"])
@@ -136,7 +136,6 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 98a6b9aa0..3059eb5ab 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -90,7 +90,6 @@ def test_issue3199():
assert list(doc[0:3].noun_chunks) == []
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index de554a5ec..fc2a3ed7c 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -91,7 +91,6 @@ def test_issue_3526_3(en_vocab):
assert new_ruler.overwrite is not ruler.overwrite
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
patterns = [{"label": "ORG", "pattern": "Apple"}]
@@ -252,7 +251,6 @@ def test_issue3803():
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
@@ -270,7 +268,6 @@ def test_issue3830_no_subtok():
assert "subtok" not in parser.labels
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
@@ -333,7 +330,6 @@ def test_issue3879(en_vocab):
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 423015106..1789973e9 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -81,7 +81,6 @@ def test_issue4030():
assert doc.cats["inoffensive"] == 0.0
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
@@ -110,7 +109,6 @@ def test_issue4042():
assert doc2.ents[0].label_ == "MY_ORG"
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
@@ -242,7 +240,6 @@ def test_issue4190():
assert result_1b == result_2
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
@@ -303,7 +300,7 @@ def test_issue4313():
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
- ner.begin_training([])
+ ner.begin_training(lambda: [])
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
@@ -324,7 +321,6 @@ def test_issue4313():
entity_scores[(start, end, label)] += score
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 96d4e1ca4..1e655851f 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -179,7 +179,6 @@ def test_issue4707():
assert "entity_ruler" in new_nlp.pipe_names
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
@@ -198,7 +197,6 @@ def test_issue4725_1():
assert ner2.cfg["update_with_oracle_cut_size"] == 111
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
index 3c1cee5c3..c7a70a99c 100644
--- a/spacy/tests/regression/test_issue5152.py
+++ b/spacy/tests/regression/test_issue5152.py
@@ -1,8 +1,7 @@
-import pytest
from spacy.lang.en import English
+import pytest
-@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
# Test that the comparison between a Span and a Token, goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@@ -14,6 +13,8 @@ def test_issue5152():
span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being
token = y[0] # Let
- assert span.similarity(token) == 0.0
+ with pytest.warns(UserWarning):
+ assert span.similarity(token) == 0.0
assert span.similarity(span_2) == 1.0
- assert span_2.similarity(span_3) < 1.0
+ with pytest.warns(UserWarning):
+ assert span_2.similarity(span_3) < 1.0
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 4c6504f6b..93069d9a3 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -62,7 +62,7 @@ def tagger():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
- tagger.begin_training(pipeline=nlp.pipeline)
+ tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
return tagger
@@ -81,7 +81,7 @@ def entity_linker():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
- entity_linker.begin_training(pipeline=nlp.pipeline)
+ entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
return entity_linker
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 8e3c95823..1de137e81 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -4,7 +4,7 @@ import spacy
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.language import Language
-from spacy.util import registry, deep_merge_configs, load_model_from_config
+from spacy.util import registry, load_model_from_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
@@ -194,37 +194,6 @@ def test_serialize_parser():
assert upper.get_dim("nI") == 66
-def test_deep_merge_configs():
- config = {"a": "hello", "b": {"c": "d"}}
- defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
- merged = deep_merge_configs(config, defaults)
- assert len(merged) == 2
- assert merged["a"] == "hello"
- assert merged["b"] == {"c": "d", "f": "g"}
- config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
- defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
- merged = deep_merge_configs(config, defaults)
- assert len(merged) == 3
- assert merged["a"] == "hello"
- assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
- assert merged["c"] == 100
- config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
- defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
- merged = deep_merge_configs(config, defaults)
- assert len(merged) == 3
- assert merged["a"] == "hello"
- assert merged["b"] == {"@test": "x", "foo": 1}
- assert merged["c"] == 100
- # Test that leaving out the factory just adds to existing
- config = {"a": "hello", "b": {"foo": 1}, "c": 100}
- defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
- merged = deep_merge_configs(config, defaults)
- assert len(merged) == 3
- assert merged["a"] == "hello"
- assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
- assert merged["c"] == 100
-
-
def test_config_nlp_roundtrip():
"""Test that a config prduced by the nlp object passes training config
validation."""
@@ -311,3 +280,22 @@ def test_config_overrides():
nlp = spacy.load(d)
assert isinstance(nlp, English)
assert nlp.pipe_names == ["tok2vec", "tagger"]
+
+
+def test_config_interpolation():
+ config = Config().from_str(nlp_config_string, interpolate=False)
+ assert config["training"]["train_corpus"]["path"] == "${paths:train}"
+ interpolated = config.interpolate()
+ assert interpolated["training"]["train_corpus"]["path"] == ""
+ nlp = English.from_config(config)
+ assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
+ # Ensure that variables are preserved in nlp config
+ width = "${components.tok2vec.model:width}"
+ assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
+ assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
+ interpolated2 = nlp.config.interpolate()
+ assert interpolated2["training"]["train_corpus"]["path"] == ""
+ assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
+ nlp2 = English.from_config(interpolated)
+ assert nlp2.config["training"]["train_corpus"]["path"] == ""
+ assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
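For context, a minimal sketch of the interpolation semantics the new test relies on, using the same `Config.from_str(..., interpolate=False)` and `.interpolate()` calls; the config string and values below are illustrative, not taken from the test:

```python
from thinc.api import Config

CFG = """
[paths]
train = "corpus/train.spacy"

[training]

[training.train_corpus]
path = ${paths:train}
"""

# With interpolate=False, variable references are preserved as-is.
config = Config().from_str(CFG, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths:train}"

# interpolate() resolves them against the [paths] section.
filled = config.interpolate()
assert filled["training"]["train_corpus"]["path"] == "corpus/train.spacy"
```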
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index b5cc6fff8..1da257fd5 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,11 +1,14 @@
import pytest
-
from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, validate
from spacy.cli.pretrain import make_docs
+from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
+from spacy.cli.init_config import RecommendationSchema
from spacy.cli._util import validate_project_commands, parse_config_overrides
+from spacy.util import get_lang_class
+import srsly
def test_cli_converters_conllu2json():
@@ -319,3 +322,20 @@ def test_parse_config_overrides(args, expected):
def test_parse_config_overrides_invalid(args):
with pytest.raises(SystemExit):
parse_config_overrides(args)
+
+
+@pytest.mark.parametrize("lang", ["en", "nl"])
+@pytest.mark.parametrize(
+ "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
+)
+@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
+def test_init_config(lang, pipeline, optimize):
+ # TODO: add more tests and also check for GPU with transformers
+ init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)
+
+
+def test_model_recommendations():
+ recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
+ for lang, data in recommendations.items():
+ assert get_lang_class(lang)
+ assert RecommendationSchema(**data)
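A hedged sketch mirroring the parametrized test above; passing `"-"` as the output path presumably writes the generated starter config to stdout, which is how the test calls it:

```python
from spacy.cli.init_config import init_config

# Generate a CPU-optimized starter config for an English tagger/parser pipeline.
init_config("-", lang="en", pipeline=["tagger", "parser"], optimize="efficiency", cpu=True)
```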
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 708c57837..334d9fc24 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -154,6 +154,7 @@ def test_example_from_dict_some_ner(en_vocab):
assert ner_tags == ["U-LOC", None, None, None]
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_json2docs_no_ner(en_vocab):
data = [
{
@@ -506,6 +507,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_make_orth_variants(doc):
nlp = English()
with make_tempdir() as tmpdir:
@@ -586,7 +588,7 @@ def test_tuple_format_implicit():
("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
(
"Spotify steps up Asia expansion",
- {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+ {"entities": [(0, 7, "ORG"), (17, 21, "LOC")]},
),
("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]
@@ -601,7 +603,7 @@ def test_tuple_format_implicit_invalid():
("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
(
"Spotify steps up Asia expansion",
- {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+ {"entities": [(0, 7, "ORG"), (17, 21, "LOC")]},
),
("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index df6489aa8..321eaae95 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -46,6 +46,7 @@ def test_Example_from_dict_with_tags(pred_words, annots):
assert aligned_tags == ["NN" for _ in predicted]
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_aligned_tags():
pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]
gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]
@@ -198,8 +199,8 @@ def test_Example_from_dict_with_entities(annots):
def test_Example_from_dict_with_entities_invalid(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
- example = Example.from_dict(predicted, annots)
- # TODO: shouldn't this throw some sort of warning ?
+ with pytest.warns(UserWarning):
+ example = Example.from_dict(predicted, annots)
assert len(list(example.reference.ents)) == 0
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ce7b5cb1c..a13299fff 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -24,6 +24,7 @@ from .util import registry
from .attrs import intify_attrs
from .symbols import ORTH
from .scorer import Scorer
+from .gold import validate_examples
cdef class Tokenizer:
@@ -712,6 +713,7 @@ cdef class Tokenizer:
return tokens
def score(self, examples, **kwargs):
+ validate_examples(examples, "Tokenizer.score")
return Scorer.score_tokenization(examples)
def to_disk(self, path, **kwargs):
diff --git a/spacy/util.py b/spacy/util.py
index d10f83789..3cf165a4f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -24,6 +24,7 @@ import tempfile
import shutil
import shlex
import inspect
+import logging
try:
import cupy.random
@@ -54,10 +55,19 @@ if TYPE_CHECKING:
from .vocab import Vocab # noqa: F401
-_PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+# Default order of sections in the config.cfg. Not all sections need to exist,
+# and additional sections are added at the end, in alphabetical order.
+# fmt: off
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+# fmt: on
+
+
+logging.basicConfig()
+logger = logging.getLogger("spacy")
+
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
@@ -109,11 +119,6 @@ class SimpleFrozenDict(dict):
raise NotImplementedError(self.error)
-def set_env_log(value: bool) -> None:
- global _PRINT_ENV
- _PRINT_ENV = value
-
-
def lang_class_is_loaded(lang: str) -> bool:
"""Check whether a Language class is already loaded. Language classes are
loaded lazily, to avoid expensive setup code associated with the language
@@ -264,9 +269,7 @@ def load_model_from_path(
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
- if not config_path.exists() or not config_path.is_file():
- raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
- config = Config().from_disk(config_path, overrides=dict_to_dot(config))
+ config = load_config(config_path, overrides=dict_to_dot(config))
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
return nlp.from_disk(model_path, exclude=disable)
@@ -317,6 +320,29 @@ def load_model_from_init_py(
)
+def load_config(
+ path: Union[str, Path],
+ overrides: Dict[str, Any] = SimpleFrozenDict(),
+ interpolate: bool = False,
+) -> Config:
+ """Load a config file. Takes care of path validation and section order."""
+ config_path = ensure_path(path)
+ if not config_path.exists() or not config_path.is_file():
+ raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+ return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
+ config_path, overrides=overrides, interpolate=interpolate
+ )
+
+
+def load_config_from_str(
+ text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
+):
+ """Load a full config from a string."""
+ return Config(section_order=CONFIG_SECTION_ORDER).from_str(
+ text, overrides=overrides, interpolate=interpolate,
+ )
+
+
def get_installed_models() -> List[str]:
"""List all model packages currently installed in the environment.
@@ -602,27 +628,6 @@ def get_async(stream, numpy_array):
return array
-def env_opt(name: str, default: Optional[Any] = None) -> Optional[Any]:
- if type(default) is float:
- type_convert = float
- else:
- type_convert = int
- if "SPACY_" + name.upper() in os.environ:
- value = type_convert(os.environ["SPACY_" + name.upper()])
- if _PRINT_ENV:
- print(name, "=", repr(value), "via", "$SPACY_" + name.upper())
- return value
- elif name in os.environ:
- value = type_convert(os.environ[name])
- if _PRINT_ENV:
- print(name, "=", repr(value), "via", "$" + name)
- return value
- else:
- if _PRINT_ENV:
- print(name, "=", repr(default), "by default")
- return default
-
-
def read_regex(path: Union[str, Path]) -> Pattern:
path = ensure_path(path)
with path.open(encoding="utf8") as file_:
@@ -923,45 +928,6 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
raise ValueError(Errors.E961.format(config=config)) from None
-def deep_merge_configs(
- config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
-) -> Config:
- """Deep merge two configs, a base config and its defaults. Ignores
- references to registered functions to avoid filling in
-
- config (Dict[str, Any]): The config.
- destination (Dict[str, Any]): The config defaults.
- RETURNS (Dict[str, Any]): The merged config.
- """
- config = copy_config(config)
- merged = _deep_merge_configs(config, defaults)
- return Config(merged)
-
-
-def _deep_merge_configs(
- config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
-) -> Union[Dict[str, Any], Config]:
- for key, value in defaults.items():
- if isinstance(value, dict):
- node = config.setdefault(key, {})
- if not isinstance(node, dict):
- continue
- promises = [key for key in value if key.startswith("@")]
- promise = promises[0] if promises else None
- # We only update the block from defaults if it refers to the same
- # registered function
- if (
- promise
- and any(k.startswith("@") for k in node)
- and (promise in node and node[promise] != value[promise])
- ):
- continue
- defaults = _deep_merge_configs(node, value)
- elif key not in config:
- config[key] = value
- return config
-
-
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
"""Convert dot notation to a dict. For example: {"token.pos": True,
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
@@ -1067,24 +1033,7 @@ class DummyTokenizer:
def create_default_optimizer() -> Optimizer:
- # TODO: Do we still want to allow env_opt?
- learn_rate = env_opt("learn_rate", 0.001)
- beta1 = env_opt("optimizer_B1", 0.9)
- beta2 = env_opt("optimizer_B2", 0.999)
- eps = env_opt("optimizer_eps", 1e-8)
- L2 = env_opt("L2_penalty", 1e-6)
- grad_clip = env_opt("grad_norm_clip", 10.0)
- L2_is_weight_decay = env_opt("L2_is_weight_decay", False)
- optimizer = Adam(
- learn_rate,
- L2=L2,
- beta1=beta1,
- beta2=beta2,
- eps=eps,
- grad_clip=grad_clip,
- L2_is_weight_decay=L2_is_weight_decay,
- )
- return optimizer
+ return Adam()
def minibatch(items, size):
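The two helpers added above give a single entry point for reading configs and build the `Config` with `section_order=CONFIG_SECTION_ORDER`, so known sections come out in a canonical order. A minimal usage sketch; the path and override values are illustrative:

```python
from spacy.util import load_config, load_config_from_str

# Parse a config from a string. ${...} variables stay unresolved unless
# interpolate=True is passed.
config = load_config_from_str("""
[nlp]
lang = "en"
pipeline = []
""")

# Load from disk, applying dot-notation overrides while reading:
# config = load_config("./config.cfg", overrides={"paths.train": "./train.spacy"})
```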
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 73631c64a..cc6f44fcc 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -274,7 +274,7 @@ architectures into your training config.
| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
-### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
+### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
> #### Example Config
>
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 5c971effa..be7a2b499 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -16,9 +16,11 @@ menu:
- ['Project', 'project']
---
-For a list of available commands, type `spacy --help`.
-
-
+spaCy's CLI provides a range of helpful commands for downloading and training
+models, converting data and debugging your config, data and installation. For a
+list of available commands, you can type `python -m spacy --help`. You can also
+add the `--help` flag to any command or subcommand to see the description,
+available arguments and usage.
## Download {#download}
@@ -41,13 +43,13 @@ the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`).
$ python -m spacy download [model] [--direct] [pip args]
```
-| Argument | Type | Description |
-| ------------------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | positional | Model name, e.g. `en_core_web_sm`.. |
-| `--direct`, `-d` | flag | Force direct download of exact model version. |
-| pip args
-
+
diff --git a/website/src/components/search.js b/website/src/components/search.js
index 4581516c2..eeab9ef40 100644
--- a/website/src/components/search.js
+++ b/website/src/components/search.js
@@ -41,6 +41,6 @@ Search.propTypes = {
apiKey: PropTypes.string.isRequired,
indexName: PropTypes.string.isRequired,
}).isRequired,
- id: PropTypes.string.isRequired,
- placeholder: PropTypes.string.isRequired,
+ id: PropTypes.string,
+ placeholder: PropTypes.string,
}
diff --git a/website/src/images/icons/package.svg b/website/src/images/icons/package.svg
new file mode 100644
index 000000000..4edaf4e6f
--- /dev/null
+++ b/website/src/images/icons/package.svg
@@ -0,0 +1,5 @@
+
diff --git a/website/src/styles/infobox.module.sass b/website/src/styles/infobox.module.sass
index baf9919c3..8d6071f18 100644
--- a/website/src/styles/infobox.module.sass
+++ b/website/src/styles/infobox.module.sass
@@ -14,6 +14,21 @@
font-size: inherit
line-height: inherit
+ ul li
+ padding-left: 0.75em
+
+.list ul li
+ font-size: var(--font-size-sm)
+ list-style: none
+ padding: 0
+ margin: 0 0 0.35rem 0
+
+ &:before
+ all: initial
+
+ a, a span
+ border-bottom: 0 !important
+
.title
font-weight: bold
color: var(--color-theme)
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index a10bacca1..91dd19f85 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -124,6 +124,16 @@
& > span
display: block
+.small
+ font-size: var(--font-size-code)
+ line-height: 1.65
+ white-space: pre-wrap
+ max-height: 400px
+ overflow-y: auto
+
+ & > span
+ display: inline
+
.hide-prompts .prompt:before
content: initial !important
diff --git a/website/src/widgets/quickstart-training-generator.js b/website/src/widgets/quickstart-training-generator.js
new file mode 100644
index 000000000..c7f856073
--- /dev/null
+++ b/website/src/widgets/quickstart-training-generator.js
@@ -0,0 +1,12 @@
+// This file was auto-generated by jinja_to_js.py based on quickstart_training.jinja
+import jinjaToJS from "jinja-to-js";export default function templateQuickstartTraining(ctx) {
+ var __result = "";
+ var __tmp;
+ var __runtime = jinjaToJS.runtime;
+ var __filters = jinjaToJS.filters;
+ var __globals = jinjaToJS.globals;
+ var context = jinjaToJS.createContext(ctx);
+ var use_transformer = context.transformer_data && context.hardware!=="cpu";var transformer = (use_transformer ? context.transformer_data[context.optimize] : {});__result += "[paths]\ntrain = \"\"\ndev = \"\"\n\n[system]\nuse_pytorch_for_gpu_memory = ";__result += "" + __runtime.escape((__tmp = ((use_transformer ? "true" : "false"))) == null ? "" : __tmp);__result += "\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"";var full_pipeline = [(use_transformer ? "transformer" : "tok2vec")].concat(context.components);__result += "\npipeline = ";__result += "" + ((__tmp = (JSON.stringify(full_pipeline).split("'").join("\""))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n";if(__runtime.boolean(use_transformer)){__result += "[components.transformer]\nfactory = \"transformer\"\n\n[components.transformer.model]\n@architectures = \"spacy-transformers.TransformerModel.v1\"\nname = \"";__result += "" + __runtime.escape((__tmp = (transformer["name"])) == null ? "" : __tmp);__result += "\"\ntokenizer_config = {\"use_fast\": true}\n\n[components.transformer.model.get_spans]\n@span_getters = \"strided_spans.v1\"\nwindow = 128\nstride = 96\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.tagger.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = false\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.parser.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 3\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = false\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.ner.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"\n";}__result += "\n";} else {if(context.hardware==="gpu"){__result += "# There are no recommended transformer weights available for language '";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "'\n# yet, so the pipeline described here is not transformer-based.";}__result += "\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? 
true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? "" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "\n[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";}__result += "\n";}__result += "\n\n";__runtime.each(context.components,function(pipe){var __$0 = context.pipe;context.pipe = pipe;__result += "\n";if(!["tagger","parser","ner"].includes(pipe)){__result += "\n";__result += "\n[components.";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "]\nfactory = \"";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "\"\n";}__result += "\n";context.pipe = __$0;});__result += "\n\n[training]\n";if(__runtime.boolean(use_transformer) || context.optimize==="efficiency" || !__runtime.boolean(context.word_vectors)){__result += "vectors = null\n";} else {__result += "vectors = \"";__result += "" + __runtime.escape((__tmp = (context.word_vectors)) == null ? "" : __tmp);__result += "\"\n";}if(__runtime.boolean(use_transformer)){__result += "accumulate_gradient = ";__result += "" + __runtime.escape((__tmp = (transformer["size_factor"])) == null ? "" : __tmp);__result += "\n";}__result += "\n\n[training.optimizer]\n@optimizers = \"Adam.v1\"\n\n[training.optimizer.learn_rate]\n@schedules = \"warmup_linear.v1\"\nwarmup_steps = 250\ntotal_steps = 20000\ninitial_rate = 5e-5\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\nmax_length = ";__result += "" + __runtime.escape((__tmp = ((context.hardware==="gpu" ? 500 : 0))) == null ? 
"" : __tmp);__result += "\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\nmax_length = 0\n\n";if(__runtime.boolean(use_transformer)){__result += "\n[training.batcher]\n@batchers = \"batch_by_padded.v1\"\ndiscard_oversize = true\nsize = 2000\nbuffer = 256";} else {__result += "\n[training.batcher]\n@batchers = \"batch_by_words.v1\"\ndiscard_oversize = false\ntolerance = 0.2\n\n[training.batcher.size]\n@schedules = \"compounding.v1\"\nstart = 100\nstop = 1000\ncompound = 1.001\n";}__result += "\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";}
+ return __result;
+}
+export const DATA = {"en": {"word_vectors": "en_vectors_web_lg", "transformer": {"efficiency": {"name": "roberta-base", "size_factor": 3}, "accuracy": {"name": "roberta-base", "size_factor": 3}}}, "de": {"word_vectors": null, "transformer": null}}
\ No newline at end of file
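
Note: the generator module above is machine-written (it appears to be compiled from the Jinja quickstart template), so its public surface is easiest to read from its exports: a default-exported template function and the DATA table of per-language recommendations. The sketch below is illustrative only and not part of the patch; it simply mirrors the call the widget in the next file makes, with made-up selection values.

    // Usage sketch (illustrative, not part of the PR): render a partial training
    // config for English with a tagger + parser pipeline, optimized for efficiency.
    import generator, { DATA } from './quickstart-training-generator'

    const lang = 'en'
    const reco = DATA[lang] || {} // recommended word vectors / transformer for this language
    const baseConfig = generator({
        lang,
        components: ['tagger', 'parser'],
        optimize: 'efficiency',
        hardware: 'cpu',
        transformer_data: reco.transformer,
        word_vectors: reco.word_vectors,
    })
    // Collapse runs of blank lines, as the widget does before displaying the result.
    console.log(baseConfig.trim().replace(/\n\n\n+/g, '\n\n'))
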
diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js
index b7920dd02..4e379e5ec 100644
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@@ -1,13 +1,19 @@
import React, { useState } from 'react'
import { StaticQuery, graphql } from 'gatsby'
+import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
-import { Quickstart, QS } from '../components/quickstart'
+import { Quickstart } from '../components/quickstart'
+import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
+import { isString, htmlToReact } from '../components/util'
const DEFAULT_LANG = 'en'
+const DEFAULT_HARDWARE = 'gpu'
+const DEFAULT_OPT = 'efficiency'
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
-const COMMENT = `# This is an auto-generated partial config for training a model.
-# To use it for training, auto-fill it with all default values.
-# python -m spacy init config config.cfg --base base_config.cfg`
+const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train',
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg`
+
const DATA = [
{
id: 'lang',
@@ -25,9 +31,8 @@ const DATA = [
id: 'hardware',
title: 'Hardware',
options: [
- { id: 'cpu-only', title: 'CPU only' },
- { id: 'cpu', title: 'CPU preferred' },
- { id: 'gpu', title: 'GPU', checked: true },
+ { id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
+ { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
],
},
{
@@ -35,28 +40,45 @@ const DATA = [
title: 'Optimize for',
help: '...',
options: [
- { id: 'efficiency', title: 'efficiency', checked: true },
- { id: 'accuracy', title: 'accuracy' },
+ { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
+ { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
],
},
- {
- id: 'config',
- title: 'Configuration',
- options: [
- {
- id: 'independent',
- title: 'independent components',
- help: "Make components independent and don't share weights",
- },
- ],
- multiple: true,
- },
]
+function stringify(value) {
+ if (isString(value) && value.startsWith('${')) return value
+ const string = JSON.stringify(value)
+ if (Array.isArray(value)) return string.replace(/,/g, ', ')
+ return string
+}
+
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
const [lang, setLang] = useState(DEFAULT_LANG)
- const [pipeline, setPipeline] = useState([])
- const setters = { lang: setLang, components: setPipeline }
+ const [components, setComponents] = useState([])
+ const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
+ const [[optimize], setOptimize] = useState([DEFAULT_OPT])
+ const setters = {
+ lang: setLang,
+ components: setComponents,
+ hardware: setHardware,
+ optimize: setOptimize,
+ }
+ const reco = GENERATOR_DATA[lang] || {}
+ const content = generator({
+ lang,
+ components,
+ optimize,
+ hardware,
+ transformer_data: reco.transformer,
+ word_vectors: reco.word_vectors,
+ })
+ const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
+ const rawContent = `${COMMENT}\n${rawStr}`
+ const displayContent = highlightCode('ini', rawContent)
+ .split('\n')
+        .map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
+ .join('\n')
     return (
         <StaticQuery
             query={query}
             render={({ site }) => {
                 const langs = site.siteMetadata.languages
-                const recommendedTrf = Object.assign(
-                    {},
-                    ...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
-                )
                 return (
                     <Quickstart download={download} data={DATA} title={title} id={id} setters={setters}>
-                        {COMMENT}
-                        [paths]
-                        train = ""
-                        dev = ""
-
-                        [nlp]
-                        lang = "{lang}"
-                        pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}
-
-                        [components]
-
-                        [components.transformer]
-                        name = "{recommendedTrf[lang].sm}"
-                        name = "{recommendedTrf[lang].lg}"
-                        {!!pipeline.length && <br />}
-                        {pipeline.map((pipe, i) => (
-                            <>
-                                {i !== 0 && <br />}
-                                [components.{pipe}]
-                                factory = "{pipe}"
-
-
-                        [components.parser.model.tok2vec]
-
-                        @architectures = "spacy.Tok2Vec.v1"
-
-                            </>
-                        ))}
+                        {htmlToReact(displayContent)}
                     </Quickstart>
)
}}
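
Aside: the stringify() helper introduced in this file is easiest to follow with concrete inputs. The sketch below is illustrative and not part of the patch; the helper body is repeated (with isString swapped for a plain typeof check) only to keep the snippet self-contained. It shows that config variable references such as ${paths:train} are passed through unquoted so they stay interpolable, while arrays are re-spaced to match the generated config style.

    // Behavior sketch of the stringify() helper (copied here for self-containment).
    function stringify(value) {
        if (typeof value === 'string' && value.startsWith('${')) return value
        const string = JSON.stringify(value)
        if (Array.isArray(value)) return string.replace(/,/g, ', ')
        return string
    }

    stringify('${paths:train}')      // -> ${paths:train}   (raw variable reference, left unquoted)
    stringify(['tagger', 'parser'])  // -> ["tagger", "parser"]
    stringify('en')                  // -> "en"
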