Update for new Thinc and adjust config

Ines Montani 2020-08-13 17:38:30 +02:00
parent 965805f372
commit 88b0a96801
19 changed files with 433 additions and 414 deletions

View File

@ -1,5 +1,5 @@
recursive-include include *.h
recursive-include spacy *.pyx *.pxd *.txt *.cfg
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
include LICENSE
include README.md
include pyproject.toml

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a23,<8.0.0a30",
"thinc>=8.0.0a25,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a23,<8.0.0a30
thinc>=8.0.0a25,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
@ -26,3 +26,4 @@ pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
jinja2

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a23,<8.0.0a30
thinc>=8.0.0a25,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a23,<8.0.0a30
thinc>=8.0.0a25,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -49,7 +49,7 @@ def debug_config_cli(
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
config = Config().from_disk(config_path, overrides=overrides)
config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
try:
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
except ValueError as e:
@ -134,7 +134,9 @@ def debug_data(
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path):
cfg = Config().from_disk(config_path, overrides=config_overrides)
cfg = Config().from_disk(
config_path, overrides=config_overrides, interpolate=False
)
nlp, config = util.load_model_from_config(cfg)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)
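The interpolate=False pattern above keeps ${...} variable references intact until the config is explicitly resolved. A minimal sketch of the Config behaviour this relies on (the file name, override key and path value are illustrative, not part of this commit):

from thinc.api import Config

# Load without resolving ${...} variables; dot-notation overrides still apply.
cfg = Config().from_disk("config.cfg", overrides={"training.seed": 0}, interpolate=False)
cfg["training"]["train_corpus"]["path"]                 # e.g. "${paths:train}", kept as a reference
cfg.interpolate()["training"]["train_corpus"]["path"]   # resolved against the [paths] section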

View File

@ -1,7 +1,7 @@
from typing import Dict, Any, Optional
from pathlib import Path
from wasabi import msg
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer
@ -49,16 +49,16 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
cfg = Config().from_disk(config_path, overrides=config_overrides)
try:
nlp, config = util.load_model_from_config(cfg)
nlp, config = util.load_model_from_config_path(
config_path, overrides=config_overrides
)
except ValueError as e:
msg.fail(str(e), exits=1)
seed = config["pretraining"]["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
pipe = nlp.get_pipe(component)
if hasattr(pipe, "model"):
model = pipe.model

View File

@ -1,81 +1,107 @@
from typing import Optional, List
from enum import Enum
from pathlib import Path
from thinc.api import Config
from wasabi import msg
from wasabi import Printer
import srsly
import re
from ..util import load_model_from_config, get_lang_class, load_model
from ._util import init_cli, Arg, Opt, show_validation_error
from ..util import load_model_from_config, get_lang_class
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
TEMPLATE_PATH = Path(__file__).parent / "templates" / "quickstart_training.jinja"
class Optimizations(str, Enum):
efficiency = "efficiency"
accuracy = "accuracy"
@init_cli.command("config")
def init_config_cli(
# fmt: off
output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
# TODO: base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
# fmt: on
):
"""Generate a starter config.cfg for training."""
validate_cli_args(base_path, model, lang)
is_stdout = str(output_path) == "-"
pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
if is_stdout:
print(cfg.to_str())
else:
cfg.to_disk(output_path)
msg.good("Saved config", output_path)
"""
Generate a starter config.cfg for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
"""
if isinstance(optimize, Optimizations): # instance of enum from the CLI
optimize = optimize.value
pipeline = [p.strip() for p in pipeline.split(",")]
init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
def init_config(
output_path: Path,
config_path: Optional[Path],
model: Optional[str],
lang: Optional[str],
pipeline: Optional[List[str]],
silent: bool = False,
) -> Config:
if config_path is not None:
msg.info("Generating config from base config", show=not silent)
with show_validation_error(config_path, hint_init=False):
config = Config().from_disk(config_path)
output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
) -> None:
is_stdout = str(output_file) == "-"
msg = Printer(no_print=is_stdout)
try:
from jinja2 import Template
except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
lang_defaults = get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True)
has_transformer = False # TODO: check this somehow
if has_transformer:
require_spacy_transformers(msg)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
variables = {
"lang": lang,
"pipeline": srsly.json_dumps(pipeline).replace(",", ", "),
"components": pipeline,
"optimize": optimize,
"hardware": "cpu" if cpu else "gpu",
"has_transformer": has_transformer,
"has_letters": has_letters,
}
base_template = template.render(**variables).strip()
# Giving up on getting the newlines right in jinja for now
base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
use_case = {
"Language": lang,
"Pipeline": ", ".join(pipeline),
"Optimize for": optimize,
"Hardware": variables["hardware"].upper(),
}
msg.good("Generated template specific for your use case:")
for label, value in use_case.items():
msg.text(f"- {label}: {value}")
with show_validation_error(hint_init=False):
with msg.loading("Auto-filling config..."):
config = Config().from_str(base_template, interpolate=False)
try:
nlp, _ = load_model_from_config(config, auto_fill=True)
except ValueError as e:
msg.fail(str(e), exits=1)
return nlp.config
if model is not None:
ext = f" with pipeline {pipeline}" if pipeline else ""
msg.info(f"Generating config from model {model}{ext}", show=not silent)
nlp = load_model(model)
for existing_pipe_name in nlp.pipe_names:
if existing_pipe_name not in pipeline:
nlp.remove_pipe(existing_pipe_name)
for pipe_name in pipeline:
if pipe_name not in nlp.pipe_names:
nlp.add_pipe(pipe_name)
return nlp.config
if lang is not None:
ext = f" with pipeline {pipeline}" if pipeline else ""
msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
nlp = get_lang_class(lang)()
for pipe_name in pipeline:
nlp.add_pipe(pipe_name)
return nlp.config
msg.good("Auto-filled config with all values")
if is_stdout:
print(nlp.config.to_str())
else:
nlp.config.to_disk(output_file, interpolate=False)
msg.good("Saved config", output_file)
msg.text("You can now add your data and train your model:")
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
def validate_cli_args(
config_path: Optional[Path], model: Optional[str], lang: Optional[str]
) -> None:
args = {"--base": config_path, "--model": model, "--lang": lang}
if sum(arg is not None for arg in args.values()) != 1:
existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
def require_spacy_transformers(msg):
try:
import spacy_transformers # noqa: F401
except ImportError:
msg.fail(
"The init config command expects only one of the following arguments: "
"--base (base config to fill and update), --lang (language code to "
"use for blank config) or --model (base model to copy config from).",
f"Got: {existing if existing else 'no arguments'}",
"Using a transformer-based pipeline requires spacy-transformers "
"to be installed.",
exits=1,
)
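For reference, a usage sketch of the reworked command (file names and flag values are placeholders):

python -m spacy init config config.cfg --lang en --pipeline tagger,parser,ner --optimize efficiency --cpu
# or write the generated config to stdout
python -m spacy init config - --lang en --pipeline ner
# then add your data and train, as the command suggests at the end:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy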

View File

@ -5,7 +5,7 @@ import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
from wasabi import msg
@ -88,8 +88,9 @@ def pretrain(
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
config = Config().from_disk(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(config)
nlp, config = util.load_model_from_config_path(
config_path, overrides=config_overrides
)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
output_dir.mkdir()

View File

@ -0,0 +1,231 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
# This is an auto-generated config for training a model with 'spacy train'
[paths]
train = ""
dev = ""
[nlp]
lang = "{{ lang }}"
pipeline = {{ pipeline|safe }}
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components]
{# TRANSFORMER PIPELINE #}
{%- if has_transformer -%}
[components.transformer]
factory = "transformer"
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
{#- name = {{ transformer_info["name"] }} #}
name = "roberta-base"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "strided_spans.v1"
window = 128
stride = 96
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.tagger.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.parser.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{%- if hardware == "gpu" -%}
# There are no recommended transformer weights available for language '{{ lang }}'
# yet, so the pipeline described here is not transformer-based.
{%- endif %}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{% if "ner" in components %}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{% endif %}
{% endif %}
{% for pipe in components %}
{% if pipe not in ["tagger", "parser", "ner"] %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
{% endif %}
{% endfor %}
[training]
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" and not has_transformer else false)|safe }}
{% if has_transformer -%}
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #}
accumulate_gradient = 3
{% endif -%}
[training.optimizer]
@optimizers = "Adam.v1"
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
max_length = {{ 500 if hardware == "gpu" else 0 }}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
max_length = 0
{% if has_transformer %}
[training.batcher]
@batchers = "batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
{%- else %}
[training.batcher]
@batchers = "batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
{% endif %}
[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round(2) }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round(2) }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round(2) }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}

View File

@ -75,7 +75,9 @@ def train(
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path):
config = Config().from_disk(config_path, overrides=config_overrides)
config = Config().from_disk(
config_path, overrides=config_overrides, interpolate=False
)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions

View File

@ -36,7 +36,7 @@ from . import about
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
class BaseDefaults:
@ -134,7 +134,7 @@ class Language:
# of the rest.
util.registry._entry_point_factories.get_all()
self._config = util.deep_merge_configs(self.default_config, DEFAULT_CONFIG)
self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
self._path = None
self._optimizer = None
@ -167,9 +167,7 @@ class Language:
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
cls.default_config = util.deep_merge_configs(
cls.Defaults.config, DEFAULT_CONFIG
)
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
cls.default_config["nlp"]["lang"] = cls.lang
@property
@ -532,6 +530,7 @@ class Language:
name: Optional[str] = None,
*,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Create a pipeline component. Mostly used internally. To create and
@ -542,6 +541,7 @@ class Language:
Defaults to factory name if not set.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@ -568,7 +568,7 @@ class Language:
# This is not ideal, but the alternative would mean you always need to
# specify the full config settings, which is not really viable.
if pipe_meta.default_config:
config = util.deep_merge_configs(config, pipe_meta.default_config)
config = Config(pipe_meta.default_config).merge(config)
# We need to create a top-level key because Thinc doesn't allow resolving
# top-level references to registered functions. Also gives nicer errors.
# The name allows components to know their pipe name and use it in the
@ -582,12 +582,14 @@ class Language:
cfg = {factory_name: config}
# We're calling the internal _fill here to avoid constructing the
# registered functions twice
# TODO: customize validation to make it more readable / relate it to
# pipeline component and why it failed, explain default config
resolved, filled = registry.resolve(cfg, validate=validate)
filled = filled[factory_name]
filled["factory"] = factory_name
filled.pop("@factories", None)
# Merge the final filled config with the raw config (including non-
# interpolated variables)
if raw_config:
filled = filled.merge(raw_config)
self._pipe_configs[name] = filled
return resolved[factory_name]
@ -613,7 +615,10 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
pipe_config = util.copy_config(source.config["components"][source_name])
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
return pipe, pipe_config["factory"]
@ -628,6 +633,7 @@ class Language:
last: Optional[bool] = None,
source: Optional["Language"] = None,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Add a component to the processing pipeline. Valid components are
@ -649,6 +655,7 @@ class Language:
component from.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@ -678,7 +685,11 @@ class Language:
lang_code=self.lang,
)
pipe_component = self.create_pipe(
factory_name, name=name, config=config, validate=validate,
factory_name,
name=name,
config=config,
raw_config=raw_config,
validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
@ -1379,7 +1390,7 @@ class Language:
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
config = util.deep_merge_configs(config, cls.default_config)
config = Config(cls.default_config).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"]
@ -1417,16 +1428,20 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
# Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
# Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
pipeline = config.get("components", {})
# To create the components we need to use the final interpolated config
# so all values are available (if component configs use variables).
# Later we replace the component config with the raw config again.
interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {})
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
@ -1435,6 +1450,7 @@ class Language:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name])
if pipe_name not in disable:
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
@ -1444,7 +1460,11 @@ class Language:
# The pipe name (key in the config) here is the unique name
# of the component, not necessarily the factory
nlp.add_pipe(
factory, name=pipe_name, config=pipe_cfg, validate=validate,
factory,
name=pipe_name,
config=pipe_cfg,
validate=validate,
raw_config=raw_config,
)
else:
model = pipe_cfg["source"]

View File

@ -4,7 +4,7 @@ import spacy
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.language import Language
from spacy.util import registry, deep_merge_configs, load_model_from_config
from spacy.util import registry, load_model_from_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
@ -194,37 +194,6 @@ def test_serialize_parser():
assert upper.get_dim("nI") == 66
def test_deep_merge_configs():
config = {"a": "hello", "b": {"c": "d"}}
defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
merged = deep_merge_configs(config, defaults)
assert len(merged) == 2
assert merged["a"] == "hello"
assert merged["b"] == {"c": "d", "f": "g"}
config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
merged = deep_merge_configs(config, defaults)
assert len(merged) == 3
assert merged["a"] == "hello"
assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
assert merged["c"] == 100
config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
merged = deep_merge_configs(config, defaults)
assert len(merged) == 3
assert merged["a"] == "hello"
assert merged["b"] == {"@test": "x", "foo": 1}
assert merged["c"] == 100
# Test that leaving out the factory just adds to existing
config = {"a": "hello", "b": {"foo": 1}, "c": 100}
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
merged = deep_merge_configs(config, defaults)
assert len(merged) == 3
assert merged["a"] == "hello"
assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
assert merged["c"] == 100
def test_config_nlp_roundtrip():
"""Test that a config prduced by the nlp object passes training config
validation."""
@ -311,3 +280,22 @@ def test_config_overrides():
nlp = spacy.load(d)
assert isinstance(nlp, English)
assert nlp.pipe_names == ["tok2vec", "tagger"]
def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
interpolated = config.interpolate()
assert interpolated["training"]["train_corpus"]["path"] == ""
nlp = English.from_config(config)
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
# Ensure that variables are preserved in nlp config
width = "${components.tok2vec.model:width}"
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
assert interpolated2["training"]["train_corpus"]["path"] == ""
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated)
assert nlp2.config["training"]["train_corpus"]["path"] == ""
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342

View File

@ -5,6 +5,7 @@ from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, validate
from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config
from spacy.cli._util import validate_project_commands, parse_config_overrides
@ -319,3 +320,13 @@ def test_parse_config_overrides(args, expected):
def test_parse_config_overrides_invalid(args):
with pytest.raises(SystemExit):
parse_config_overrides(args)
@pytest.mark.parametrize("lang", ["en", "nl"])
@pytest.mark.parametrize(
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
)
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
def test_init_config(lang, pipeline, optimize):
# TODO: add more tests and also check for GPU with transformers
init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)

View File

@ -264,11 +264,31 @@ def load_model_from_path(
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
nlp, _ = load_model_from_config_path(
config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
)
return nlp.from_disk(model_path, exclude=disable)
def load_model_from_config_path(
config_path: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
auto_fill: bool = False,
validate: bool = True,
overrides: Dict[str, Any] = SimpleFrozenDict(),
interpolate: bool = False,
) -> Tuple["Language", Config]:
config_path = ensure_path(config_path)
if not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
config = Config().from_disk(config_path, overrides=dict_to_dot(config))
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
return nlp.from_disk(model_path, exclude=disable)
config = Config().from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
return load_model_from_config(
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
)
def load_model_from_config(
@ -923,45 +943,6 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
raise ValueError(Errors.E961.format(config=config)) from None
def deep_merge_configs(
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
) -> Config:
"""Deep merge two configs, a base config and its defaults. Ignores
references to registered functions to avoid filling in
config (Dict[str, Any]): The config.
destination (Dict[str, Any]): The config defaults.
RETURNS (Dict[str, Any]): The merged config.
"""
config = copy_config(config)
merged = _deep_merge_configs(config, defaults)
return Config(merged)
def _deep_merge_configs(
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
) -> Union[Dict[str, Any], Config]:
for key, value in defaults.items():
if isinstance(value, dict):
node = config.setdefault(key, {})
if not isinstance(node, dict):
continue
promises = [key for key in value if key.startswith("@")]
promise = promises[0] if promises else None
# We only update the block from defaults if it refers to the same
# registered function
if (
promise
and any(k.startswith("@") for k in node)
and (promise in node and node[promise] != value[promise])
):
continue
defaults = _deep_merge_configs(node, value)
elif key not in config:
config[key] = value
return config
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
"""Convert dot notation to a dict. For example: {"token.pos": True,
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.

View File

@ -867,8 +867,10 @@ class JinjaToJS(object):
)
with option(kwargs, use_python_bool_wrapper=False):
if operand.op == "in":
if operand.op == "in" or operand.op == "notin":
# Special case for "in" operator
if operand.op == "notin":
self.output.write("!")
self._process_node(operand.expr, **kwargs)
self.output.write(".includes(")
self._process_node(node.expr, **kwargs)

View File

@ -1,107 +0,0 @@
{# Template for "CPU" configs. The transformer will use a different template. #}
# This is an auto-generated partial config for training a model.
# To use it for training, auto-fill it with all default values.
# python -m spacy init config config.cfg --base base_config.cfg
[paths]
train = ""
dev = ""
[nlp]
lang = "{{ lang }}"
pipeline = {{ pipeline|safe }}
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{% endif -%}
[training]
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round() }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round() }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round() }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}

View File

@ -1,139 +0,0 @@
{# Template for "CPU" configs. The transformer will use a different template. #}
# This is an auto-generated partial config for training a model.
# To use it for training, auto-fill it with all default values.
# python -m spacy init config config.cfg --base base_config.cfg
[paths]
train = ""
dev = ""
[nlp]
lang = "{{ lang }}"
pipeline = {{ pipeline|safe }}
vectors = null
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components]
[components.transformer]
factory = "transformer"
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
{#- name = {{ transformer_info["name"] }} #}
name = "roberta-base"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "strided_spans.v1"
window = 128
stride = 96
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.parser.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
[training]
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #}
accumulate_gradient = 3
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = false
max_length = 500
limit = 0
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = false
max_length = 0
limit = 0
[training.batcher]
@batchers = "batch_by_padded.v1"
discard_oversize = true
batch_size = 2000
[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round(2) }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round(2) }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round(2) }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}

View File

@ -1 +1 @@
python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js

File diff suppressed because one or more lines are too long