mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Update for new Thinc and adjust config
This commit is contained in:
parent
965805f372
commit
88b0a96801
|
@ -1,5 +1,5 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg
|
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include pyproject.toml
|
include pyproject.toml
|
||||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a23,<8.0.0a30",
|
"thinc>=8.0.0a25,<8.0.0a30",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"smart_open>=2.0.0,<3.0.0"
|
"smart_open>=2.0.0,<3.0.0"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a25,<8.0.0a30
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets>=0.1.1
|
ml_datasets>=0.1.1
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -26,3 +26,4 @@ pytest>=4.6.5
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.5.0,<3.6.0
|
flake8>=3.5.0,<3.6.0
|
||||||
|
jinja2
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a25,<8.0.0a30
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a25,<8.0.0a30
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.7.1,<1.1.0
|
wasabi>=0.7.1,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
|
|
@ -49,7 +49,7 @@ def debug_config_cli(
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=overrides)
|
config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
|
||||||
try:
|
try:
|
||||||
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
@ -134,7 +134,9 @@ def debug_data(
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exists=1)
|
msg.fail("Config file not found", config_path, exists=1)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path, overrides=config_overrides)
|
cfg = Config().from_disk(
|
||||||
|
config_path, overrides=config_overrides, interpolate=False
|
||||||
|
)
|
||||||
nlp, config = util.load_model_from_config(cfg)
|
nlp, config = util.load_model_from_config(cfg)
|
||||||
# Use original config here, not resolved version
|
# Use original config here, not resolved version
|
||||||
sourced_components = get_sourced_components(cfg)
|
sourced_components = get_sourced_components(cfg)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
|
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||||
from thinc.api import Model, data_validation
|
from thinc.api import Model, data_validation
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
|
@ -49,16 +49,16 @@ def debug_model_cli(
|
||||||
}
|
}
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path, overrides=config_overrides)
|
|
||||||
try:
|
try:
|
||||||
nlp, config = util.load_model_from_config(cfg)
|
nlp, config = util.load_model_from_config_path(
|
||||||
|
config_path, overrides=config_overrides
|
||||||
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.fail(str(e), exits=1)
|
msg.fail(str(e), exits=1)
|
||||||
seed = config["pretraining"]["seed"]
|
seed = config["pretraining"]["seed"]
|
||||||
if seed is not None:
|
if seed is not None:
|
||||||
msg.info(f"Fixing random seed: {seed}")
|
msg.info(f"Fixing random seed: {seed}")
|
||||||
fix_random_seed(seed)
|
fix_random_seed(seed)
|
||||||
|
|
||||||
pipe = nlp.get_pipe(component)
|
pipe = nlp.get_pipe(component)
|
||||||
if hasattr(pipe, "model"):
|
if hasattr(pipe, "model"):
|
||||||
model = pipe.model
|
model = pipe.model
|
||||||
|
|
|
@ -1,81 +1,107 @@
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from wasabi import msg
|
from wasabi import Printer
|
||||||
|
import srsly
|
||||||
|
import re
|
||||||
|
|
||||||
from ..util import load_model_from_config, get_lang_class, load_model
|
from ..util import load_model_from_config, get_lang_class
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error
|
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||||
|
|
||||||
|
|
||||||
|
TEMPLATE_PATH = Path(__file__).parent / "templates" / "quickstart_training.jinja"
|
||||||
|
|
||||||
|
|
||||||
|
class Optimizations(str, Enum):
|
||||||
|
efficiency = "efficiency"
|
||||||
|
accuracy = "accuracy"
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("config")
|
@init_cli.command("config")
|
||||||
def init_config_cli(
|
def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
|
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
|
||||||
base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
|
# TODO: base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
|
||||||
model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
|
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
|
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
|
||||||
pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
|
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Generate a starter config.cfg for training."""
|
"""
|
||||||
validate_cli_args(base_path, model, lang)
|
Generate a starter config.cfg for training. Based on your requirements
|
||||||
is_stdout = str(output_path) == "-"
|
specified via the CLI arguments, this command generates a config with the
|
||||||
pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
|
optimal settings for your use case. This includes the choice of architecture,
|
||||||
cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
|
pretrained weights and related hyperparameters.
|
||||||
if is_stdout:
|
"""
|
||||||
print(cfg.to_str())
|
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||||
else:
|
optimize = optimize.value
|
||||||
cfg.to_disk(output_path)
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
msg.good("Saved config", output_path)
|
init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
|
||||||
|
|
||||||
|
|
||||||
def init_config(
|
def init_config(
|
||||||
output_path: Path,
|
output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
|
||||||
config_path: Optional[Path],
|
) -> None:
|
||||||
model: Optional[str],
|
is_stdout = str(output_file) == "-"
|
||||||
lang: Optional[str],
|
msg = Printer(no_print=is_stdout)
|
||||||
pipeline: Optional[List[str]],
|
try:
|
||||||
silent: bool = False,
|
from jinja2 import Template
|
||||||
) -> Config:
|
except ImportError:
|
||||||
if config_path is not None:
|
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
||||||
msg.info("Generating config from base config", show=not silent)
|
lang_defaults = get_lang_class(lang).Defaults
|
||||||
with show_validation_error(config_path, hint_init=False):
|
has_letters = lang_defaults.writing_system.get("has_letters", True)
|
||||||
config = Config().from_disk(config_path)
|
has_transformer = False # TODO: check this somehow
|
||||||
|
if has_transformer:
|
||||||
|
require_spacy_transformers(msg)
|
||||||
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
|
template = Template(f.read())
|
||||||
|
variables = {
|
||||||
|
"lang": lang,
|
||||||
|
"pipeline": srsly.json_dumps(pipeline).replace(",", ", "),
|
||||||
|
"components": pipeline,
|
||||||
|
"optimize": optimize,
|
||||||
|
"hardware": "cpu" if cpu else "gpu",
|
||||||
|
"has_transformer": has_transformer,
|
||||||
|
"has_letters": has_letters,
|
||||||
|
}
|
||||||
|
base_template = template.render(**variables).strip()
|
||||||
|
# Giving up on getting the newlines right in jinja for now
|
||||||
|
base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
|
||||||
|
use_case = {
|
||||||
|
"Language": lang,
|
||||||
|
"Pipeline": ", ".join(pipeline),
|
||||||
|
"Optimize for": optimize,
|
||||||
|
"Hardware": variables["hardware"].upper(),
|
||||||
|
}
|
||||||
|
msg.good("Generated template specific for your use case:")
|
||||||
|
for label, value in use_case.items():
|
||||||
|
msg.text(f"- {label}: {value}")
|
||||||
|
with show_validation_error(hint_init=False):
|
||||||
|
with msg.loading("Auto-filling config..."):
|
||||||
|
config = Config().from_str(base_template, interpolate=False)
|
||||||
try:
|
try:
|
||||||
nlp, _ = load_model_from_config(config, auto_fill=True)
|
nlp, _ = load_model_from_config(config, auto_fill=True)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.fail(str(e), exits=1)
|
msg.fail(str(e), exits=1)
|
||||||
return nlp.config
|
msg.good("Auto-filled config with all values")
|
||||||
if model is not None:
|
if is_stdout:
|
||||||
ext = f" with pipeline {pipeline}" if pipeline else ""
|
print(nlp.config.to_str())
|
||||||
msg.info(f"Generating config from model {model}{ext}", show=not silent)
|
else:
|
||||||
nlp = load_model(model)
|
nlp.config.to_disk(output_file, interpolate=False)
|
||||||
for existing_pipe_name in nlp.pipe_names:
|
msg.good("Saved config", output_file)
|
||||||
if existing_pipe_name not in pipeline:
|
msg.text("You can now add your data and train your model:")
|
||||||
nlp.remove_pipe(existing_pipe_name)
|
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
||||||
for pipe_name in pipeline:
|
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
||||||
if pipe_name not in nlp.pipe_names:
|
|
||||||
nlp.add_pipe(pipe_name)
|
|
||||||
return nlp.config
|
|
||||||
if lang is not None:
|
|
||||||
ext = f" with pipeline {pipeline}" if pipeline else ""
|
|
||||||
msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
|
|
||||||
nlp = get_lang_class(lang)()
|
|
||||||
for pipe_name in pipeline:
|
|
||||||
nlp.add_pipe(pipe_name)
|
|
||||||
return nlp.config
|
|
||||||
|
|
||||||
|
|
||||||
def validate_cli_args(
|
def require_spacy_transformers(msg):
|
||||||
config_path: Optional[Path], model: Optional[str], lang: Optional[str]
|
try:
|
||||||
) -> None:
|
import spacy_transformers # noqa: F401
|
||||||
args = {"--base": config_path, "--model": model, "--lang": lang}
|
except ImportError:
|
||||||
if sum(arg is not None for arg in args.values()) != 1:
|
|
||||||
existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
|
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"The init config command expects only one of the following arguments: "
|
"Using a transformer-based pipeline requires spacy-transformers "
|
||||||
"--base (base config to fill and update), --lang (language code to "
|
"to be installed.",
|
||||||
"use for blank config) or --model (base model to copy config from).",
|
|
||||||
f"Got: {existing if existing else 'no arguments'}",
|
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import time
|
||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
|
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||||
from thinc.api import CosineDistance, L2Distance
|
from thinc.api import CosineDistance, L2Distance
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -88,8 +88,9 @@ def pretrain(
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=config_overrides)
|
nlp, config = util.load_model_from_config_path(
|
||||||
nlp, config = util.load_model_from_config(config)
|
config_path, overrides=config_overrides
|
||||||
|
)
|
||||||
# TODO: validate that [pretraining] block exists
|
# TODO: validate that [pretraining] block exists
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
|
|
231
spacy/cli/templates/quickstart_training.jinja
Normal file
231
spacy/cli/templates/quickstart_training.jinja
Normal file
|
@ -0,0 +1,231 @@
|
||||||
|
{# This is a template for training configs used for the quickstart widget in
|
||||||
|
the docs and the init config command. It encodes various best practices and
|
||||||
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
|
# This is an auto-generated config for training a model with 'spacy train'
|
||||||
|
[paths]
|
||||||
|
train = ""
|
||||||
|
dev = ""
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "{{ lang }}"
|
||||||
|
pipeline = {{ pipeline|safe }}
|
||||||
|
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
{# TRANSFORMER PIPELINE #}
|
||||||
|
{%- if has_transformer -%}
|
||||||
|
[components.transformer]
|
||||||
|
factory = "transformer"
|
||||||
|
|
||||||
|
[components.transformer.model]
|
||||||
|
@architectures = "spacy-transformers.TransformerModel.v1"
|
||||||
|
{#- name = {{ transformer_info["name"] }} #}
|
||||||
|
name = "roberta-base"
|
||||||
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
|
[components.transformer.model.get_spans]
|
||||||
|
@span_getters = "strided_spans.v1"
|
||||||
|
window = 128
|
||||||
|
stride = 96
|
||||||
|
|
||||||
|
{% if "tagger" in components %}
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "parser" in components -%}
|
||||||
|
[components.parser]
|
||||||
|
factory = "parser"
|
||||||
|
|
||||||
|
[components.parser.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 8
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = false
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "ner" in components -%}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 3
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = false
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
|
{# NON-TRANSFORMER PIPELINE #}
|
||||||
|
{% else -%}
|
||||||
|
|
||||||
|
{%- if hardware == "gpu" -%}
|
||||||
|
# There are no recommended transformer weights available for language '{{ lang }}'
|
||||||
|
# yet, so the pipeline described here is not transformer-based.
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
||||||
|
also_embed_subwords = {{ true if has_letters else false }}
|
||||||
|
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
width = {{ 96 if optimize == "efficiency" else 256 }}
|
||||||
|
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
{% if "tagger" in components %}
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "parser" in components -%}
|
||||||
|
[components.parser]
|
||||||
|
factory = "parser"
|
||||||
|
|
||||||
|
[components.parser.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 8
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "ner" in components %}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 6
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% for pipe in components %}
|
||||||
|
{% if pipe not in ["tagger", "parser", "ner"] %}
|
||||||
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
|
[components.{{ pipe }}]
|
||||||
|
factory = "{{ pipe }}"
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
[training]
|
||||||
|
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" and not has_transformer else false)|safe }}
|
||||||
|
|
||||||
|
{% if has_transformer -%}
|
||||||
|
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #}
|
||||||
|
accumulate_gradient = 3
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
|
||||||
|
[training.optimizer.learn_rate]
|
||||||
|
@schedules = "warmup_linear.v1"
|
||||||
|
warmup_steps = 250
|
||||||
|
total_steps = 20000
|
||||||
|
initial_rate = 5e-5
|
||||||
|
|
||||||
|
[training.train_corpus]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:train}
|
||||||
|
max_length = {{ 500 if hardware == "gpu" else 0 }}
|
||||||
|
|
||||||
|
[training.dev_corpus]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:dev}
|
||||||
|
max_length = 0
|
||||||
|
|
||||||
|
{% if has_transformer %}
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "batch_by_padded.v1"
|
||||||
|
discard_oversize = true
|
||||||
|
size = 2000
|
||||||
|
buffer = 256
|
||||||
|
{%- else %}
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
{%- if "tagger" in components %}
|
||||||
|
tag_acc = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
{%- endif -%}
|
||||||
|
{%- if "parser" in components %}
|
||||||
|
dep_uas = 0.0
|
||||||
|
dep_las = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
sents_f = 0.0
|
||||||
|
{%- endif %}
|
||||||
|
{%- if "ner" in components %}
|
||||||
|
ents_f = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
ents_p = 0.0
|
||||||
|
ents_r = 0.0
|
||||||
|
{%- endif -%}
|
|
@ -75,7 +75,9 @@ def train(
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config and nlp from: {config_path}")
|
msg.info(f"Loading config and nlp from: {config_path}")
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=config_overrides)
|
config = Config().from_disk(
|
||||||
|
config_path, overrides=config_overrides, interpolate=False
|
||||||
|
)
|
||||||
if config.get("training", {}).get("seed") is not None:
|
if config.get("training", {}).get("seed") is not None:
|
||||||
fix_random_seed(config["training"]["seed"])
|
fix_random_seed(config["training"]["seed"])
|
||||||
# Use original config here before it's resolved to functions
|
# Use original config here before it's resolved to functions
|
||||||
|
|
|
@ -36,7 +36,7 @@ from . import about
|
||||||
|
|
||||||
# This is the base config with all settings (training etc.)
|
# This is the base config with all settings (training etc.)
|
||||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||||||
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
|
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
|
||||||
|
|
||||||
|
|
||||||
class BaseDefaults:
|
class BaseDefaults:
|
||||||
|
@ -134,7 +134,7 @@ class Language:
|
||||||
# of the rest.
|
# of the rest.
|
||||||
util.registry._entry_point_factories.get_all()
|
util.registry._entry_point_factories.get_all()
|
||||||
|
|
||||||
self._config = util.deep_merge_configs(self.default_config, DEFAULT_CONFIG)
|
self._config = DEFAULT_CONFIG.merge(self.default_config)
|
||||||
self._meta = dict(meta)
|
self._meta = dict(meta)
|
||||||
self._path = None
|
self._path = None
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
@ -167,9 +167,7 @@ class Language:
|
||||||
|
|
||||||
def __init_subclass__(cls, **kwargs):
|
def __init_subclass__(cls, **kwargs):
|
||||||
super().__init_subclass__(**kwargs)
|
super().__init_subclass__(**kwargs)
|
||||||
cls.default_config = util.deep_merge_configs(
|
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
|
||||||
cls.Defaults.config, DEFAULT_CONFIG
|
|
||||||
)
|
|
||||||
cls.default_config["nlp"]["lang"] = cls.lang
|
cls.default_config["nlp"]["lang"] = cls.lang
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -532,6 +530,7 @@ class Language:
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
*,
|
*,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Create a pipeline component. Mostly used internally. To create and
|
"""Create a pipeline component. Mostly used internally. To create and
|
||||||
|
@ -542,6 +541,7 @@ class Language:
|
||||||
Defaults to factory name if not set.
|
Defaults to factory name if not set.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -568,7 +568,7 @@ class Language:
|
||||||
# This is unideal, but the alternative would mean you always need to
|
# This is unideal, but the alternative would mean you always need to
|
||||||
# specify the full config settings, which is not really viable.
|
# specify the full config settings, which is not really viable.
|
||||||
if pipe_meta.default_config:
|
if pipe_meta.default_config:
|
||||||
config = util.deep_merge_configs(config, pipe_meta.default_config)
|
config = Config(pipe_meta.default_config).merge(config)
|
||||||
# We need to create a top-level key because Thinc doesn't allow resolving
|
# We need to create a top-level key because Thinc doesn't allow resolving
|
||||||
# top-level references to registered functions. Also gives nicer errors.
|
# top-level references to registered functions. Also gives nicer errors.
|
||||||
# The name allows components to know their pipe name and use it in the
|
# The name allows components to know their pipe name and use it in the
|
||||||
|
@ -582,12 +582,14 @@ class Language:
|
||||||
cfg = {factory_name: config}
|
cfg = {factory_name: config}
|
||||||
# We're calling the internal _fill here to avoid constructing the
|
# We're calling the internal _fill here to avoid constructing the
|
||||||
# registered functions twice
|
# registered functions twice
|
||||||
# TODO: customize validation to make it more readable / relate it to
|
|
||||||
# pipeline component and why it failed, explain default config
|
|
||||||
resolved, filled = registry.resolve(cfg, validate=validate)
|
resolved, filled = registry.resolve(cfg, validate=validate)
|
||||||
filled = filled[factory_name]
|
filled = filled[factory_name]
|
||||||
filled["factory"] = factory_name
|
filled["factory"] = factory_name
|
||||||
filled.pop("@factories", None)
|
filled.pop("@factories", None)
|
||||||
|
# Merge the final filled config with the raw config (including non-
|
||||||
|
# interpolated variables)
|
||||||
|
if raw_config:
|
||||||
|
filled = filled.merge(raw_config)
|
||||||
self._pipe_configs[name] = filled
|
self._pipe_configs[name] = filled
|
||||||
return resolved[factory_name]
|
return resolved[factory_name]
|
||||||
|
|
||||||
|
@ -613,7 +615,10 @@ class Language:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
pipe = source.get_pipe(source_name)
|
pipe = source.get_pipe(source_name)
|
||||||
pipe_config = util.copy_config(source.config["components"][source_name])
|
# Make sure the source config is interpolated so we don't end up with
|
||||||
|
# orphaned variables in our final config
|
||||||
|
source_config = source.config.interpolate()
|
||||||
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
|
@ -628,6 +633,7 @@ class Language:
|
||||||
last: Optional[bool] = None,
|
last: Optional[bool] = None,
|
||||||
source: Optional["Language"] = None,
|
source: Optional["Language"] = None,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Add a component to the processing pipeline. Valid components are
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
|
@ -649,6 +655,7 @@ class Language:
|
||||||
component from.
|
component from.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -678,7 +685,11 @@ class Language:
|
||||||
lang_code=self.lang,
|
lang_code=self.lang,
|
||||||
)
|
)
|
||||||
pipe_component = self.create_pipe(
|
pipe_component = self.create_pipe(
|
||||||
factory_name, name=name, config=config, validate=validate,
|
factory_name,
|
||||||
|
name=name,
|
||||||
|
config=config,
|
||||||
|
raw_config=raw_config,
|
||||||
|
validate=validate,
|
||||||
)
|
)
|
||||||
pipe_index = self._get_pipe_index(before, after, first, last)
|
pipe_index = self._get_pipe_index(before, after, first, last)
|
||||||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||||||
|
@ -1379,7 +1390,7 @@ class Language:
|
||||||
DOCS: https://spacy.io/api/language#from_config
|
DOCS: https://spacy.io/api/language#from_config
|
||||||
"""
|
"""
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
config = util.deep_merge_configs(config, cls.default_config)
|
config = Config(cls.default_config).merge(config)
|
||||||
if "nlp" not in config:
|
if "nlp" not in config:
|
||||||
raise ValueError(Errors.E985.format(config=config))
|
raise ValueError(Errors.E985.format(config=config))
|
||||||
config_lang = config["nlp"]["lang"]
|
config_lang = config["nlp"]["lang"]
|
||||||
|
@ -1417,16 +1428,20 @@ class Language:
|
||||||
or lang_cls is not cls
|
or lang_cls is not cls
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
||||||
|
# Note that we don't load vectors here, instead they get loaded explicitly
|
||||||
|
# inside stuff like the spacy train function. If we loaded them here,
|
||||||
|
# then we would load them twice at runtime: once when we make from config,
|
||||||
|
# and then again when we load from disk.
|
||||||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
||||||
if after_creation is not None:
|
if after_creation is not None:
|
||||||
nlp = after_creation(nlp)
|
nlp = after_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
||||||
# Note that we don't load vectors here, instead they get loaded explicitly
|
# To create the components we need to use the final interpolated config
|
||||||
# inside stuff like the spacy train function. If we loaded them here,
|
# so all values are available (if component configs use variables).
|
||||||
# then we would load them twice at runtime: once when we make from config,
|
# Later we replace the component config with the raw config again.
|
||||||
# and then again when we load from disk.
|
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||||
pipeline = config.get("components", {})
|
pipeline = interpolated.get("components", {})
|
||||||
# If components are loaded from a source (existing models), we cache
|
# If components are loaded from a source (existing models), we cache
|
||||||
# them here so they're only loaded once
|
# them here so they're only loaded once
|
||||||
source_nlps = {}
|
source_nlps = {}
|
||||||
|
@ -1435,6 +1450,7 @@ class Language:
|
||||||
opts = ", ".join(pipeline.keys())
|
opts = ", ".join(pipeline.keys())
|
||||||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||||||
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
||||||
|
raw_config = Config(filled["components"][pipe_name])
|
||||||
if pipe_name not in disable:
|
if pipe_name not in disable:
|
||||||
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
||||||
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
||||||
|
@ -1444,7 +1460,11 @@ class Language:
|
||||||
# The pipe name (key in the config) here is the unique name
|
# The pipe name (key in the config) here is the unique name
|
||||||
# of the component, not necessarily the factory
|
# of the component, not necessarily the factory
|
||||||
nlp.add_pipe(
|
nlp.add_pipe(
|
||||||
factory, name=pipe_name, config=pipe_cfg, validate=validate,
|
factory,
|
||||||
|
name=pipe_name,
|
||||||
|
config=pipe_cfg,
|
||||||
|
validate=validate,
|
||||||
|
raw_config=raw_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model = pipe_cfg["source"]
|
model = pipe_cfg["source"]
|
||||||
|
|
|
@ -4,7 +4,7 @@ import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.util import registry, deep_merge_configs, load_model_from_config
|
from spacy.util import registry, load_model_from_config
|
||||||
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||||
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
||||||
|
|
||||||
|
@ -194,37 +194,6 @@ def test_serialize_parser():
|
||||||
assert upper.get_dim("nI") == 66
|
assert upper.get_dim("nI") == 66
|
||||||
|
|
||||||
|
|
||||||
def test_deep_merge_configs():
|
|
||||||
config = {"a": "hello", "b": {"c": "d"}}
|
|
||||||
defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 2
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"c": "d", "f": "g"}
|
|
||||||
config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "x", "foo": 1}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
# Test that leaving out the factory just adds to existing
|
|
||||||
config = {"a": "hello", "b": {"foo": 1}, "c": 100}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
|
|
||||||
|
|
||||||
def test_config_nlp_roundtrip():
|
def test_config_nlp_roundtrip():
|
||||||
"""Test that a config produced by the nlp object passes training config
|
"""Test that a config produced by the nlp object passes training config
|
||||||
validation."""
|
validation."""
|
||||||
|
@ -311,3 +280,22 @@ def test_config_overrides():
|
||||||
nlp = spacy.load(d)
|
nlp = spacy.load(d)
|
||||||
assert isinstance(nlp, English)
|
assert isinstance(nlp, English)
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_interpolation():
|
||||||
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
|
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||||
|
interpolated = config.interpolate()
|
||||||
|
assert interpolated["training"]["train_corpus"]["path"] == ""
|
||||||
|
nlp = English.from_config(config)
|
||||||
|
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||||
|
# Ensure that variables are preserved in nlp config
|
||||||
|
width = "${components.tok2vec.model:width}"
|
||||||
|
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
|
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
|
interpolated2 = nlp.config.interpolate()
|
||||||
|
assert interpolated2["training"]["train_corpus"]["path"] == ""
|
||||||
|
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
nlp2 = English.from_config(interpolated)
|
||||||
|
assert nlp2.config["training"]["train_corpus"]["path"] == ""
|
||||||
|
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
|
|
@ -5,6 +5,7 @@ from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.schemas import ProjectConfigSchema, validate
|
from spacy.schemas import ProjectConfigSchema, validate
|
||||||
from spacy.cli.pretrain import make_docs
|
from spacy.cli.pretrain import make_docs
|
||||||
|
from spacy.cli.init_config import init_config
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
|
|
||||||
|
|
||||||
|
@ -319,3 +320,13 @@ def test_parse_config_overrides(args, expected):
|
||||||
def test_parse_config_overrides_invalid(args):
|
def test_parse_config_overrides_invalid(args):
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
parse_config_overrides(args)
|
parse_config_overrides(args)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||||
|
def test_init_config(lang, pipeline, optimize):
|
||||||
|
# TODO: add more tests and also check for GPU with transformers
|
||||||
|
init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)
|
||||||
|
|
|
@ -264,11 +264,31 @@ def load_model_from_path(
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
config_path = model_path / "config.cfg"
|
config_path = model_path / "config.cfg"
|
||||||
|
nlp, _ = load_model_from_config_path(
|
||||||
|
config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
|
||||||
|
)
|
||||||
|
return nlp.from_disk(model_path, exclude=disable)
|
||||||
|
|
||||||
|
|
||||||
|
def load_model_from_config_path(
|
||||||
|
config_path: Union[str, Path],
|
||||||
|
*,
|
||||||
|
vocab: Union["Vocab", bool] = True,
|
||||||
|
disable: Iterable[str] = tuple(),
|
||||||
|
auto_fill: bool = False,
|
||||||
|
validate: bool = True,
|
||||||
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
|
interpolate: bool = False,
|
||||||
|
) -> Tuple["Language", Config]:
|
||||||
|
config_path = ensure_path(config_path)
|
||||||
if not config_path.exists() or not config_path.is_file():
|
if not config_path.exists() or not config_path.is_file():
|
||||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||||
config = Config().from_disk(config_path, overrides=dict_to_dot(config))
|
config = Config().from_disk(
|
||||||
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
|
config_path, overrides=overrides, interpolate=interpolate
|
||||||
return nlp.from_disk(model_path, exclude=disable)
|
)
|
||||||
|
return load_model_from_config(
|
||||||
|
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_config(
|
def load_model_from_config(
|
||||||
|
@ -923,45 +943,6 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
|
||||||
raise ValueError(Errors.E961.format(config=config)) from None
|
raise ValueError(Errors.E961.format(config=config)) from None
|
||||||
|
|
||||||
|
|
||||||
def deep_merge_configs(
|
|
||||||
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
|
|
||||||
) -> Config:
|
|
||||||
"""Deep merge two configs, a base config and its defaults. Ignores
|
|
||||||
references to registered functions to avoid filling in defaults that belong to a different registered function.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The config.
|
|
||||||
defaults (Dict[str, Any]): The config defaults.
|
|
||||||
RETURNS (Dict[str, Any]): The merged config.
|
|
||||||
"""
|
|
||||||
config = copy_config(config)
|
|
||||||
merged = _deep_merge_configs(config, defaults)
|
|
||||||
return Config(merged)
|
|
||||||
|
|
||||||
|
|
||||||
def _deep_merge_configs(
|
|
||||||
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
|
|
||||||
) -> Union[Dict[str, Any], Config]:
|
|
||||||
for key, value in defaults.items():
|
|
||||||
if isinstance(value, dict):
|
|
||||||
node = config.setdefault(key, {})
|
|
||||||
if not isinstance(node, dict):
|
|
||||||
continue
|
|
||||||
promises = [key for key in value if key.startswith("@")]
|
|
||||||
promise = promises[0] if promises else None
|
|
||||||
# We only update the block from defaults if it refers to the same
|
|
||||||
# registered function
|
|
||||||
if (
|
|
||||||
promise
|
|
||||||
and any(k.startswith("@") for k in node)
|
|
||||||
and (promise in node and node[promise] != value[promise])
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
defaults = _deep_merge_configs(node, value)
|
|
||||||
elif key not in config:
|
|
||||||
config[key] = value
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
|
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
|
||||||
"""Convert dot notation to a dict. For example: {"token.pos": True,
|
"""Convert dot notation to a dict. For example: {"token.pos": True,
|
||||||
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
|
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
|
||||||
|
|
|
@ -867,8 +867,10 @@ class JinjaToJS(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
with option(kwargs, use_python_bool_wrapper=False):
|
with option(kwargs, use_python_bool_wrapper=False):
|
||||||
if operand.op == "in":
|
if operand.op == "in" or operand.op == "notin":
|
||||||
# Special case for "in" operator
|
# Special case for "in" operator
|
||||||
|
if operand.op == "notin":
|
||||||
|
self.output.write("!")
|
||||||
self._process_node(operand.expr, **kwargs)
|
self._process_node(operand.expr, **kwargs)
|
||||||
self.output.write(".includes(")
|
self.output.write(".includes(")
|
||||||
self._process_node(node.expr, **kwargs)
|
self._process_node(node.expr, **kwargs)
|
||||||
|
|
|
@ -1,107 +0,0 @@
|
||||||
{# Template for "CPU" configs. The transformer will use a different template. #}
|
|
||||||
# This is an auto-generated partial config for training a model.
|
|
||||||
# To use it for training, auto-fill it with all default values.
|
|
||||||
# python -m spacy init config config.cfg --base base_config.cfg
|
|
||||||
[paths]
|
|
||||||
train = ""
|
|
||||||
dev = ""
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "{{ lang }}"
|
|
||||||
pipeline = {{ pipeline|safe }}
|
|
||||||
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
|
|
||||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.tok2vec]
|
|
||||||
factory = "tok2vec"
|
|
||||||
|
|
||||||
[components.tok2vec.model]
|
|
||||||
@architectures = "spacy.Tok2Vec.v1"
|
|
||||||
|
|
||||||
[components.tok2vec.model.embed]
|
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
|
||||||
also_embed_subwords = {{ true if has_letters else false }}
|
|
||||||
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
|
||||||
width = {{ 96 if optimize == "efficiency" else 256 }}
|
|
||||||
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
|
||||||
window_size = 1
|
|
||||||
maxout_pieces = 3
|
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
|
||||||
[components.tagger]
|
|
||||||
factory = "tagger"
|
|
||||||
|
|
||||||
[components.tagger.model]
|
|
||||||
@architectures = "spacy.Tagger.v1"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "parser" in components -%}
|
|
||||||
[components.parser]
|
|
||||||
factory = "parser"
|
|
||||||
|
|
||||||
[components.parser.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 8
|
|
||||||
hidden_width = 128
|
|
||||||
maxout_pieces = 3
|
|
||||||
use_upper = true
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "ner" in components -%}
|
|
||||||
[components.ner]
|
|
||||||
factory = "ner"
|
|
||||||
|
|
||||||
[components.ner.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 6
|
|
||||||
hidden_width = 64
|
|
||||||
maxout_pieces = 2
|
|
||||||
use_upper = true
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{% endif -%}
|
|
||||||
|
|
||||||
[training]
|
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:dev}
|
|
||||||
|
|
||||||
[training.score_weights]
|
|
||||||
{%- if "tagger" in components %}
|
|
||||||
tag_acc = {{ (1.0 / components|length)|round() }}
|
|
||||||
{%- endif -%}
|
|
||||||
{%- if "parser" in components %}
|
|
||||||
dep_uas = 0.0
|
|
||||||
dep_las = {{ (1.0 / components|length)|round() }}
|
|
||||||
sents_f = 0.0
|
|
||||||
{%- endif %}
|
|
||||||
{%- if "ner" in components %}
|
|
||||||
ents_f = {{ (1.0 / components|length)|round() }}
|
|
||||||
ents_p = 0.0
|
|
||||||
ents_r = 0.0
|
|
||||||
{%- endif -%}
|
|
|
@ -1,139 +0,0 @@
|
||||||
{# Template for "CPU" configs. The transformer will use a different template. #}
|
|
||||||
# This is an auto-generated partial config for training a model.
|
|
||||||
# To use it for training, auto-fill it with all default values.
|
|
||||||
# python -m spacy init config config.cfg --base base_config.cfg
|
|
||||||
[paths]
|
|
||||||
train = ""
|
|
||||||
dev = ""
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "{{ lang }}"
|
|
||||||
pipeline = {{ pipeline|safe }}
|
|
||||||
vectors = null
|
|
||||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.transformer]
|
|
||||||
factory = "transformer"
|
|
||||||
|
|
||||||
[components.transformer.model]
|
|
||||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
|
||||||
{#- name = {{ transformer_info["name"] }} #}
|
|
||||||
name = "roberta-base"
|
|
||||||
tokenizer_config = {"use_fast": true}
|
|
||||||
|
|
||||||
[components.transformer.model.get_spans]
|
|
||||||
@span_getters = "strided_spans.v1"
|
|
||||||
window = 128
|
|
||||||
stride = 96
|
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
|
||||||
[components.tagger]
|
|
||||||
factory = "tagger"
|
|
||||||
|
|
||||||
[components.tagger.model]
|
|
||||||
@architectures = "spacy.Tagger.v1"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "parser" in components -%}
|
|
||||||
[components.parser]
|
|
||||||
factory = "parser"
|
|
||||||
|
|
||||||
[components.parser.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 8
|
|
||||||
hidden_width = 128
|
|
||||||
maxout_pieces = 3
|
|
||||||
use_upper = false
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "ner" in components -%}
|
|
||||||
[components.ner]
|
|
||||||
factory = "ner"
|
|
||||||
|
|
||||||
[components.ner.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 3
|
|
||||||
hidden_width = 64
|
|
||||||
maxout_pieces = 2
|
|
||||||
use_upper = false
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{% endif -%}
|
|
||||||
|
|
||||||
[training]
|
|
||||||
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #}
|
|
||||||
accumulate_gradient = 3
|
|
||||||
|
|
||||||
[training.optimizer]
|
|
||||||
@optimizers = "Adam.v1"
|
|
||||||
beta1 = 0.9
|
|
||||||
beta2 = 0.999
|
|
||||||
L2_is_weight_decay = true
|
|
||||||
L2 = 0.01
|
|
||||||
grad_clip = 1.0
|
|
||||||
use_averages = false
|
|
||||||
eps = 1e-8
|
|
||||||
|
|
||||||
[training.optimizer.learn_rate]
|
|
||||||
@schedules = "warmup_linear.v1"
|
|
||||||
warmup_steps = 250
|
|
||||||
total_steps = 20000
|
|
||||||
initial_rate = 5e-5
|
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 500
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:dev}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 0
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.batcher]
|
|
||||||
@batchers = "batch_by_padded.v1"
|
|
||||||
discard_oversize = true
|
|
||||||
batch_size = 2000
|
|
||||||
|
|
||||||
[training.score_weights]
|
|
||||||
{%- if "tagger" in components %}
|
|
||||||
tag_acc = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
{%- endif -%}
|
|
||||||
{%- if "parser" in components %}
|
|
||||||
dep_uas = 0.0
|
|
||||||
dep_las = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
sents_f = 0.0
|
|
||||||
{%- endif %}
|
|
||||||
{%- if "ner" in components %}
|
|
||||||
ents_f = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
ents_p = 0.0
|
|
||||||
ents_r = 0.0
|
|
||||||
{%- endif -%}
|
|
|
@ -1 +1 @@
|
||||||
python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
|
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user