Merge branch 'develop' into fix/train-config-interpolation

This commit is contained in:
Ines Montani 2020-09-26 15:32:14 +02:00
commit b4486d747d
11 changed files with 44 additions and 32 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a35,<8.0.0a40",
"thinc>=8.0.0a36,<8.0.0a40",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a35,<8.0.0a40
thinc>=8.0.0a36,<8.0.0a40
blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a35,<8.0.0a40
thinc>=8.0.0a36,<8.0.0a40
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a35,<8.0.0a40
thinc>=8.0.0a36,<8.0.0a40
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -10,7 +10,7 @@ from click import NoSuchOption
from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.config import Config, ConfigValidationError
from thinc.api import Config, ConfigValidationError
from configparser import InterpolationError
import os
@ -226,24 +226,28 @@ def get_checksum(path: Union[Path, str]) -> str:
def show_validation_error(
file_path: Optional[Union[str, Path]] = None,
*,
title: str = "Config validation error",
title: Optional[str] = None,
desc: str = "",
show_config: Optional[bool] = None,
hint_fill: bool = True,
):
"""Helper to show custom config validation errors on the CLI.
file_path (str / Path): Optional file path of config file, used in hints.
title (str): Title of the custom formatted error.
title (str): Override title of custom formatted error.
desc (str): Override description of custom formatted error.
show_config (bool): Whether to output the config the error refers to.
hint_fill (bool): Show hint about filling config.
"""
try:
yield
except (ConfigValidationError, InterpolationError) as e:
msg.fail(title, spaced=True)
# TODO: This is kinda hacky and we should probably provide a better
# helper for this in Thinc
err_text = str(e).replace("Config validation error", "").strip()
print(err_text)
if hint_fill and "field required" in err_text:
except ConfigValidationError as e:
title = title if title is not None else e.title
# Re-generate a new error object with overrides
err = e.from_error(e, title="", desc=desc, show_config=show_config)
msg.fail(title)
print(err.text.strip())
if hint_fill and "value_error.missing" in err.error_types:
config_path = file_path if file_path is not None else "config.cfg"
msg.text(
"If your config contains missing values, you can run the 'init "
@ -252,6 +256,8 @@ def show_validation_error(
)
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
sys.exit(1)
except InterpolationError as e:
msg.fail("Config validation error", e, exits=1)
def import_code(code_path: Optional[Union[Path, str]]) -> None:

View File

@ -1,8 +1,8 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE, ConfigValidationError
from thinc.api import Config, ConfigValidationError
from thinc.config import VARIABLE_RE
import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@ -115,4 +115,4 @@ def check_section_refs(config: Config, fields: List[str]) -> None:
msg = f"not a valid section reference: {value}"
errors.append({"loc": field.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config, errors)
raise ConfigValidationError(config=config, errors=errors)

View File

@ -92,7 +92,7 @@ def train(
nlp, config = util.load_model_from_config(raw_config)
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"])
add_vectors(nlp, config["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
T_cfg = config["training"]
optimizer = T_cfg["optimizer"]
@ -198,6 +198,19 @@ def train(
msg.good(f"Saved pipeline to output directory {final_model_path}")
def add_vectors(nlp: Language, vectors: str) -> None:
title = f"Config validation error for vectors {vectors}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
with show_validation_error(
title=title, desc=desc, hint_fill=False, show_config=False
):
util.load_vectors_into_model(nlp, vectors)
def create_train_batches(iterator, batcher, max_epochs: int):
epoch = 0
examples = list(iterator)

View File

@ -82,7 +82,7 @@ class AttributeRuler(Pipe):
matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence
matches = [
(_parse_key(self.vocab.strings[m_id]), m_id, s, e)
(int(self.vocab.strings[m_id]), m_id, s, e)
for m_id, s, e in matches
]
matches.sort()
@ -184,7 +184,7 @@ class AttributeRuler(Pipe):
"""
# We need to make a string here, because otherwise the ID we pass back
# will be interpreted as the hash of a string, rather than an ordinal.
key = _make_key(len(self.attrs))
key = str(len(self.attrs))
self.matcher.add(self.vocab.strings.add(key), patterns)
self._attrs_unnormed.append(attrs)
attrs = normalize_token_attrs(self.vocab, attrs)
@ -209,7 +209,7 @@ class AttributeRuler(Pipe):
all_patterns = []
for i in range(len(self.attrs)):
p = {}
p["patterns"] = self.matcher.get(_make_key(i))[1]
p["patterns"] = self.matcher.get(str(i))[1]
p["attrs"] = self._attrs_unnormed[i]
p["index"] = self.indices[i]
all_patterns.append(p)
@ -313,12 +313,6 @@ class AttributeRuler(Pipe):
return self
def _make_key(n_attr):
return f"attr_rule_{n_attr}"
def _parse_key(key):
return int(key.rsplit("_", 1)[1])
def _split_morph_attrs(attrs):
"""Split entries from a tag map or morph rules dict into to two dicts, one

View File

@ -1,6 +1,6 @@
import pytest
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
from thinc.config import ConfigValidationError
from thinc.api import ConfigValidationError
# fmt: off

View File

@ -4,8 +4,7 @@ from spacy.lang.en import English
from spacy.lang.de import German
from spacy.tokens import Doc
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
from thinc.api import Model, Linear
from thinc.config import ConfigValidationError
from thinc.api import Model, Linear, ConfigValidationError
from pydantic import StrictInt, StrictStr
from ..util import make_tempdir

View File

@ -1,5 +1,5 @@
import pytest
from thinc.config import Config, ConfigValidationError
from thinc.api import Config, ConfigValidationError
import spacy
from spacy.lang.en import English
from spacy.lang.de import German

View File

@ -8,7 +8,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from spacy.cli.debug_config import check_section_refs
from thinc.config import ConfigValidationError, Config
from thinc.api import ConfigValidationError, Config
import srsly
import os