diff --git a/pyproject.toml b/pyproject.toml index 14d2c1e8e..896ad339f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a35,<8.0.0a40", + "thinc>=8.0.0a36,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index b3a95dcff..2746ecc37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a35,<8.0.0a40 +thinc>=8.0.0a36,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index b080d4330..33dabc91f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a35,<8.0.0a40 + thinc>=8.0.0a36,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a35,<8.0.0a40 + thinc>=8.0.0a36,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 21a4e54ce..506380b0b 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -10,7 +10,7 @@ from click import NoSuchOption from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager -from thinc.config import Config, ConfigValidationError +from thinc.api import Config, ConfigValidationError from configparser import InterpolationError import os @@ -226,24 +226,28 @@ def get_checksum(path: Union[Path, str]) -> str: def show_validation_error( file_path: Optional[Union[str, Path]] = None, *, - title: str = "Config validation error", + title: Optional[str] = None, + desc: str = "", + show_config: Optional[bool] = None, hint_fill: bool = True, ): """Helper to show custom config validation errors on the CLI. file_path (str / Path): Optional file path of config file, used in hints. - title (str): Title of the custom formatted error. + title (str): Override title of custom formatted error. + desc (str): Override description of custom formatted error. + show_config (bool): Whether to output the config the error refers to. hint_fill (bool): Show hint about filling config. """ try: yield - except (ConfigValidationError, InterpolationError) as e: - msg.fail(title, spaced=True) - # TODO: This is kinda hacky and we should probably provide a better - # helper for this in Thinc - err_text = str(e).replace("Config validation error", "").strip() - print(err_text) - if hint_fill and "field required" in err_text: + except ConfigValidationError as e: + title = title if title is not None else e.title + # Re-generate a new error object with overrides + err = e.from_error(e, title="", desc=desc, show_config=show_config) + msg.fail(title) + print(err.text.strip()) + if hint_fill and "value_error.missing" in err.error_types: config_path = file_path if file_path is not None else "config.cfg" msg.text( "If your config contains missing values, you can run the 'init " @@ -252,6 +256,8 @@ def show_validation_error( ) print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n") sys.exit(1) + except InterpolationError as e: + msg.fail("Config validation error", e, exits=1) def import_code(code_path: Optional[Union[Path, str]]) -> None: diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index d07a0bb2d..c0c7de7ef 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -1,8 +1,8 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table -from thinc.api import Config -from thinc.config import VARIABLE_RE, ConfigValidationError +from thinc.api import Config, ConfigValidationError +from thinc.config import VARIABLE_RE import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides @@ -115,4 +115,4 @@ def check_section_refs(config: Config, fields: List[str]) -> None: msg = f"not a valid section reference: {value}" errors.append({"loc": field.split("."), "msg": msg}) if errors: - raise ConfigValidationError(config, errors) + raise ConfigValidationError(config=config, errors=errors) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e64ee532b..5fc4ff035 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,7 +92,7 @@ def train( nlp, config = util.load_model_from_config(raw_config) util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) if config["training"]["vectors"] is not None: - util.load_vectors_into_model(nlp, config["training"]["vectors"]) + add_vectors(nlp, config["training"]["vectors"]) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] @@ -198,6 +198,19 @@ def train( msg.good(f"Saved pipeline to output directory {final_model_path}") +def add_vectors(nlp: Language, vectors: str) -> None: + title = f"Config validation error for vectors {vectors}" + desc = ( + "This typically means that there's a problem in the config.cfg included " + "with the packaged vectors. Make sure that the vectors package you're " + "loading is compatible with the current version of spaCy." + ) + with show_validation_error( + title=title, desc=desc, hint_fill=False, show_config=False + ): + util.load_vectors_into_model(nlp, vectors) + + def create_train_batches(iterator, batcher, max_epochs: int): epoch = 0 examples = list(iterator) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 1dc2a10dd..4243ebcfb 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -82,7 +82,7 @@ class AttributeRuler(Pipe): matches = self.matcher(doc, allow_missing=True) # Sort by the attribute ID, so that later rules have precendence matches = [ - (_parse_key(self.vocab.strings[m_id]), m_id, s, e) + (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches ] matches.sort() @@ -184,7 +184,7 @@ class AttributeRuler(Pipe): """ # We need to make a string here, because otherwise the ID we pass back # will be interpreted as the hash of a string, rather than an ordinal. - key = _make_key(len(self.attrs)) + key = str(len(self.attrs)) self.matcher.add(self.vocab.strings.add(key), patterns) self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) @@ -209,7 +209,7 @@ class AttributeRuler(Pipe): all_patterns = [] for i in range(len(self.attrs)): p = {} - p["patterns"] = self.matcher.get(_make_key(i))[1] + p["patterns"] = self.matcher.get(str(i))[1] p["attrs"] = self._attrs_unnormed[i] p["index"] = self.indices[i] all_patterns.append(p) @@ -313,12 +313,6 @@ class AttributeRuler(Pipe): return self -def _make_key(n_attr): - return f"attr_rule_{n_attr}" - -def _parse_key(key): - return int(key.rsplit("_", 1)[1]) - def _split_morph_attrs(attrs): """Split entries from a tag map or morph rules dict into to two dicts, one diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 70e753ba2..741eb0ace 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest from spacy.lang.zh import Chinese, _get_pkuseg_trie_data -from thinc.config import ConfigValidationError +from thinc.api import ConfigValidationError # fmt: off diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 07648024c..cac394913 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -4,8 +4,7 @@ from spacy.lang.en import English from spacy.lang.de import German from spacy.tokens import Doc from spacy.util import registry, SimpleFrozenDict, combine_score_weights -from thinc.api import Model, Linear -from thinc.config import ConfigValidationError +from thinc.api import Model, Linear, ConfigValidationError from pydantic import StrictInt, StrictStr from ..util import make_tempdir diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ec7544456..1a5be4bec 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -1,5 +1,5 @@ import pytest -from thinc.config import Config, ConfigValidationError +from thinc.api import Config, ConfigValidationError import spacy from spacy.lang.en import English from spacy.lang.de import German diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a66ab8de1..caf4ea890 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -8,7 +8,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR from spacy.cli.debug_config import check_section_refs -from thinc.config import ConfigValidationError, Config +from thinc.api import ConfigValidationError, Config import srsly import os