Merge branch 'develop' into fix/train-config-interpolation

This commit is contained in:
Ines Montani 2020-09-26 15:32:14 +02:00
commit b4486d747d
11 changed files with 44 additions and 32 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a35,<8.0.0a40", "thinc>=8.0.0a36,<8.0.0a40",
"blis>=0.4.0,<0.5.0", "blis>=0.4.0,<0.5.0",
"pytokenizations", "pytokenizations",
"pathy" "pathy"

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a35,<8.0.0a40 thinc>=8.0.0a36,<8.0.0a40
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0 ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a35,<8.0.0a40 thinc>=8.0.0a36,<8.0.0a40
install_requires = install_requires =
# Our libraries # Our libraries
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a35,<8.0.0a40 thinc>=8.0.0a36,<8.0.0a40
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0 wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0 srsly>=2.1.0,<3.0.0

View File

@ -10,7 +10,7 @@ from click import NoSuchOption
from click.parser import split_arg_string from click.parser import split_arg_string
from typer.main import get_command from typer.main import get_command
from contextlib import contextmanager from contextlib import contextmanager
from thinc.config import Config, ConfigValidationError from thinc.api import Config, ConfigValidationError
from configparser import InterpolationError from configparser import InterpolationError
import os import os
@ -226,24 +226,28 @@ def get_checksum(path: Union[Path, str]) -> str:
def show_validation_error( def show_validation_error(
file_path: Optional[Union[str, Path]] = None, file_path: Optional[Union[str, Path]] = None,
*, *,
title: str = "Config validation error", title: Optional[str] = None,
desc: str = "",
show_config: Optional[bool] = None,
hint_fill: bool = True, hint_fill: bool = True,
): ):
"""Helper to show custom config validation errors on the CLI. """Helper to show custom config validation errors on the CLI.
file_path (str / Path): Optional file path of config file, used in hints. file_path (str / Path): Optional file path of config file, used in hints.
title (str): Title of the custom formatted error. title (str): Override title of custom formatted error.
desc (str): Override description of custom formatted error.
show_config (bool): Whether to output the config the error refers to.
hint_fill (bool): Show hint about filling config. hint_fill (bool): Show hint about filling config.
""" """
try: try:
yield yield
except (ConfigValidationError, InterpolationError) as e: except ConfigValidationError as e:
msg.fail(title, spaced=True) title = title if title is not None else e.title
# TODO: This is kinda hacky and we should probably provide a better # Re-generate a new error object with overrides
# helper for this in Thinc err = e.from_error(e, title="", desc=desc, show_config=show_config)
err_text = str(e).replace("Config validation error", "").strip() msg.fail(title)
print(err_text) print(err.text.strip())
if hint_fill and "field required" in err_text: if hint_fill and "value_error.missing" in err.error_types:
config_path = file_path if file_path is not None else "config.cfg" config_path = file_path if file_path is not None else "config.cfg"
msg.text( msg.text(
"If your config contains missing values, you can run the 'init " "If your config contains missing values, you can run the 'init "
@ -252,6 +256,8 @@ def show_validation_error(
) )
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n") print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
sys.exit(1) sys.exit(1)
except InterpolationError as e:
msg.fail("Config validation error", e, exits=1)
def import_code(code_path: Optional[Union[Path, str]]) -> None: def import_code(code_path: Optional[Union[Path, str]]) -> None:

View File

@ -1,8 +1,8 @@
from typing import Optional, Dict, Any, Union, List from typing import Optional, Dict, Any, Union, List
from pathlib import Path from pathlib import Path
from wasabi import msg, table from wasabi import msg, table
from thinc.api import Config from thinc.api import Config, ConfigValidationError
from thinc.config import VARIABLE_RE, ConfigValidationError from thinc.config import VARIABLE_RE
import typer import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@ -115,4 +115,4 @@ def check_section_refs(config: Config, fields: List[str]) -> None:
msg = f"not a valid section reference: {value}" msg = f"not a valid section reference: {value}"
errors.append({"loc": field.split("."), "msg": msg}) errors.append({"loc": field.split("."), "msg": msg})
if errors: if errors:
raise ConfigValidationError(config, errors) raise ConfigValidationError(config=config, errors=errors)

View File

@ -92,7 +92,7 @@ def train(
nlp, config = util.load_model_from_config(raw_config) nlp, config = util.load_model_from_config(raw_config)
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
if config["training"]["vectors"] is not None: if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"]) add_vectors(nlp, config["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
T_cfg = config["training"] T_cfg = config["training"]
optimizer = T_cfg["optimizer"] optimizer = T_cfg["optimizer"]
@ -198,6 +198,19 @@ def train(
msg.good(f"Saved pipeline to output directory {final_model_path}") msg.good(f"Saved pipeline to output directory {final_model_path}")
def add_vectors(nlp: Language, vectors: str) -> None:
title = f"Config validation error for vectors {vectors}"
desc = (
"This typically means that there's a problem in the config.cfg included "
"with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy."
)
with show_validation_error(
title=title, desc=desc, hint_fill=False, show_config=False
):
util.load_vectors_into_model(nlp, vectors)
def create_train_batches(iterator, batcher, max_epochs: int): def create_train_batches(iterator, batcher, max_epochs: int):
epoch = 0 epoch = 0
examples = list(iterator) examples = list(iterator)

View File

@ -82,7 +82,7 @@ class AttributeRuler(Pipe):
matches = self.matcher(doc, allow_missing=True) matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence # Sort by the attribute ID, so that later rules have precedence
matches = [ matches = [
(_parse_key(self.vocab.strings[m_id]), m_id, s, e) (int(self.vocab.strings[m_id]), m_id, s, e)
for m_id, s, e in matches for m_id, s, e in matches
] ]
matches.sort() matches.sort()
@ -184,7 +184,7 @@ class AttributeRuler(Pipe):
""" """
# We need to make a string here, because otherwise the ID we pass back # We need to make a string here, because otherwise the ID we pass back
# will be interpreted as the hash of a string, rather than an ordinal. # will be interpreted as the hash of a string, rather than an ordinal.
key = _make_key(len(self.attrs)) key = str(len(self.attrs))
self.matcher.add(self.vocab.strings.add(key), patterns) self.matcher.add(self.vocab.strings.add(key), patterns)
self._attrs_unnormed.append(attrs) self._attrs_unnormed.append(attrs)
attrs = normalize_token_attrs(self.vocab, attrs) attrs = normalize_token_attrs(self.vocab, attrs)
@ -209,7 +209,7 @@ class AttributeRuler(Pipe):
all_patterns = [] all_patterns = []
for i in range(len(self.attrs)): for i in range(len(self.attrs)):
p = {} p = {}
p["patterns"] = self.matcher.get(_make_key(i))[1] p["patterns"] = self.matcher.get(str(i))[1]
p["attrs"] = self._attrs_unnormed[i] p["attrs"] = self._attrs_unnormed[i]
p["index"] = self.indices[i] p["index"] = self.indices[i]
all_patterns.append(p) all_patterns.append(p)
@ -313,12 +313,6 @@ class AttributeRuler(Pipe):
return self return self
def _make_key(n_attr):
return f"attr_rule_{n_attr}"
def _parse_key(key):
return int(key.rsplit("_", 1)[1])
def _split_morph_attrs(attrs): def _split_morph_attrs(attrs):
"""Split entries from a tag map or morph rules dict into to two dicts, one """Split entries from a tag map or morph rules dict into to two dicts, one

View File

@ -1,6 +1,6 @@
import pytest import pytest
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
from thinc.config import ConfigValidationError from thinc.api import ConfigValidationError
# fmt: off # fmt: off

View File

@ -4,8 +4,7 @@ from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.util import registry, SimpleFrozenDict, combine_score_weights from spacy.util import registry, SimpleFrozenDict, combine_score_weights
from thinc.api import Model, Linear from thinc.api import Model, Linear, ConfigValidationError
from thinc.config import ConfigValidationError
from pydantic import StrictInt, StrictStr from pydantic import StrictInt, StrictStr
from ..util import make_tempdir from ..util import make_tempdir

View File

@ -1,5 +1,5 @@
import pytest import pytest
from thinc.config import Config, ConfigValidationError from thinc.api import Config, ConfigValidationError
import spacy import spacy
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German

View File

@ -8,7 +8,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from spacy.cli.debug_config import check_section_refs from spacy.cli.debug_config import check_section_refs
from thinc.config import ConfigValidationError, Config from thinc.api import ConfigValidationError, Config
import srsly import srsly
import os import os