mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' into fix/train-config-interpolation
This commit is contained in:
commit
b4486d747d
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a35,<8.0.0a40",
|
"thinc>=8.0.0a36,<8.0.0a40",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"pathy"
|
"pathy"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a36,<8.0.0a40
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets==0.2.0a0
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a36,<8.0.0a40
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a35,<8.0.0a40
|
thinc>=8.0.0a36,<8.0.0a40
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
|
|
@ -10,7 +10,7 @@ from click import NoSuchOption
|
||||||
from click.parser import split_arg_string
|
from click.parser import split_arg_string
|
||||||
from typer.main import get_command
|
from typer.main import get_command
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from thinc.config import Config, ConfigValidationError
|
from thinc.api import Config, ConfigValidationError
|
||||||
from configparser import InterpolationError
|
from configparser import InterpolationError
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -226,24 +226,28 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
def show_validation_error(
|
def show_validation_error(
|
||||||
file_path: Optional[Union[str, Path]] = None,
|
file_path: Optional[Union[str, Path]] = None,
|
||||||
*,
|
*,
|
||||||
title: str = "Config validation error",
|
title: Optional[str] = None,
|
||||||
|
desc: str = "",
|
||||||
|
show_config: Optional[bool] = None,
|
||||||
hint_fill: bool = True,
|
hint_fill: bool = True,
|
||||||
):
|
):
|
||||||
"""Helper to show custom config validation errors on the CLI.
|
"""Helper to show custom config validation errors on the CLI.
|
||||||
|
|
||||||
file_path (str / Path): Optional file path of config file, used in hints.
|
file_path (str / Path): Optional file path of config file, used in hints.
|
||||||
title (str): Title of the custom formatted error.
|
title (str): Override title of custom formatted error.
|
||||||
|
desc (str): Override description of custom formatted error.
|
||||||
|
show_config (bool): Whether to output the config the error refers to.
|
||||||
hint_fill (bool): Show hint about filling config.
|
hint_fill (bool): Show hint about filling config.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
except (ConfigValidationError, InterpolationError) as e:
|
except ConfigValidationError as e:
|
||||||
msg.fail(title, spaced=True)
|
title = title if title is not None else e.title
|
||||||
# TODO: This is kinda hacky and we should probably provide a better
|
# Re-generate a new error object with overrides
|
||||||
# helper for this in Thinc
|
err = e.from_error(e, title="", desc=desc, show_config=show_config)
|
||||||
err_text = str(e).replace("Config validation error", "").strip()
|
msg.fail(title)
|
||||||
print(err_text)
|
print(err.text.strip())
|
||||||
if hint_fill and "field required" in err_text:
|
if hint_fill and "value_error.missing" in err.error_types:
|
||||||
config_path = file_path if file_path is not None else "config.cfg"
|
config_path = file_path if file_path is not None else "config.cfg"
|
||||||
msg.text(
|
msg.text(
|
||||||
"If your config contains missing values, you can run the 'init "
|
"If your config contains missing values, you can run the 'init "
|
||||||
|
@ -252,6 +256,8 @@ def show_validation_error(
|
||||||
)
|
)
|
||||||
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
|
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
except InterpolationError as e:
|
||||||
|
msg.fail("Config validation error", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from typing import Optional, Dict, Any, Union, List
|
from typing import Optional, Dict, Any, Union, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, table
|
from wasabi import msg, table
|
||||||
from thinc.api import Config
|
from thinc.api import Config, ConfigValidationError
|
||||||
from thinc.config import VARIABLE_RE, ConfigValidationError
|
from thinc.config import VARIABLE_RE
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
|
@ -115,4 +115,4 @@ def check_section_refs(config: Config, fields: List[str]) -> None:
|
||||||
msg = f"not a valid section reference: {value}"
|
msg = f"not a valid section reference: {value}"
|
||||||
errors.append({"loc": field.split("."), "msg": msg})
|
errors.append({"loc": field.split("."), "msg": msg})
|
||||||
if errors:
|
if errors:
|
||||||
raise ConfigValidationError(config, errors)
|
raise ConfigValidationError(config=config, errors=errors)
|
||||||
|
|
|
@ -92,7 +92,7 @@ def train(
|
||||||
nlp, config = util.load_model_from_config(raw_config)
|
nlp, config = util.load_model_from_config(raw_config)
|
||||||
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
|
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
|
||||||
if config["training"]["vectors"] is not None:
|
if config["training"]["vectors"] is not None:
|
||||||
util.load_vectors_into_model(nlp, config["training"]["vectors"])
|
add_vectors(nlp, config["training"]["vectors"])
|
||||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
||||||
T_cfg = config["training"]
|
T_cfg = config["training"]
|
||||||
optimizer = T_cfg["optimizer"]
|
optimizer = T_cfg["optimizer"]
|
||||||
|
@ -198,6 +198,19 @@ def train(
|
||||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def add_vectors(nlp: Language, vectors: str) -> None:
|
||||||
|
title = f"Config validation error for vectors {vectors}"
|
||||||
|
desc = (
|
||||||
|
"This typically means that there's a problem in the config.cfg included "
|
||||||
|
"with the packaged vectors. Make sure that the vectors package you're "
|
||||||
|
"loading is compatible with the current version of spaCy."
|
||||||
|
)
|
||||||
|
with show_validation_error(
|
||||||
|
title=title, desc=desc, hint_fill=False, show_config=False
|
||||||
|
):
|
||||||
|
util.load_vectors_into_model(nlp, vectors)
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(iterator, batcher, max_epochs: int):
|
def create_train_batches(iterator, batcher, max_epochs: int):
|
||||||
epoch = 0
|
epoch = 0
|
||||||
examples = list(iterator)
|
examples = list(iterator)
|
||||||
|
|
|
@ -82,7 +82,7 @@ class AttributeRuler(Pipe):
|
||||||
matches = self.matcher(doc, allow_missing=True)
|
matches = self.matcher(doc, allow_missing=True)
|
||||||
# Sort by the attribute ID, so that later rules have precendence
|
# Sort by the attribute ID, so that later rules have precendence
|
||||||
matches = [
|
matches = [
|
||||||
(_parse_key(self.vocab.strings[m_id]), m_id, s, e)
|
(int(self.vocab.strings[m_id]), m_id, s, e)
|
||||||
for m_id, s, e in matches
|
for m_id, s, e in matches
|
||||||
]
|
]
|
||||||
matches.sort()
|
matches.sort()
|
||||||
|
@ -184,7 +184,7 @@ class AttributeRuler(Pipe):
|
||||||
"""
|
"""
|
||||||
# We need to make a string here, because otherwise the ID we pass back
|
# We need to make a string here, because otherwise the ID we pass back
|
||||||
# will be interpreted as the hash of a string, rather than an ordinal.
|
# will be interpreted as the hash of a string, rather than an ordinal.
|
||||||
key = _make_key(len(self.attrs))
|
key = str(len(self.attrs))
|
||||||
self.matcher.add(self.vocab.strings.add(key), patterns)
|
self.matcher.add(self.vocab.strings.add(key), patterns)
|
||||||
self._attrs_unnormed.append(attrs)
|
self._attrs_unnormed.append(attrs)
|
||||||
attrs = normalize_token_attrs(self.vocab, attrs)
|
attrs = normalize_token_attrs(self.vocab, attrs)
|
||||||
|
@ -209,7 +209,7 @@ class AttributeRuler(Pipe):
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
for i in range(len(self.attrs)):
|
for i in range(len(self.attrs)):
|
||||||
p = {}
|
p = {}
|
||||||
p["patterns"] = self.matcher.get(_make_key(i))[1]
|
p["patterns"] = self.matcher.get(str(i))[1]
|
||||||
p["attrs"] = self._attrs_unnormed[i]
|
p["attrs"] = self._attrs_unnormed[i]
|
||||||
p["index"] = self.indices[i]
|
p["index"] = self.indices[i]
|
||||||
all_patterns.append(p)
|
all_patterns.append(p)
|
||||||
|
@ -313,12 +313,6 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _make_key(n_attr):
|
|
||||||
return f"attr_rule_{n_attr}"
|
|
||||||
|
|
||||||
def _parse_key(key):
|
|
||||||
return int(key.rsplit("_", 1)[1])
|
|
||||||
|
|
||||||
|
|
||||||
def _split_morph_attrs(attrs):
|
def _split_morph_attrs(attrs):
|
||||||
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
|
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
|
||||||
from thinc.config import ConfigValidationError
|
from thinc.api import ConfigValidationError
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
|
|
@ -4,8 +4,7 @@ from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
||||||
from thinc.api import Model, Linear
|
from thinc.api import Model, Linear, ConfigValidationError
|
||||||
from thinc.config import ConfigValidationError
|
|
||||||
from pydantic import StrictInt, StrictStr
|
from pydantic import StrictInt, StrictStr
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
from thinc.config import Config, ConfigValidationError
|
from thinc.api import Config, ConfigValidationError
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||||
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
||||||
from spacy.cli.debug_config import check_section_refs
|
from spacy.cli.debug_config import check_section_refs
|
||||||
from thinc.config import ConfigValidationError, Config
|
from thinc.api import ConfigValidationError, Config
|
||||||
import srsly
|
import srsly
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user