Update Thinc and include section order

This commit is contained in:
Ines Montani 2020-08-14 14:06:22 +02:00
parent 8736bfc052
commit 67cc39af7f
10 changed files with 53 additions and 52 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a25,<8.0.0a30",
"thinc>=8.0.0a26,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a25,<8.0.0a30
thinc>=8.0.0a26,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a25,<8.0.0a30
thinc>=8.0.0a26,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a25,<8.0.0a30
thinc>=8.0.0a26,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -5,7 +5,6 @@ import sys
import srsly
from wasabi import Printer, MESSAGES, msg, diff_strings
import typer
from thinc.api import Config
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, get_sourced_components
@ -49,7 +48,7 @@ def debug_config_cli(
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
config = util.load_config(config_path, overrides=overrides)
try:
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
except ValueError as e:
@ -134,9 +133,7 @@ def debug_data(
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path):
cfg = Config().from_disk(
config_path, overrides=config_overrides, interpolate=False
)
cfg = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(cfg)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)

View File

@ -49,10 +49,9 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=config_overrides)
try:
nlp, config = util.load_model_from_config_path(
config_path, overrides=config_overrides
)
nlp, config = util.load_model_from_config(config_path)
except ValueError as e:
msg.fail(str(e), exits=1)
seed = config["pretraining"]["seed"]

View File

@ -1,12 +1,11 @@
from typing import Optional, List
from enum import Enum
from pathlib import Path
from thinc.api import Config
from wasabi import Printer
import srsly
import re
from ..util import load_model_from_config, get_lang_class
from .. import util
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
@ -50,7 +49,7 @@ def init_config(
from jinja2 import Template
except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
lang_defaults = get_lang_class(lang).Defaults
lang_defaults = util.get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True)
has_transformer = False # TODO: check this somehow
if has_transformer:
@ -80,9 +79,9 @@ def init_config(
msg.text(f"- {label}: {value}")
with show_validation_error(hint_init=False):
with msg.loading("Auto-filling config..."):
config = Config().from_str(base_template, interpolate=False)
config = util.load_config_from_str(base_template)
try:
nlp, _ = load_model_from_config(config, auto_fill=True)
nlp, _ = util.load_model_from_config(config, auto_fill=True)
except ValueError as e:
msg.fail(str(e), exits=1)
msg.good("Auto-filled config with all values")

View File

@ -88,9 +88,8 @@ def pretrain(
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
nlp, config = util.load_model_from_config_path(
config_path, overrides=config_overrides
)
config = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(config)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
output_dir.mkdir()

View File

@ -75,9 +75,7 @@ def train(
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path):
config = Config().from_disk(
config_path, overrides=config_overrides, interpolate=False
)
config = util.load_config(config_path, overrides=config_overrides)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions

View File

@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .gold import Example, validate_examples
from .scorer import Scorer
from .util import create_default_optimizer, registry
from .util import SimpleFrozenDict, combine_score_weights
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@ -36,7 +36,7 @@ from . import about
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
class BaseDefaults:
@ -45,7 +45,7 @@ class BaseDefaults:
Language.Defaults.
"""
config: Config = Config()
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
@ -583,7 +583,7 @@ class Language:
# We're calling the internal _fill here to avoid constructing the
# registered functions twice
resolved, filled = registry.resolve(cfg, validate=validate)
filled = filled[factory_name]
filled = Config(filled[factory_name])
filled["factory"] = factory_name
filled.pop("@factories", None)
# Merge the final filled config with the raw config (including non-
@ -1390,7 +1390,9 @@ class Language:
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
config = Config(cls.default_config).merge(config)
config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"]

View File

@ -58,6 +58,12 @@ _PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
# fmt: on
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
@ -264,33 +270,11 @@ def load_model_from_path(
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
nlp, _ = load_model_from_config_path(
config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
)
config = load_config(config_path, overrides=dict_to_dot(config))
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
return nlp.from_disk(model_path, exclude=disable)
def load_model_from_config_path(
    config_path: Union[str, Path],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = tuple(),
    auto_fill: bool = False,
    validate: bool = True,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Tuple["Language", Config]:
    """Read a config file from disk and build a model from it.

    The path is normalized with ensure_path and must point to an existing
    file. The file is read into a Config, and the remaining settings are
    handed on unchanged to load_model_from_config.

    config_path (Union[str, Path]): Location of the config file.
    vocab, disable, auto_fill, validate: Forwarded as-is to
        load_model_from_config.
    overrides (Dict[str, Any]): Forwarded to Config.from_disk.
    interpolate (bool): Forwarded to Config.from_disk.
    RETURNS (Tuple[Language, Config]): The result of load_model_from_config.
    RAISES (IOError): If the path doesn't exist or isn't a file.
    """
    location = ensure_path(config_path)
    # Reject both missing paths and paths that aren't regular files.
    if not (location.exists() and location.is_file()):
        raise IOError(Errors.E053.format(path=location, name="config.cfg"))
    loaded = Config().from_disk(
        location, overrides=overrides, interpolate=interpolate
    )
    return load_model_from_config(
        loaded,
        vocab=vocab,
        disable=disable,
        auto_fill=auto_fill,
        validate=validate,
    )
def load_model_from_config(
config: Union[Dict[str, Any], Config],
*,
@ -337,6 +321,29 @@ def load_model_from_init_py(
)
def load_config(
    path: Union[str, Path],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Config:
    """Load a config file. Takes care of path validation and section order.

    path (Union[str, Path]): Location of the config file. Normalized with
        ensure_path before use.
    overrides (Dict[str, Any]): Forwarded to Config.from_disk.
    interpolate (bool): Forwarded to Config.from_disk.
    RETURNS (Config): The config read from disk, created with
        CONFIG_SECTION_ORDER as its section order.
    RAISES (IOError): If the path doesn't exist or isn't a file.
    """
    location = ensure_path(path)
    # Reject both missing paths and paths that aren't regular files.
    if not (location.exists() and location.is_file()):
        raise IOError(Errors.E053.format(path=location, name="config.cfg"))
    config = Config(section_order=CONFIG_SECTION_ORDER)
    return config.from_disk(location, overrides=overrides, interpolate=interpolate)
def load_config_from_str(
    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
) -> Config:
    """Load a full config from a string.

    text (str): The config, as a string.
    overrides (Dict[str, Any]): Forwarded to Config.from_str.
    interpolate (bool): Forwarded to Config.from_str.
    RETURNS (Config): The parsed config, created with CONFIG_SECTION_ORDER
        as its section order.
    """
    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
        text, overrides=overrides, interpolate=interpolate
    )
def get_installed_models() -> List[str]:
"""List all model packages currently installed in the environment.