Update Thinc and include section order

This commit is contained in:
Ines Montani 2020-08-14 14:06:22 +02:00
parent 8736bfc052
commit 67cc39af7f
10 changed files with 53 additions and 52 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a25,<8.0.0a30", "thinc>=8.0.0a26,<8.0.0a30",
"blis>=0.4.0,<0.5.0", "blis>=0.4.0,<0.5.0",
"pytokenizations", "pytokenizations",
"smart_open>=2.0.0,<3.0.0" "smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a25,<8.0.0a30 thinc>=8.0.0a26,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1 ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a25,<8.0.0a30 thinc>=8.0.0a26,<8.0.0a30
install_requires = install_requires =
# Our libraries # Our libraries
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a25,<8.0.0a30 thinc>=8.0.0a26,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0 wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0 srsly>=2.1.0,<3.0.0

View File

@ -5,7 +5,6 @@ import sys
import srsly import srsly
from wasabi import Printer, MESSAGES, msg, diff_strings from wasabi import Printer, MESSAGES, msg, diff_strings
import typer import typer
from thinc.api import Config
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, get_sourced_components from ._util import import_code, debug_cli, get_sourced_components
@ -49,7 +48,7 @@ def debug_config_cli(
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code(code_path)
with show_validation_error(config_path): with show_validation_error(config_path):
config = Config().from_disk(config_path, overrides=overrides, interpolate=False) config = util.load_config(config_path, overrides=overrides)
try: try:
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill) nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
except ValueError as e: except ValueError as e:
@ -134,9 +133,7 @@ def debug_data(
if not config_path.exists(): if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1) msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path): with show_validation_error(config_path):
cfg = Config().from_disk( cfg = util.load_config(config_path, overrides=config_overrides)
config_path, overrides=config_overrides, interpolate=False
)
nlp, config = util.load_model_from_config(cfg) nlp, config = util.load_model_from_config(cfg)
# Use original config here, not resolved version # Use original config here, not resolved version
sourced_components = get_sourced_components(cfg) sourced_components = get_sourced_components(cfg)

View File

@ -49,10 +49,9 @@ def debug_model_cli(
} }
config_overrides = parse_config_overrides(ctx.args) config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path): with show_validation_error(config_path):
config = util.load_config(config_path, overrides=config_overrides)
try: try:
nlp, config = util.load_model_from_config_path( nlp, config = util.load_model_from_config(config_path)
config_path, overrides=config_overrides
)
except ValueError as e: except ValueError as e:
msg.fail(str(e), exits=1) msg.fail(str(e), exits=1)
seed = config["pretraining"]["seed"] seed = config["pretraining"]["seed"]

View File

@ -1,12 +1,11 @@
from typing import Optional, List from typing import Optional, List
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from thinc.api import Config
from wasabi import Printer from wasabi import Printer
import srsly import srsly
import re import re
from ..util import load_model_from_config, get_lang_class from .. import util
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
@ -50,7 +49,7 @@ def init_config(
from jinja2 import Template from jinja2 import Template
except ImportError: except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1) msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
lang_defaults = get_lang_class(lang).Defaults lang_defaults = util.get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True) has_letters = lang_defaults.writing_system.get("has_letters", True)
has_transformer = False # TODO: check this somehow has_transformer = False # TODO: check this somehow
if has_transformer: if has_transformer:
@ -80,9 +79,9 @@ def init_config(
msg.text(f"- {label}: {value}") msg.text(f"- {label}: {value}")
with show_validation_error(hint_init=False): with show_validation_error(hint_init=False):
with msg.loading("Auto-filling config..."): with msg.loading("Auto-filling config..."):
config = Config().from_str(base_template, interpolate=False) config = util.load_config_from_str(base_template)
try: try:
nlp, _ = load_model_from_config(config, auto_fill=True) nlp, _ = util.load_model_from_config(config, auto_fill=True)
except ValueError as e: except ValueError as e:
msg.fail(str(e), exits=1) msg.fail(str(e), exits=1)
msg.good("Auto-filled config with all values") msg.good("Auto-filled config with all values")

View File

@ -88,9 +88,8 @@ def pretrain(
msg.info("Using CPU") msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path): with show_validation_error(config_path):
nlp, config = util.load_model_from_config_path( config = util.load_config(config_path, overrides=config_overrides)
config_path, overrides=config_overrides nlp, config = util.load_model_from_config(config)
)
# TODO: validate that [pretraining] block exists # TODO: validate that [pretraining] block exists
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()

View File

@ -75,9 +75,7 @@ def train(
msg.info("Using CPU") msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}") msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path): with show_validation_error(config_path):
config = Config().from_disk( config = util.load_config(config_path, overrides=config_overrides)
config_path, overrides=config_overrides, interpolate=False
)
if config.get("training", {}).get("seed") is not None: if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"]) fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions # Use original config here before it's resolved to functions

View File

@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .gold import Example, validate_examples from .gold import Example, validate_examples
from .scorer import Scorer from .scorer import Scorer
from .util import create_default_optimizer, registry from .util import create_default_optimizer, registry
from .util import SimpleFrozenDict, combine_score_weights from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_INFIXES
@ -36,7 +36,7 @@ from . import about
# This is the base config with all settings (training etc.) # This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False) DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
class BaseDefaults: class BaseDefaults:
@ -45,7 +45,7 @@ class BaseDefaults:
Language.Defaults. Language.Defaults.
""" """
config: Config = Config() config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
@ -583,7 +583,7 @@ class Language:
# We're calling the internal _fill here to avoid constructing the # We're calling the internal _fill here to avoid constructing the
# registered functions twice # registered functions twice
resolved, filled = registry.resolve(cfg, validate=validate) resolved, filled = registry.resolve(cfg, validate=validate)
filled = filled[factory_name] filled = Config(filled[factory_name])
filled["factory"] = factory_name filled["factory"] = factory_name
filled.pop("@factories", None) filled.pop("@factories", None)
# Merge the final filled config with the raw config (including non- # Merge the final filled config with the raw config (including non-
@ -1390,7 +1390,9 @@ class Language:
DOCS: https://spacy.io/api/language#from_config DOCS: https://spacy.io/api/language#from_config
""" """
if auto_fill: if auto_fill:
config = Config(cls.default_config).merge(config) config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
).merge(config)
if "nlp" not in config: if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config)) raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"]["lang"] config_lang = config["nlp"]["lang"]

View File

@ -58,6 +58,12 @@ _PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max OOV_RANK = numpy.iinfo(numpy.uint64).max
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
# fmt: on
class registry(thinc.registry): class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True) languages = catalogue.create("spacy", "languages", entry_points=True)
@ -264,33 +270,11 @@ def load_model_from_path(
if not meta: if not meta:
meta = get_model_meta(model_path) meta = get_model_meta(model_path)
config_path = model_path / "config.cfg" config_path = model_path / "config.cfg"
nlp, _ = load_model_from_config_path( config = load_config(config_path, overrides=dict_to_dot(config))
config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
)
return nlp.from_disk(model_path, exclude=disable) return nlp.from_disk(model_path, exclude=disable)
def load_model_from_config_path(
    config_path: Union[str, Path],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = tuple(),
    auto_fill: bool = False,
    validate: bool = True,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Tuple["Language", Config]:
    """Read a config file from disk and hand it to load_model_from_config.

    config_path: Location of the config file; passed through ensure_path first.
    vocab, disable, auto_fill, validate: Forwarded unchanged to
        load_model_from_config.
    overrides, interpolate: Forwarded unchanged to Config.from_disk.
    RETURNS: Whatever load_model_from_config returns for the loaded config.
    RAISES: IOError (Errors.E053) if the path does not exist or is not a file.
    """
    cfg_file = ensure_path(config_path)
    # Fail early with a descriptive error instead of letting the parser trip
    # over a missing path or a directory.
    if not (cfg_file.exists() and cfg_file.is_file()):
        raise IOError(Errors.E053.format(path=cfg_file, name="config.cfg"))
    loaded = Config().from_disk(
        cfg_file, overrides=overrides, interpolate=interpolate
    )
    model_kwargs = dict(
        vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate
    )
    return load_model_from_config(loaded, **model_kwargs)
def load_model_from_config( def load_model_from_config(
config: Union[Dict[str, Any], Config], config: Union[Dict[str, Any], Config],
*, *,
@ -337,6 +321,29 @@ def load_model_from_init_py(
) )
def load_config(
    path: Union[str, Path],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Config:
    """Read a config file from disk, applying the default section order.

    path: Location of the config file; passed through ensure_path first.
    overrides: Override values forwarded to Config.from_disk.
    interpolate: Forwarded to Config.from_disk as its interpolate flag.
    RETURNS: The Config produced by Config.from_disk.
    RAISES: IOError (Errors.E053) if the path does not exist or is not a file.
    """
    cfg_file = ensure_path(path)
    # Reject anything that is missing or is not a file before trying to parse.
    if not (cfg_file.exists() and cfg_file.is_file()):
        raise IOError(Errors.E053.format(path=cfg_file, name="config.cfg"))
    loader = Config(section_order=CONFIG_SECTION_ORDER)
    return loader.from_disk(cfg_file, overrides=overrides, interpolate=interpolate)
def load_config_from_str(
    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
) -> Config:
    """Load a full config from a string, applying the default section order.

    text: The config contents to parse.
    overrides: Override values forwarded to Config.from_str.
    interpolate: Forwarded to Config.from_str as its interpolate flag.
    RETURNS: The Config produced by Config.from_str.
    """
    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
        text, overrides=overrides, interpolate=interpolate,
    )
def get_installed_models() -> List[str]: def get_installed_models() -> List[str]:
"""List all model packages currently installed in the environment. """List all model packages currently installed in the environment.