mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 02:48:04 +03:00
Update Thinc and include section order
This commit is contained in:
parent
8736bfc052
commit
67cc39af7f
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a25,<8.0.0a30",
|
||||
"thinc>=8.0.0a26,<8.0.0a30",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations",
|
||||
"smart_open>=2.0.0,<3.0.0"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a25,<8.0.0a30
|
||||
thinc>=8.0.0a26,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a25,<8.0.0a30
|
||||
thinc>=8.0.0a26,<8.0.0a30
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a25,<8.0.0a30
|
||||
thinc>=8.0.0a26,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.7.1,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
|
|
|
@ -5,7 +5,6 @@ import sys
|
|||
import srsly
|
||||
from wasabi import Printer, MESSAGES, msg, diff_strings
|
||||
import typer
|
||||
from thinc.api import Config
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli, get_sourced_components
|
||||
|
@ -49,7 +48,7 @@ def debug_config_cli(
|
|||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
with show_validation_error(config_path):
|
||||
config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
try:
|
||||
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
||||
except ValueError as e:
|
||||
|
@ -134,9 +133,7 @@ def debug_data(
|
|||
if not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exists=1)
|
||||
with show_validation_error(config_path):
|
||||
cfg = Config().from_disk(
|
||||
config_path, overrides=config_overrides, interpolate=False
|
||||
)
|
||||
cfg = util.load_config(config_path, overrides=config_overrides)
|
||||
nlp, config = util.load_model_from_config(cfg)
|
||||
# Use original config here, not resolved version
|
||||
sourced_components = get_sourced_components(cfg)
|
||||
|
|
|
@ -49,10 +49,9 @@ def debug_model_cli(
|
|||
}
|
||||
config_overrides = parse_config_overrides(ctx.args)
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=config_overrides)
|
||||
try:
|
||||
nlp, config = util.load_model_from_config_path(
|
||||
config_path, overrides=config_overrides
|
||||
)
|
||||
nlp, config = util.load_model_from_config(config_path)
|
||||
except ValueError as e:
|
||||
msg.fail(str(e), exits=1)
|
||||
seed = config["pretraining"]["seed"]
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
from typing import Optional, List
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
import re
|
||||
|
||||
from ..util import load_model_from_config, get_lang_class
|
||||
from .. import util
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
|
||||
|
||||
|
@ -50,7 +49,7 @@ def init_config(
|
|||
from jinja2 import Template
|
||||
except ImportError:
|
||||
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
||||
lang_defaults = get_lang_class(lang).Defaults
|
||||
lang_defaults = util.get_lang_class(lang).Defaults
|
||||
has_letters = lang_defaults.writing_system.get("has_letters", True)
|
||||
has_transformer = False # TODO: check this somehow
|
||||
if has_transformer:
|
||||
|
@ -80,9 +79,9 @@ def init_config(
|
|||
msg.text(f"- {label}: {value}")
|
||||
with show_validation_error(hint_init=False):
|
||||
with msg.loading("Auto-filling config..."):
|
||||
config = Config().from_str(base_template, interpolate=False)
|
||||
config = util.load_config_from_str(base_template)
|
||||
try:
|
||||
nlp, _ = load_model_from_config(config, auto_fill=True)
|
||||
nlp, _ = util.load_model_from_config(config, auto_fill=True)
|
||||
except ValueError as e:
|
||||
msg.fail(str(e), exits=1)
|
||||
msg.good("Auto-filled config with all values")
|
||||
|
|
|
@ -88,9 +88,8 @@ def pretrain(
|
|||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
with show_validation_error(config_path):
|
||||
nlp, config = util.load_model_from_config_path(
|
||||
config_path, overrides=config_overrides
|
||||
)
|
||||
config = util.load_config(config_path, overrides=config_overrides)
|
||||
nlp, config = util.load_model_from_config(config)
|
||||
# TODO: validate that [pretraining] block exists
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
|
|
@ -75,9 +75,7 @@ def train(
|
|||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config and nlp from: {config_path}")
|
||||
with show_validation_error(config_path):
|
||||
config = Config().from_disk(
|
||||
config_path, overrides=config_overrides, interpolate=False
|
||||
)
|
||||
config = util.load_config(config_path, overrides=config_overrides)
|
||||
if config.get("training", {}).get("seed") is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
# Use original config here before it's resolved to functions
|
||||
|
|
|
@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
|||
from .gold import Example, validate_examples
|
||||
from .scorer import Scorer
|
||||
from .util import create_default_optimizer, registry
|
||||
from .util import SimpleFrozenDict, combine_score_weights
|
||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .lang.punctuation import TOKENIZER_INFIXES
|
||||
|
@ -36,7 +36,7 @@ from . import about
|
|||
|
||||
# This is the base config with all settings (training etc.)
|
||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||||
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
|
||||
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
|
||||
|
||||
|
||||
class BaseDefaults:
|
||||
|
@ -45,7 +45,7 @@ class BaseDefaults:
|
|||
Language.Defaults.
|
||||
"""
|
||||
|
||||
config: Config = Config()
|
||||
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
|
||||
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
||||
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
||||
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
||||
|
@ -583,7 +583,7 @@ class Language:
|
|||
# We're calling the internal _fill here to avoid constructing the
|
||||
# registered functions twice
|
||||
resolved, filled = registry.resolve(cfg, validate=validate)
|
||||
filled = filled[factory_name]
|
||||
filled = Config(filled[factory_name])
|
||||
filled["factory"] = factory_name
|
||||
filled.pop("@factories", None)
|
||||
# Merge the final filled config with the raw config (including non-
|
||||
|
@ -1390,7 +1390,9 @@ class Language:
|
|||
DOCS: https://spacy.io/api/language#from_config
|
||||
"""
|
||||
if auto_fill:
|
||||
config = Config(cls.default_config).merge(config)
|
||||
config = Config(
|
||||
cls.default_config, section_order=CONFIG_SECTION_ORDER
|
||||
).merge(config)
|
||||
if "nlp" not in config:
|
||||
raise ValueError(Errors.E985.format(config=config))
|
||||
config_lang = config["nlp"]["lang"]
|
||||
|
|
|
@ -58,6 +58,12 @@ _PRINT_ENV = False
|
|||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config.cfg. Not all sections need to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
# fmt: off
|
||||
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
class registry(thinc.registry):
|
||||
languages = catalogue.create("spacy", "languages", entry_points=True)
|
||||
|
@ -264,33 +270,11 @@ def load_model_from_path(
|
|||
if not meta:
|
||||
meta = get_model_meta(model_path)
|
||||
config_path = model_path / "config.cfg"
|
||||
nlp, _ = load_model_from_config_path(
|
||||
config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
|
||||
)
|
||||
config = load_config(config_path, overrides=dict_to_dot(config))
|
||||
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
|
||||
return nlp.from_disk(model_path, exclude=disable)
|
||||
|
||||
|
||||
def load_model_from_config_path(
|
||||
config_path: Union[str, Path],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
auto_fill: bool = False,
|
||||
validate: bool = True,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
interpolate: bool = False,
|
||||
) -> Tuple["Language", Config]:
|
||||
config_path = ensure_path(config_path)
|
||||
if not config_path.exists() or not config_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||
config = Config().from_disk(
|
||||
config_path, overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
return load_model_from_config(
|
||||
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
|
||||
)
|
||||
|
||||
|
||||
def load_model_from_config(
|
||||
config: Union[Dict[str, Any], Config],
|
||||
*,
|
||||
|
@ -337,6 +321,29 @@ def load_model_from_init_py(
|
|||
)
|
||||
|
||||
|
||||
def load_config(
|
||||
path: Union[str, Path],
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
interpolate: bool = False,
|
||||
) -> Config:
|
||||
"""Load a config file. Takes care of path validation and section order."""
|
||||
config_path = ensure_path(path)
|
||||
if not config_path.exists() or not config_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||
return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
|
||||
config_path, overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
|
||||
|
||||
def load_config_from_str(
|
||||
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
|
||||
):
|
||||
"""Load a full config from a string."""
|
||||
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
||||
text, overrides=overrides, interpolate=interpolate,
|
||||
)
|
||||
|
||||
|
||||
def get_installed_models() -> List[str]:
|
||||
"""List all model packages currently installed in the environment.
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user