Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00

Update CLI and add [initialize] block

This commit is contained in:
parent d5155376fd
commit e44a7519cd
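
In short: init_pipeline now resolves a new [initialize] block (validated by the new ConfigSchemaInit schema) and reads vocabulary data, lookups, raw text and pretrained tok2vec weights from [initialize.vocab] instead of [training]. As a hedged illustration only (not part of the diff itself), a user config might fill the new block as below; the paths are hypothetical placeholders, and the actual defaults added to default_config.cfg appear further down in this diff.

[initialize]
tokenizer = {}
components = {}

[initialize.vocab]
lookups = null
vectors = null
# hypothetical path to a JSONL file with lexeme attributes
data = "corpus/vocab_data.jsonl"
# hypothetical output of the 'pretrain' command, loaded into the tok2vec layer
init_tok2vec = "pretraining/tok2vec_model.bin"
raw_text = null

Which component and layer receive the pretrained weights is still controlled by pretraining.component and pretraining.layer, as handled in add_tok2vec_weights below.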

@@ -98,7 +98,7 @@ universal = false
 formats = gztar
 
 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =

@@ -459,24 +459,3 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
-
-
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    # TODO: separate checks from loading
-    raw_text = ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
-    weights_data = None
-    init_tok2vec = ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data

@@ -8,12 +8,12 @@ import srsly
 
 from .. import util
 from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
 from ..language import Language
 from ..lookups import Lookups
 from ..errors import Errors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components, load_from_paths
+from ._util import import_code, get_sourced_components
 
 
 DEFAULT_OOV_PROB = -20

@@ -67,14 +67,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error():
-        nlp = util.load_model_from_config(raw_config)
+        nlp = util.load_model_from_config(raw_config, auto_fill=True)
     msg.good("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
-    # TODO: move lookups to [initialize], add vocab data
-    init_vocab(nlp, lookups=T["lookups"])
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    msg.good("Created vocabulary")
     if T["vectors"] is not None:
         add_vectors(nlp, T["vectors"])

@@ -98,22 +99,19 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights({"training": T, "pretraining": P}, nlp)
+        add_tok2vec_weights(nlp, P, I)
     # TODO: this should be handled better?
     nlp = before_to_disk(nlp)
     return nlp
 
 
 def init_vocab(
-    nlp: Language,
-    *,
-    vocab_data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
+    nlp: Language, *, data: Optional[Path] = None, lookups: Optional[Lookups] = None,
 ) -> Language:
     if lookups:
         nlp.vocab.lookups = lookups
         msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(vocab_data)
+    data_path = util.ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
         for lexeme in nlp.vocab:

@@ -131,11 +129,29 @@ def init_vocab(
     msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
 
 
-def add_tok2vec_weights(config: Config, nlp: Language) -> None:
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    weights_data = load_from_paths(config)
+    P = pretrain_config
+    I = init_config
+    raw_text = util.ensure_path(I["vocab"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(raw_text))
+    weights_data = None
+    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+            err = "Need initialize.vectors if pretraining.objective.type is vectors"
+            msg.fail(err, exits=1)
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
     if weights_data is not None:
-        tok2vec_component = config["pretraining"]["component"]
+        tok2vec_component = P["component"]
         if tok2vec_component is None:
             msg.fail(
                 f"To use pretrained tok2vec weights, [pretraining.component] "

@@ -143,9 +159,8 @@ def add_tok2vec_weights(config: Config, nlp: Language) -> None:
                 exits=1,
             )
         layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = config["pretraining"]["layer"]
-        if tok2vec_layer:
-            layer = layer.get_ref(tok2vec_layer)
+        if P["layer"]:
+            layer = layer.get_ref(P["layer"])
         layer.from_bytes(weights_data)
         msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
 

@@ -14,7 +14,6 @@ from .init_pipeline import init_pipeline, must_initialize
 from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ._util import load_from_paths  # noqa: F401 (needed for Ray extension for now)
 from ..language import Language
 from .. import util
 from ..training.example import Example

@@ -381,3 +380,26 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
+
+
+# TODO: this is currently imported by the ray extension and not used otherwise
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
+    import srsly
+    # TODO: separate checks from loading
+    raw_text = util.ensure_path(config["training"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
+    tag_map = {}
+    morph_rules = {}
+    weights_data = None
+    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    return raw_text, tag_map, morph_rules, weights_data

@@ -108,3 +108,15 @@ grad_clip = 1.0
 use_averages = false
 eps = 1e-8
 learn_rate = 0.001
+
+[initialize]
+tokenizer = {}
+components = {}
+
+[initialize.vocab]
+data = null
+lookups = null
+vectors = null
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+raw_text = ${paths.raw}

@@ -273,22 +273,37 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaInitVocab(BaseModel):
+    # fmt: off
+    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
+    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
+class ConfigSchemaInit(BaseModel):
+    vocab: ConfigSchemaInitVocab
+    tokenizer: Any
+    components: Dict[str, Any]
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
     pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
-
-    @root_validator(allow_reuse=True)
-    def validate_config(cls, values):
-        """Perform additional validation for settings with dependencies."""
-        pt = values.get("pretraining")
-        if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
-            if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
-                err = "Need nlp.vectors if pretraining.objective.type is vectors"
-                raise ValueError(err)
-        return values
+    initialize: ConfigSchemaInit
 
     class Config:
         extra = "allow"

@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 # fmt: on
 