spaCy/spacy/cli/init_pipeline.py
2023-02-08 16:18:48 +01:00

153 lines
5.6 KiB
Python

from typing import Optional, Literal, List
import logging
from pathlib import Path
from wasabi import msg
import srsly
from radicli import Arg, ExistingPath, ExistingFilePathOrDash, ExistingFilePath
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
@cli.subcommand(
    "init",
    "vectors",
    # fmt: off
    lang=Arg(help="The language of the nlp object to create"),
    vectors_loc=Arg(help="Vectors file in Word2Vec format"),
    output_dir=Arg(help="Pipeline output directory"),
    prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"),
    truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode=Arg("--mode", "-m", help="Vectors mode: default or floret"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
    # fmt: on
)
def init_vectors_cli(
    lang: str,
    vectors_loc: ExistingPath,
    output_dir: Path,
    prune: int = -1,
    truncate: int = 0,
    mode: Literal["default", "floret"] = "default",
    verbose: bool = False,
    jsonl_loc: Optional[Path] = None,
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Throw error for renamed language codes in v4
    _handle_renamed_language_codes(lang)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    # Optionally pre-populate lexeme attributes (legacy JSONL format)
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, mode=mode)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Set lexeme attributes on the vocab of ``nlp`` from a JSONL file.

    Mostly used for backwards-compatibility and may be removed in the future.
    """
    for attrs in srsly.read_jsonl(jsonl_loc):
        # Entries carrying a "settings" key are metadata, not lexemes
        if "settings" in attrs:
            continue
        nlp.vocab[attrs["orth"]].set_attrs(**attrs)
@cli.subcommand_with_extra(
    "init",
    "nlp",
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the prepared data"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_pipeline_cli(
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Initialize a pipeline."""
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Any leftover CLI args are treated as config overrides
    config_overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=config_overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    nlp.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")
@cli.subcommand_with_extra(
    "init",
    "labels",
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the labels"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_labels_cli(
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    # Any leftover CLI args are treated as config overrides
    config_overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=config_overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    _init_labels(nlp, output_path)
def _init_labels(nlp, output_path):
    # Write one JSON file per pipeline component that exposes label data;
    # report components without labels so the user knows nothing was written.
    for name, component in nlp.pipeline:
        label_data = getattr(component, "label_data", None)
        if label_data is None:
            msg.info(f"No label data found for component '{name}'")
            continue
        output_file = output_path / f"{name}.json"
        srsly.write_json(output_file, label_data)
        msg.good(f"Saving label data for component '{name}' to {output_file}")