import logging from pathlib import Path from typing import Optional import srsly import typer from wasabi import msg from .. import util from ..language import Language from ..training.initialize import convert_vectors, init_nlp from ._util import ( Arg, Opt, _handle_renamed_language_codes, import_code, init_cli, parse_config_overrides, setup_gpu, show_validation_error, ) @init_cli.command("vectors") def init_vectors_cli( # fmt: off lang: str = Arg(..., help="The language of the nlp object to create"), vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True), output_dir: Path = Arg(..., help="Pipeline output directory"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"), # fmt: on ): """Convert word vectors for use with spaCy. Will export an nlp object that you can use in the [initialize] block of your config to initialize a model with vectors. """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) # Throw error for renamed language codes in v4 _handle_renamed_language_codes(lang) msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: update_lexemes(nlp, jsonl_loc) convert_vectors( nlp, vectors_loc, truncate=truncate, prune=prune, mode=mode, attr=attr, ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") nlp.to_disk(output_dir) msg.good( "Saved nlp object with vectors to output directory. You can now use the " "path to it in your config as the 'vectors' setting in [initialize].", output_dir.resolve(), ) def update_lexemes(nlp: Language, jsonl_loc: Path) -> None: # Mostly used for backwards-compatibility and may be removed in the future lex_attrs = srsly.read_jsonl(jsonl_loc) for attrs in lex_attrs: if "settings" in attrs: continue lexeme = nlp.vocab[attrs["orth"]] lexeme.set_attrs(**attrs) @init_cli.command( "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, hidden=True, ) def init_pipeline_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Path = Arg(..., help="Output directory for the prepared data"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): if verbose: util.logger.setLevel(logging.DEBUG) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): nlp = init_nlp(config, use_gpu=use_gpu) nlp.to_disk(output_path) msg.good(f"Saved initialized pipeline to {output_path}") @init_cli.command( "labels", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) def init_labels_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Path = Arg(..., help="Output directory for the labels"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): """Generate JSON files for the labels in the data. This helps speed up the training process, since spaCy won't have to preprocess the data to extract the labels.""" if verbose: util.logger.setLevel(logging.DEBUG) if not output_path.exists(): output_path.mkdir(parents=True) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): nlp = init_nlp(config, use_gpu=use_gpu) _init_labels(nlp, output_path) def _init_labels(nlp, output_path): for name, component in nlp.pipeline: if getattr(component, "label_data", None) is not None: output_file = output_path / f"{name}.json" srsly.write_json(output_file, component.label_data) msg.good(f"Saving label data for component '{name}' to {output_file}") else: msg.info(f"No label data found for component '{name}'")