spaCy/spacy/cli/init_pipeline.py

import logging
from pathlib import Path
from typing import Optional

import srsly
import typer
from wasabi import msg

from .. import util
from ..language import Language
from ..training.initialize import convert_vectors, init_nlp
from ._util import (
    Arg,
    Opt,
    import_code,
    init_cli,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
)


@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    # NOTE: the docstring above is left untouched on purpose -- the CLI
    # framework uses it as the command's help text.
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    else:
        util.logger.setLevel(logging.INFO)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    # Look up the language class first, then instantiate a blank pipeline.
    lang_cls = util.get_lang_class(lang)
    nlp = lang_cls()
    # The lexemes file is optional (hidden CLI flag); only apply it if given.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    # Settings forwarded as keyword arguments to the vectors conversion.
    vector_settings = {
        "truncate": truncate,
        "prune": prune,
        "name": name,
        "mode": mode,
    }
    convert_vectors(nlp, vectors_loc, **vector_settings)
    n_vectors = len(nlp.vocab.vectors)
    msg.good(f"Successfully converted {n_vectors} vectors")
    nlp.to_disk(output_dir)
    saved_title = (
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize]."
    )
    # The resolved output path is passed as the second argument to msg.good.
    msg.good(saved_title, output_dir.resolve())


def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Set lexeme attributes on the vocab of `nlp` from a JSONL file.

    Each entry read from `jsonl_loc` is handled as follows: entries that
    contain a "settings" key are ignored; for every other entry, the lexeme
    is looked up in `nlp.vocab` via the entry's "orth" value and all of the
    entry's key/value pairs are passed to the lexeme's `set_attrs`.

    nlp (Language): The nlp object whose vocab is updated in place.
    jsonl_loc (Path): Location of the JSONL-formatted attributes file.
    """
    # Mostly used for backwards-compatibility and may be removed in the future
    for entry in srsly.read_jsonl(jsonl_loc):
        if "settings" not in entry:
            nlp.vocab[entry["orth"]].set_attrs(**entry)

@init_cli.command(
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    # Hidden command: load a config, initialize the nlp object from it and
    # save the result to output_path. No docstring is added here because the
    # CLI framework would display it as help text.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Extra (unknown) command-line arguments are parsed as config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    # User code is imported and the GPU set up before the config is loaded.
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        loaded_config = util.load_config(config_path, overrides=config_overrides)
    with show_validation_error(hint_fill=False):
        pipeline = init_nlp(loaded_config, use_gpu=use_gpu)
    pipeline.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")


@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Create the output directory (and any missing parents) in a single call.
    # exist_ok=True avoids the check-then-create race of testing exists()
    # first: the call no longer fails if the directory appears in between.
    output_path.mkdir(parents=True, exist_ok=True)
    # Extra (unknown) command-line arguments are parsed as config overrides.
    overrides = parse_config_overrides(ctx.args)
    # User code is imported and the GPU set up before the config is loaded.
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    # Write one JSON file per pipeline component that exposes label data.
    _init_labels(nlp, output_path)


def _init_labels(nlp, output_path):
    """Write the label data of each pipeline component to a JSON file.

    Iterates over the (name, component) pairs in `nlp.pipeline`. If a
    component has a `label_data` attribute that is not None, it is written to
    `<output_path>/<name>.json`; otherwise an info message is printed for
    that component.

    nlp: The nlp object whose pipeline components are inspected.
    output_path: Directory the JSON files are written to; combined with the
        file name via the `/` operator.
    """
    for pipe_name, pipe in nlp.pipeline:
        # Components without a (non-None) label_data attribute are reported
        # and skipped.
        if getattr(pipe, "label_data", None) is None:
            msg.info(f"No label data found for component '{pipe_name}'")
            continue
        labels_file = output_path / f"{pipe_name}.json"
        srsly.write_json(labels_file, pipe.label_data)
        msg.good(f"Saving label data for component '{pipe_name}' to {labels_file}")
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`import logging`
			`from pathlib import Path`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 18:48:41 +03:00			`from typing import Optional`

Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`import srsly`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 18:48:41 +03:00			`import typer`
			`from wasabi import msg`
Add init_pipeline file 2020-09-28 10:47:34 +03:00
			`from .. import util`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 17:44:17 +03:00			`from ..language import Language`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 18:48:41 +03:00			`from ..training.initialize import convert_vectors, init_nlp`
			`from ._util import (`
			`Arg,`
			`Opt,`
			`import_code,`
			`init_cli,`
			`parse_config_overrides,`
			`setup_gpu,`
			`show_validation_error,`
			`)`
Update vocab init 2020-09-28 12:30:18 +03:00

Add init vectors 2020-09-29 11:58:50 +03:00			`@init_cli.command("vectors")`
			`def init_vectors_cli(`
			`# fmt: off`
			`lang: str = Arg(..., help="The language of the nlp object to create"),`
			`vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),`
			`output_dir: Path = Arg(..., help="Pipeline output directory"),`
			`prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),`
			`truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),`
Add support for floret vectors (#8909) * Add support for fasttext-bloom hash-only vectors Overview: * Extend `Vectors` to have two modes: `default` and `ngram` * `default` is the default mode and equivalent to the current `Vectors` * `ngram` supports the hash-only ngram tables from `fasttext-bloom` * Extend `spacy.StaticVectors.v2` to handle both modes with no changes for `default` vectors * Extend `spacy init vectors` to support ngram tables The `ngram` mode only supports vector tables produced by this fork of fastText, which adds an option to represent all vectors using only the ngram buckets table and which uses the exact same ngram generation algorithm and hash function (`MurmurHash3_x64_128`). `fasttext-bloom` produces an additional `.hashvec` table, which can be loaded by `spacy init vectors --fasttext-bloom-vectors`. https://github.com/adrianeboyd/fastText/tree/feature/bloom Implementation details: * `Vectors` now includes the `StringStore` as `Vectors.strings` so that the API can stay consistent for both `default` (which can look up from `str` or `int`) and `ngram` (which requires `str` to calculate the ngrams). * In ngram mode `Vectors` uses a default `Vectors` object as a cache since the ngram vectors lookups are relatively expensive. * The default cache size is the same size as the provided ngram vector table. * Once the cache is full, no more entries are added. The user is responsible for managing the cache in cases where the initial documents are not representative of the texts. * The cache can be resized by setting `Vectors.ngram_cache_size` or cleared with `vectors._ngram_cache.clear()`. * The API ends up a bit split between methods for `default` and for `ngram`, so functions that only make sense for `default` or `ngram` include warnings with custom messages suggesting alternatives where possible. * `Vocab.vectors` becomes a property so that the string stores can be synced when assigning vectors to a vocab. 
* `Vectors` serializes its own config settings as `vectors.cfg`. * The `Vectors` serialization methods have added support for `exclude` so that the `Vocab` can exclude the `Vectors` strings while serializing. Removed: * The `minn` and `maxn` options and related code from `Vocab.get_vector`, which does not work in a meaningful way for default vector tables. * The unused `GlobalRegistry` in `Vectors`. * Refactor to use reduce_mean Refactor to use reduce_mean and remove the ngram vectors cache. * Rename to floret * Rename to floret in error messages * Use --vectors-mode in CLI, vector init * Fix vectors mode in init * Remove unused var * Minor API and docstrings adjustments * Rename `--vectors-mode` to `--mode` in `init vectors` CLI * Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support both modes. * Minor updates to Vectors docstrings. * Update API docs for Vectors and init vectors CLI * Update types for StaticVectors 2021-10-27 15:08:31 +03:00			`mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),`
Add init vectors 2020-09-29 11:58:50 +03:00			`name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),`
Fix logging 2020-09-29 17:08:39 +03:00			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 17:44:17 +03:00			`jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),`
Add init vectors 2020-09-29 11:58:50 +03:00			`# fmt: on`
			`):`
Fix logging 2020-09-29 17:08:39 +03:00			`"""Convert word vectors for use with spaCy. Will export an nlp object that`
Update docs [ci skip] 2020-10-01 13:15:53 +03:00			`you can use in the [initialize] block of your config to initialize`
Fix logging 2020-09-29 17:08:39 +03:00			`a model with vectors.`
			`"""`
Tidy up and adjust logging [ci skip] 2020-09-30 02:22:08 +03:00			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Add init vectors 2020-09-29 11:58:50 +03:00			`msg.info(f"Creating blank nlp object for language '{lang}'")`
			`nlp = util.get_lang_class(lang)()`
Restore the 'jsonl' arg for init vectors The lexemes.jsonl file is still used in our English vectors, and it may be required by users as well. I think it's worth supporting the option. 2020-09-29 22:33:55 +03:00			`if jsonl_loc is not None:`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 17:44:17 +03:00			`update_lexemes(nlp, jsonl_loc)`
Add support for floret vectors (#8909) * Add support for fasttext-bloom hash-only vectors Overview: * Extend `Vectors` to have two modes: `default` and `ngram` * `default` is the default mode and equivalent to the current `Vectors` * `ngram` supports the hash-only ngram tables from `fasttext-bloom` * Extend `spacy.StaticVectors.v2` to handle both modes with no changes for `default` vectors * Extend `spacy init vectors` to support ngram tables The `ngram` mode only supports vector tables produced by this fork of fastText, which adds an option to represent all vectors using only the ngram buckets table and which uses the exact same ngram generation algorithm and hash function (`MurmurHash3_x64_128`). `fasttext-bloom` produces an additional `.hashvec` table, which can be loaded by `spacy init vectors --fasttext-bloom-vectors`. https://github.com/adrianeboyd/fastText/tree/feature/bloom Implementation details: * `Vectors` now includes the `StringStore` as `Vectors.strings` so that the API can stay consistent for both `default` (which can look up from `str` or `int`) and `ngram` (which requires `str` to calculate the ngrams). * In ngram mode `Vectors` uses a default `Vectors` object as a cache since the ngram vectors lookups are relatively expensive. * The default cache size is the same size as the provided ngram vector table. * Once the cache is full, no more entries are added. The user is responsible for managing the cache in cases where the initial documents are not representative of the texts. * The cache can be resized by setting `Vectors.ngram_cache_size` or cleared with `vectors._ngram_cache.clear()`. * The API ends up a bit split between methods for `default` and for `ngram`, so functions that only make sense for `default` or `ngram` include warnings with custom messages suggesting alternatives where possible. * `Vocab.vectors` becomes a property so that the string stores can be synced when assigning vectors to a vocab. 
* `Vectors` serializes its own config settings as `vectors.cfg`. * The `Vectors` serialization methods have added support for `exclude` so that the `Vocab` can exclude the `Vectors` strings while serializing. Removed: * The `minn` and `maxn` options and related code from `Vocab.get_vector`, which does not work in a meaningful way for default vector tables. * The unused `GlobalRegistry` in `Vectors`. * Refactor to use reduce_mean Refactor to use reduce_mean and remove the ngram vectors cache. * Rename to floret * Rename to floret in error messages * Use --vectors-mode in CLI, vector init * Fix vectors mode in init * Remove unused var * Minor API and docstrings adjustments * Rename `--vectors-mode` to `--mode` in `init vectors` CLI * Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support both modes. * Minor updates to Vectors docstrings. * Update API docs for Vectors and init vectors CLI * Update types for StaticVectors 2021-10-27 15:08:31 +03:00			`convert_vectors(`
			`nlp,`
			`vectors_loc,`
			`truncate=truncate,`
			`prune=prune,`
			`name=name,`
			`mode=mode,`
			`)`
Fix logging 2020-09-29 17:08:39 +03:00			`msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")`
Add init vectors 2020-09-29 11:58:50 +03:00			`nlp.to_disk(output_dir)`
			`msg.good(`
			`"Saved nlp object with vectors to output directory. You can now use the "`
Fix success message [ci skip] 2020-10-23 17:11:54 +03:00			`"path to it in your config as the 'vectors' setting in [initialize].",`
Resolve dir for better output [ci skip] 2020-09-29 23:01:04 +03:00			`output_dir.resolve(),`
Add init vectors 2020-09-29 11:58:50 +03:00			`)`


Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 17:44:17 +03:00			`def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:`
			`# Mostly used for backwards-compatibility and may be removed in the future`
			`lex_attrs = srsly.read_jsonl(jsonl_loc)`
			`for attrs in lex_attrs:`
			`if "settings" in attrs:`
			`continue`
			`lexeme = nlp.vocab[attrs["orth"]]`
			`lexeme.set_attrs(**attrs)`


Add init_pipeline file 2020-09-28 10:47:34 +03:00			`@init_cli.command(`
Don't support init path for now 2020-09-28 13:46:28 +03:00			`"nlp",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`hidden=True,`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`)`
			`def init_pipeline_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
Update argument handling and documentation 2020-12-08 12:41:18 +03:00			`config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`output_path: Path = Arg(..., help="Output directory for the prepared data"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
Refactor CLI 2020-09-28 16:09:59 +03:00			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`# fmt: on`
			`):`
Tidy up and adjust logging [ci skip] 2020-09-30 02:22:08 +03:00			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`setup_gpu(use_gpu)`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`with show_validation_error(config_path):`
Fix commands 2020-09-28 11:53:17 +03:00			`config = util.load_config(config_path, overrides=overrides)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`with show_validation_error(hint_fill=False):`
Tidy up and adjust logging [ci skip] 2020-09-30 02:22:08 +03:00			`nlp = init_nlp(config, use_gpu=use_gpu)`
Add init_pipeline file 2020-09-28 10:47:34 +03:00			`nlp.to_disk(output_path)`
Fix commands 2020-09-28 11:53:17 +03:00			`msg.good(f"Saved initialized pipeline to {output_path}")`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00

			`@init_cli.command(`
			`"labels",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`)`
			`def init_labels_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
Update argument handling and documentation 2020-12-08 12:41:18 +03:00			`config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`output_path: Path = Arg(..., help="Output directory for the labels"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
			`# fmt: on`
			`):`
Update docs [ci skip] 2020-10-01 18:38:17 +03:00			`"""Generate JSON files for the labels in the data. This helps speed up the`
Tidy up and adjust logging [ci skip] 2020-09-30 02:22:08 +03:00			`training process, since spaCy won't have to preprocess the data to`
			`extract the labels."""`
			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`if not output_path.exists():`
TextCat updates and fixes (#6263) * small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos 2020-10-18 15:50:41 +03:00			`output_path.mkdir(parents=True)`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
			`setup_gpu(use_gpu)`
			`with show_validation_error(config_path):`
			`config = util.load_config(config_path, overrides=overrides)`
			`with show_validation_error(hint_fill=False):`
			`nlp = init_nlp(config, use_gpu=use_gpu)`
fix spancat initialize with labels (#8620) 2021-07-06 20:08:25 +03:00			`_init_labels(nlp, output_path)`


			`def _init_labels(nlp, output_path):`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`for name, component in nlp.pipeline:`
			`if getattr(component, "label_data", None) is not None:`
Tidy up and adjust logging [ci skip] 2020-09-30 02:22:08 +03:00			`output_file = output_path / f"{name}.json"`
			`srsly.write_json(output_file, component.label_data)`
WIP: Various small training changes (#6818) * Allow output_path to be None during training * Fix cat scoring (?) * Improve error message for weighted None score * Improve messages So we can call this in other places etc. * FIx output path check * Use latest wasabi * Revert "Improve error message for weighted None score" This reverts commit 70599267635e2cfcc6c8922e3e4fb20dc978beb6. * Exclude None scores from final score by default It's otherwise very difficult to keep track of the score weights if we modify a config programmatically, source components etc. * Update warnings and use logger.warning 2021-01-26 06:51:52 +03:00			`msg.good(f"Saving label data for component '{name}' to {output_file}")`
Move init labels to init pipeline module 2020-09-29 19:09:33 +03:00			`else:`
WIP: Various small training changes (#6818) * Allow output_path to be None during training * Fix cat scoring (?) * Improve error message for weighted None score * Improve messages So we can call this in other places etc. * FIx output path check * Use latest wasabi * Revert "Improve error message for weighted None score" This reverts commit 70599267635e2cfcc6c8922e3e4fb20dc978beb6. * Exclude None scores from final score by default It's otherwise very difficult to keep track of the score weights if we modify a config programmatically, source components etc. * Update warnings and use logger.warning 2021-01-26 06:51:52 +03:00			`msg.info(f"No label data found for component '{name}'")`