mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
153 lines
5.6 KiB
Python
153 lines
5.6 KiB
Python
from typing import Optional, Literal, List
|
|
import logging
|
|
from pathlib import Path
|
|
from wasabi import msg
|
|
import srsly
|
|
from radicli import Arg, ExistingPath, ExistingFilePathOrDash, ExistingFilePath
|
|
|
|
from .. import util
|
|
from ..training.initialize import init_nlp, convert_vectors
|
|
from ..language import Language
|
|
from ._util import cli, parse_config_overrides, show_validation_error
|
|
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
|
|
|
|
|
|
@cli.subcommand(
    "init",
    "vectors",
    # fmt: off
    lang=Arg(help="The language of the nlp object to create"),
    vectors_loc=Arg(help="Vectors file in Word2Vec format"),
    output_dir=Arg(help="Pipeline output directory"),
    prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"),
    truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode=Arg("--mode", "-m", help="Vectors mode: default or floret"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
    # fmt: on
)
def init_vectors_cli(
    lang: str,
    vectors_loc: ExistingPath,
    output_dir: Path,
    prune: int = -1,
    truncate: int = 0,
    mode: Literal["default", "floret"] = "default",
    verbose: bool = False,
    jsonl_loc: Optional[Path] = None,
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    # The docstring above is left untouched on purpose: it may be surfaced
    # to users as the command's help text.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)

    # Throw error for renamed language codes in v4
    _handle_renamed_language_codes(lang)

    msg.info(f"Creating blank nlp object for language '{lang}'")
    lang_cls = util.get_lang_class(lang)
    nlp = lang_cls()

    # Lexeme attributes are optional and only applied when a file was given.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)

    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, mode=mode)
    n_vectors = len(nlp.vocab.vectors)
    msg.good(f"Successfully converted {n_vectors} vectors")

    nlp.to_disk(output_dir)
    saved_title = (
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize]."
    )
    # The resolved output path is passed as the second positional argument.
    msg.good(saved_title, output_dir.resolve())
|
|
|
|
|
|
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Apply lexeme attributes from a JSONL file to the vocab of `nlp`.

    Each record read from `jsonl_loc` is expected to carry an "orth" key,
    which is used to look up the lexeme in `nlp.vocab`; the whole record is
    then passed to the lexeme's `set_attrs` as keyword arguments. Records
    that contain a "settings" key are skipped.
    """
    # Mostly used for backwards-compatibility and may be removed in the future
    vocab = nlp.vocab
    records = (
        record for record in srsly.read_jsonl(jsonl_loc) if "settings" not in record
    )
    for record in records:
        vocab[record["orth"]].set_attrs(**record)
|
|
|
|
|
|
@cli.subcommand_with_extra(
    "init",
    "nlp",
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the prepared data"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_pipeline_cli(
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Initialize a pipeline."""
    # NOTE(review): `_extra` has a shared mutable default; this function only
    # forwards it to parse_config_overrides -- confirm that helper is safe
    # to call with the shared list before changing the default.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)

    # Extra command-line arguments are interpreted as config overrides.
    cfg_overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)

    # Loading and initialization are wrapped separately so that each step
    # gets its own validation-error reporting.
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=cfg_overrides)
    with show_validation_error(hint_fill=False):
        pipeline = init_nlp(cfg, use_gpu=use_gpu)

    pipeline.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")
|
|
|
|
|
|
@cli.subcommand_with_extra(
    "init",
    "labels",
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the labels"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_labels_cli(
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Create the output directory in a single call instead of checking
    # `exists()` first: this avoids a check-then-create race and raises
    # FileExistsError right away if the path exists but is not a directory.
    output_path.mkdir(parents=True, exist_ok=True)
    # Extra command-line arguments are interpreted as config overrides.
    overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)
    # Loading and initialization are wrapped separately so that each step
    # gets its own validation-error reporting.
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    # Write one JSON file per pipeline component that exposes label data.
    _init_labels(nlp, output_path)
|
|
|
|
|
|
def _init_labels(nlp, output_path):
    """Write the label data of each pipeline component to a JSON file.

    Iterates over the (name, component) pairs in `nlp.pipeline`. A component
    whose `label_data` attribute is missing or None is only reported; for
    every other component the label data is written to
    `<output_path>/<name>.json`.
    """
    for name, component in nlp.pipeline:
        label_data = getattr(component, "label_data", None)
        if label_data is None:
            msg.info(f"No label data found for component '{name}'")
            continue
        output_file = output_path / f"{name}.json"
        srsly.write_json(output_file, label_data)
        msg.good(f"Saving label data for component '{name}' to {output_file}")
|