mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* remove migration support form * initial test commit * add fixture * add combo test * pull out parameter example data * fix formatting on examples * remove unused import * remove unncessary fmt:off instructions * only set logger level if verbose flag is explicitly set --------- Co-authored-by: svlandeg <svlandeg@github.com>
		
			
				
	
	
		
			143 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			143 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import logging
 | |
| from pathlib import Path
 | |
| from typing import Optional
 | |
| 
 | |
| import srsly
 | |
| import typer
 | |
| from wasabi import msg
 | |
| 
 | |
| from .. import util
 | |
| from ..language import Language
 | |
| from ..training.initialize import convert_vectors, init_nlp
 | |
| from ._util import (
 | |
|     Arg,
 | |
|     Opt,
 | |
|     import_code,
 | |
|     init_cli,
 | |
|     parse_config_overrides,
 | |
|     setup_gpu,
 | |
|     show_validation_error,
 | |
| )
 | |
| 
 | |
| 
 | |
| @init_cli.command("vectors")
 | |
| def init_vectors_cli(
 | |
|     # fmt: off
 | |
|     lang: str = Arg(..., help="The language of the nlp object to create"),
 | |
|     vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
 | |
|     output_dir: Path = Arg(..., help="Pipeline output directory"),
 | |
|     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
 | |
|     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
 | |
|     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
 | |
|     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
 | |
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | |
|     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
 | |
|     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
 | |
|     # fmt: on
 | |
| ):
 | |
|     """Convert word vectors for use with spaCy. Will export an nlp object that
 | |
|     you can use in the [initialize] block of your config to initialize
 | |
|     a model with vectors.
 | |
|     """
 | |
|     if verbose:
 | |
|         util.logger.setLevel(logging.DEBUG)
 | |
|     msg.info(f"Creating blank nlp object for language '{lang}'")
 | |
|     nlp = util.get_lang_class(lang)()
 | |
|     if jsonl_loc is not None:
 | |
|         update_lexemes(nlp, jsonl_loc)
 | |
|     convert_vectors(
 | |
|         nlp,
 | |
|         vectors_loc,
 | |
|         truncate=truncate,
 | |
|         prune=prune,
 | |
|         name=name,
 | |
|         mode=mode,
 | |
|         attr=attr,
 | |
|     )
 | |
|     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 | |
|     nlp.to_disk(output_dir)
 | |
|     msg.good(
 | |
|         "Saved nlp object with vectors to output directory. You can now use the "
 | |
|         "path to it in your config as the 'vectors' setting in [initialize].",
 | |
|         output_dir.resolve(),
 | |
|     )
 | |
| 
 | |
| 
 | |
| def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
 | |
|     # Mostly used for backwards-compatibility and may be removed in the future
 | |
|     lex_attrs = srsly.read_jsonl(jsonl_loc)
 | |
|     for attrs in lex_attrs:
 | |
|         if "settings" in attrs:
 | |
|             continue
 | |
|         lexeme = nlp.vocab[attrs["orth"]]
 | |
|         lexeme.set_attrs(**attrs)
 | |
| 
 | |
| 
 | |
| @init_cli.command(
 | |
|     "nlp",
 | |
|     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 | |
|     hidden=True,
 | |
| )
 | |
| def init_pipeline_cli(
 | |
|     # fmt: off
 | |
|     ctx: typer.Context,  # This is only used to read additional arguments
 | |
|     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
 | |
|     output_path: Path = Arg(..., help="Output directory for the prepared data"),
 | |
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | |
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | |
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
 | |
|     # fmt: on
 | |
| ):
 | |
|     if verbose:
 | |
|         util.logger.setLevel(logging.DEBUG)
 | |
|     overrides = parse_config_overrides(ctx.args)
 | |
|     import_code(code_path)
 | |
|     setup_gpu(use_gpu)
 | |
|     with show_validation_error(config_path):
 | |
|         config = util.load_config(config_path, overrides=overrides)
 | |
|     with show_validation_error(hint_fill=False):
 | |
|         nlp = init_nlp(config, use_gpu=use_gpu)
 | |
|     nlp.to_disk(output_path)
 | |
|     msg.good(f"Saved initialized pipeline to {output_path}")
 | |
| 
 | |
| 
 | |
| @init_cli.command(
 | |
|     "labels",
 | |
|     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 | |
| )
 | |
| def init_labels_cli(
 | |
|     # fmt: off
 | |
|     ctx: typer.Context,  # This is only used to read additional arguments
 | |
|     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
 | |
|     output_path: Path = Arg(..., help="Output directory for the labels"),
 | |
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | |
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | |
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
 | |
|     # fmt: on
 | |
| ):
 | |
|     """Generate JSON files for the labels in the data. This helps speed up the
 | |
|     training process, since spaCy won't have to preprocess the data to
 | |
|     extract the labels."""
 | |
|     if verbose:
 | |
|         util.logger.setLevel(logging.DEBUG)
 | |
|     if not output_path.exists():
 | |
|         output_path.mkdir(parents=True)
 | |
|     overrides = parse_config_overrides(ctx.args)
 | |
|     import_code(code_path)
 | |
|     setup_gpu(use_gpu)
 | |
|     with show_validation_error(config_path):
 | |
|         config = util.load_config(config_path, overrides=overrides)
 | |
|     with show_validation_error(hint_fill=False):
 | |
|         nlp = init_nlp(config, use_gpu=use_gpu)
 | |
|     _init_labels(nlp, output_path)
 | |
| 
 | |
| 
 | |
| def _init_labels(nlp, output_path):
 | |
|     for name, component in nlp.pipeline:
 | |
|         if getattr(component, "label_data", None) is not None:
 | |
|             output_file = output_path / f"{name}.json"
 | |
|             srsly.write_json(output_file, component.label_data)
 | |
|             msg.good(f"Saving label data for component '{name}' to {output_file}")
 | |
|         else:
 | |
|             msg.info(f"No label data found for component '{name}'")
 |