mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	* Add support for fasttext-bloom hash-only vectors
Overview:
* Extend `Vectors` to have two modes: `default` and `ngram`
  * `default` is the default mode and equivalent to the current
    `Vectors`
  * `ngram` supports the hash-only ngram tables from `fasttext-bloom`
* Extend `spacy.StaticVectors.v2` to handle both modes with no changes
  for `default` vectors
* Extend `spacy init vectors` to support ngram tables
The `ngram` mode **only** supports vector tables produced by this
fork of fastText, which adds an option to represent all vectors using
only the ngram buckets table and which uses the exact same ngram
generation algorithm and hash function (`MurmurHash3_x64_128`).
`fasttext-bloom` produces an additional `.hashvec` table, which can be
loaded by `spacy init vectors --fasttext-bloom-vectors`.
https://github.com/adrianeboyd/fastText/tree/feature/bloom
Implementation details:
* `Vectors` now includes the `StringStore` as `Vectors.strings` so that
  the API can stay consistent for both `default` (which can look up from
  `str` or `int`) and `ngram` (which requires `str` to calculate the
  ngrams).
* In ngram mode `Vectors` uses a default `Vectors` object as a cache
  since ngram vector lookups are relatively expensive.
  * The default cache size is the same size as the provided ngram vector
    table.
  * Once the cache is full, no more entries are added. The user is
    responsible for managing the cache in cases where the initial
    documents are not representative of the texts.
  * The cache can be resized by setting `Vectors.ngram_cache_size` or
    cleared with `vectors._ngram_cache.clear()`.
* The API ends up a bit split between methods for `default` and for
  `ngram`, so functions that only make sense for `default` or `ngram`
  include warnings with custom messages suggesting alternatives where
  possible.
* `Vocab.vectors` becomes a property so that the string stores can be
  synced when assigning vectors to a vocab.
* `Vectors` serializes its own config settings as `vectors.cfg`.
* The `Vectors` serialization methods have added support for `exclude`
  so that the `Vocab` can exclude the `Vectors` strings while serializing.
Removed:
* The `minn` and `maxn` options and related code from
  `Vocab.get_vector`, which do not work in a meaningful way for default
  vector tables.
* The unused `GlobalRegistry` in `Vectors`.
* Refactor to use reduce_mean
Refactor to use reduce_mean and remove the ngram vectors cache.
* Rename to floret
* Rename to floret in error messages
* Use --vectors-mode in CLI, vector init
* Fix vectors mode in init
* Remove unused var
* Minor API and docstrings adjustments
* Rename `--vectors-mode` to `--mode` in `init vectors` CLI
* Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support
  both modes.
* Minor updates to Vectors docstrings.
* Update API docs for Vectors and init vectors CLI
* Update types for StaticVectors
		
	
			
		
			
				
	
	
		
			130 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			130 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Optional
 | |
| import logging
 | |
| from pathlib import Path
 | |
| from wasabi import msg
 | |
| import typer
 | |
| import srsly
 | |
| 
 | |
| from .. import util
 | |
| from ..training.initialize import init_nlp, convert_vectors
 | |
| from ..language import Language
 | |
| from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 | |
| from ._util import import_code, setup_gpu
 | |
| 
 | |
| 
 | |
@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    # --verbose switches the shared logger from INFO to DEBUG.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)

    msg.info(f"Creating blank nlp object for language '{lang}'")
    lang_cls = util.get_lang_class(lang)
    nlp = lang_cls()

    # Lexeme attributes from the (hidden) JSONL option are applied before the
    # vectors are converted.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)

    # All conversion settings are forwarded unchanged from the CLI options.
    vector_options = dict(truncate=truncate, prune=prune, name=name, mode=mode)
    convert_vectors(nlp, vectors_loc, **vector_options)
    n_vectors = len(nlp.vocab.vectors)
    msg.good(f"Successfully converted {n_vectors} vectors")

    nlp.to_disk(output_dir)
    resolved_dir = output_dir.resolve()
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        resolved_dir,
    )
 | |
| 
 | |
| 
 | |
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Apply lexeme attributes read from a JSONL file to the vocab of `nlp`.

    Each record is looked up in `nlp.vocab` by its "orth" value and all of
    the record's fields are passed to the lexeme's `set_attrs`. Records that
    contain a "settings" key are skipped.

    Mostly used for backwards-compatibility and may be removed in the future.

    nlp (Language): The nlp object whose vocab is updated in place.
    jsonl_loc (Path): Location of the JSONL-formatted attributes file.
    """
    for record in srsly.read_jsonl(jsonl_loc):
        if "settings" not in record:
            nlp.vocab[record["orth"]].set_attrs(**record)
 | |
| 
 | |
| 
 | |
@init_cli.command(
    "nlp",
    context_settings=dict(allow_extra_args=True, ignore_unknown_options=True),
    hidden=True,
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    # Loads the config (extra CLI arguments become config overrides),
    # initializes the pipeline from it and saves the result to output_path.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)

    # The order of these three calls is kept exactly as before: overrides are
    # parsed first, then user code is imported, then the GPU is set up.
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)

    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
    with show_validation_error(hint_fill=False):
        pipeline = init_nlp(cfg, use_gpu=use_gpu)

    pipeline.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")
 | |
| 
 | |
| 
 | |
@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Create the output directory in a single call instead of checking
    # `exists()` first: this avoids a race between the check and the creation,
    # and raises FileExistsError right away if the path is an existing
    # non-directory, rather than failing only after the pipeline has been
    # initialized.
    output_path.mkdir(parents=True, exist_ok=True)
    # Extra CLI arguments are interpreted as config overrides.
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    # One JSON file per component that provides label data.
    _init_labels(nlp, output_path)
 | |
| 
 | |
| 
 | |
def _init_labels(nlp, output_path):
    """Write the label data of each pipeline component to a JSON file.

    For every (name, component) pair in `nlp.pipeline`, a component whose
    `label_data` attribute is present and not None gets its label data written
    to `<output_path>/<name>.json`. Components without label data are only
    reported.

    nlp: Object exposing the pipeline as an iterable of (name, component).
    output_path: Directory path the JSON files are written to.
    """
    for pipe_name, pipe in nlp.pipeline:
        # A missing attribute and an explicit None are treated the same way.
        if getattr(pipe, "label_data", None) is None:
            msg.info(f"No label data found for component '{pipe_name}'")
            continue
        output_file = output_path / f"{pipe_name}.json"
        srsly.write_json(output_file, pipe.label_data)
        msg.good(f"Saving label data for component '{pipe_name}' to {output_file}")
 |