@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    # fmt: on
):
    """Convert word vectors into a blank nlp object and save it to disk.

    Creates a blank nlp object for the given language, loads the vectors
    from the vectors file into its vocab and writes the result to the output
    directory. The output path can then be used as the 'vectors' setting in
    the [initialize.vocab] block of a config.
    """
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    # silent=False so progress and results are printed when run from the CLI.
    convert_vectors(
        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
    )
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
        output_dir,
    )
def convert_vectors(
    nlp: Language,
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
    silent: bool = True,
) -> None:
    """Load word vectors from a file into the vocab of an nlp object.

    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Location of the vectors. A path ending in
        ".npz" is loaded with numpy.load; any other path is parsed with
        read_vectors. If falsy, no vectors data is read.
    truncate (int): Passed to read_vectors as the row limit (values below 1
        mean no limit). Not used for ".npz" input.
    prune (int): If >= 1, nlp.vocab.prune_vectors is called with this value.
    name (Optional[str]): Name for the vectors. If None, a name is built from
        the "lang" and "name" entries of nlp.meta.
    silent (bool): Suppress printed messages.
    """
    msg = Printer(no_print=silent)
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        # NOTE(review): the handle opened here is never closed explicitly, and
        # numpy.load presumably reads .npz archives lazily - confirm before
        # wrapping this in a `with` block.
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            with msg.loading(f"Reading vectors from {vectors_loc}"):
                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
            msg.good(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    # Bare lookup kept for its side effect; presumably this
                    # creates the vocab entry for the word - TODO confirm.
                    nlp.vocab[word]
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        # TODO: Is this correct? Does this matter?
        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1:
        nlp.vocab.prune_vectors(prune)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")


def read_vectors(vectors_loc: Path, truncate_vectors: int):
    """Read a text vectors file into a float array and a list of keys.

    vectors_loc (Path): Location of the vectors file (see open_file for the
        supported containers).
    truncate_vectors (int): If >= 1, allocate and read at most this many rows.
    RETURNS (tuple): The vectors array and the list of words, in file order.
    RAISES (ValueError): If a row does not have as many values as the width
        given by the shape line.
    """
    handle = open_file(vectors_loc)
    try:
        lines = ensure_shape(handle)
        shape = tuple(int(size) for size in next(lines).split())
        if truncate_vectors >= 1:
            shape = (truncate_vectors, shape[1])
        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []
        for i, line in enumerate(tqdm.tqdm(lines)):
            line = line.rstrip()
            # Split from the right so a word containing spaces stays intact.
            pieces = line.rsplit(" ", vectors_data.shape[1])
            word = pieces.pop(0)
            if len(pieces) != vectors_data.shape[1]:
                raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
            vectors_data[i] = numpy.asarray(pieces, dtype="f")
            vectors_keys.append(word)
            if i == truncate_vectors - 1:
                break
    finally:
        # Release the underlying file even on early exit (truncation, errors).
        close = getattr(handle, "close", None)
        if close is not None:
            close()
    return vectors_data, vectors_keys


def _decoded_lines(stream, *owners):
    """Yield utf8-decoded lines from a binary stream, then close the stream
    and any owning archive objects."""
    try:
        for raw_line in stream:
            yield raw_line.decode("utf8")
    finally:
        stream.close()
        for owner in owners:
            owner.close()


def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files.

    loc (Union[str, Path]): Location of the file to open.
    RETURNS (IO): An iterator over text lines. For tar and zip archives the
        lines come from the first (regular) member of the archive.
    RAISES (ValueError): If a tar archive contains no regular file.
    """
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        # A TarFile object is not an iterator of lines, so hand back the
        # decoded content of the first regular member instead.
        archive = tarfile.open(str(loc), "r:gz")
        member = next((info for info in archive if info.isfile()), None)
        if member is None:
            archive.close()
            raise ValueError(f"No regular file found in tar archive: {loc}")
        return _decoded_lines(archive.extractfile(member), archive)
    elif loc.parts[-1].endswith("gz"):
        return _decoded_lines(gzip.open(str(loc), "r"))
    elif loc.parts[-1].endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return _decoded_lines(file_, zip_file)
    else:
        return loc.open("r", encoding="utf8")
def ensure_shape(lines):
    """Ensure that the first line of the data is the vectors shape.
    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.

    lines (Iterable[str]): The lines of the vectors data.
    YIELDS (str): A shape line, followed by the data lines. If the input
        already starts with a line of integers, it is passed through
        unchanged. Empty input yields the single shape line "0 0".
    """
    # Accept any iterable, not only iterators and file objects.
    lines = iter(lines)
    first_line = next(lines, None)
    if first_line is None:
        # A bare next() on empty input would raise StopIteration inside this
        # generator, which surfaces as a RuntimeError. Report an empty table.
        yield "0 0"
        return
    try:
        shape = tuple(int(size) for size in first_line.split())
    except ValueError:
        shape = None
    if shape is not None:
        # All good, give the data
        yield first_line
        yield from lines
    else:
        # Figure out the shape, make it the first value, and then give the
        # rest of the data. The first token of a row is the word itself.
        width = len(first_line.split()) - 1
        captured = [first_line, *lines]
        yield f"{len(captured)} {width}"
        yield from captured