From 4925ad760a87d84b7cc4bb2fb48b45845a2e0c30 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 10:58:50 +0200
Subject: [PATCH] Add init vectors

---
 spacy/cli/init_pipeline.py   |  26 +++++++-
 spacy/training/initialize.py | 117 +++++++++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index a92705cb0..0e9de0eb4 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -5,11 +5,35 @@ from wasabi import msg
 import typer
 
 from .. import util
-from ..training.initialize import init_nlp
+from ..training.initialize import init_nlp, convert_vectors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
 
 
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    # fmt: on
+):
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    nlp = util.get_lang_class(lang)()
+    convert_vectors(
+        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
+    )
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir,
+    )
+
+
 @init_cli.command(
     "nlp",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 23debfb28..9a47a7f69 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,13 +1,19 @@
-from typing import Union, Dict, Optional, Any, List
+from typing import Union, Dict, Optional, Any, List, IO
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
 from wasabi import Printer
 import srsly
+import numpy
+import tarfile
+import gzip
+import zipfile
+import tqdm
 
 from .loop import create_before_to_disk_callback
 from ..language import Language
 from ..lookups import Lookups
+from ..vectors import Vectors
 from ..errors import Errors
 from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
@@ -49,8 +55,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good(f"Initialized pipeline components")
+        nlp.initialize(
+            lambda: train_corpus(nlp), sgd=optimizer, settings=I["components"]
+        )
+        msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -103,7 +111,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -202,3 +210,104 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
         for name, cfg in config.get("components", {}).items()
         if "factory" not in cfg and "source" in cfg
     ]
+
+
+def convert_vectors(
+    nlp: Language,
+    vectors_loc: Optional[Path],
+    *,
+    truncate: int,
+    prune: int,
+    name: Optional[str] = None,
+    silent: bool = True,
+) -> None:
+    msg = Printer(no_print=silent)
+    vectors_loc = ensure_path(vectors_loc)
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
+        for lex in nlp.vocab:
+            if lex.rank and lex.rank != OOV_RANK:
+                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
+    else:
+        if vectors_loc:
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            msg.good(f"Loaded vectors from {vectors_loc}")
+        else:
+            vectors_data, vector_keys = (None, None)
+        if vector_keys is not None:
+            for word in vector_keys:
+                if word not in nlp.vocab:
+                    nlp.vocab[word]
+        if vectors_data is not None:
+            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
+    if name is None:
+        # TODO: Is this correct? Does this matter?
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
+    else:
+        nlp.vocab.vectors.name = name
+    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
+    if prune >= 1:
+        nlp.vocab.prune_vectors(prune)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+
+
+def read_vectors(vectors_loc: Path, truncate_vectors: int):
+    f = open_file(vectors_loc)
+    f = ensure_shape(f)
+    shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
+    vectors_keys = []
+    for i, line in enumerate(tqdm.tqdm(f)):
+        line = line.rstrip()
+        pieces = line.rsplit(" ", vectors_data.shape[1])
+        word = pieces.pop(0)
+        if len(pieces) != vectors_data.shape[1]:
+            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
+        vectors_data[i] = numpy.asarray(pieces, dtype="f")
+        vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
+    return vectors_data, vectors_keys
+
+
+def open_file(loc: Union[str, Path]) -> IO:
+    """Handle .gz, .tar.gz or unzipped files"""
+    loc = ensure_path(loc)
+    if tarfile.is_tarfile(str(loc)):
+        return tarfile.open(str(loc), "r:gz")
+    elif loc.parts[-1].endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    elif loc.parts[-1].endswith("zip"):
+        zip_file = zipfile.ZipFile(str(loc))
+        names = zip_file.namelist()
+        file_ = zip_file.open(names[0])
+        return (line.decode("utf8") for line in file_)
+    else:
+        return loc.open("r", encoding="utf8")
+
+
+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured