From e026b29ea92c22de3ff11a56d6648ff404138c80 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 30 Oct 2017 17:59:43 +0100
Subject: [PATCH 1/2] Add prune_vectors method to Vocab

---
 spacy/vocab.pyx | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 160f0d5bd..ff6c5b844 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -5,6 +5,7 @@
 import numpy
 import dill
 from collections import OrderedDict
+from thinc.neural.util import get_array_module
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
@@ -247,6 +248,44 @@ cdef class Vocab:
             width = self.vectors.data.shape[1]
         self.vectors = Vectors(self.strings, width=width)
 
+    def prune_vectors(self, nr_row, batch_size=1024):
+        """Reduce the current vector table to `nr_row` unique entries. Words
+        mapped to the discarded vectors will be remapped to the closest vector
+        among those remaining.
+
+        For example, suppose the original table had vectors for the words
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
+        two rows, we would discard the vectors for 'feline' and 'reclined'.
+        These words would then be remapped to the closest remaining vector:
+        'feline' would get the same vector as 'cat', and 'reclined' the same
+        vector as 'sat'.
+
+        The similarities are judged by cosine. The original vectors may
+        be large, so the cosines are calculated in minibatches, to reduce
+        memory usage.
+        """
+        xp = get_array_module(self.vectors.data)
+        # Work in batches, to avoid memory problems.
+        keep = self.vectors.data[:nr_row]
+        toss = self.vectors.data[nr_row:]
+        # Normalize each row, so cosine similarity is just the dot product.
+        # Note we can't modify the ones we're keeping in-place...
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True) + 1e-8)
+        keep = xp.ascontiguousarray(keep.T)
+        neighbours = xp.zeros((toss.shape[0],), dtype='i')
+        for i in range(0, toss.shape[0], batch_size):
+            batch = toss[i : i+batch_size]
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True) + 1e-8
+            neighbours[i : i+batch_size] = xp.dot(batch, keep).argmax(axis=1)
+        for lex in self:
+            # If we're losing the vector for this word, map it to the nearest
+            # vector we're keeping.
+            if lex.rank >= nr_row:
+                lex.rank = neighbours[lex.rank - nr_row]
+                self.vectors.add(lex.orth, row=lex.rank)
+        # Make a copy, to encourage the original table to be garbage collected.
+        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
         up by string or int ID. If no vectors data is loaded, ValueError is
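To illustrate how the new method is intended to be used, here is a minimal
usage sketch (not part of the patch). It assumes a pipeline with a pretrained
vector table is installed; the model name and row counts below are
placeholders, not values taken from the patch.

    import spacy

    # Assumes a model with pretrained vectors is installed, e.g.:
    #   python -m spacy download en_core_web_md
    nlp = spacy.load('en_core_web_md')
    print(nlp.vocab.vectors.data.shape)    # full table, e.g. (20000, 300)

    # Keep only the first 10000 rows. Words whose vectors are discarded are
    # remapped to the nearest remaining row by cosine similarity.
    nlp.vocab.prune_vectors(10000)
    print(nlp.vocab.vectors.data.shape)    # (10000, 300)

    # A pruned word still returns a vector: it now shares a row with the
    # closest surviving entry (e.g. 'feline' may map to the row for 'cat').
    print(nlp.vocab['feline'].vector[:5])

Note that the method keeps the first nr_row rows of the existing table, so it
implicitly assumes the rows are ordered by importance (e.g. frequency); only
rows beyond nr_row are remapped.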
From e98451b5f7dff6a4e91e9a47cd37cdadf24b4e47 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 30 Oct 2017 18:00:10 +0100
Subject: [PATCH 2/2] Add vectors_limit argument to spacy.cli.train

---
 spacy/cli/train.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index fb96e6c05..34117db22 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,6 +32,7 @@ numpy.random.seed(0)
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     vectors=("Model to load vectors from", "option", "v"),
+    vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
@@ -40,9 +41,9 @@ numpy.random.seed(0)
     meta_path=("Optional path to meta.json. All relevant properties will be "
                "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
-          no_entities=False, gold_preproc=False, version="0.0.0",
-          meta_path=None):
+          use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
+          no_parser=False, no_entities=False, gold_preproc=False,
+          version="0.0.0", meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -94,6 +95,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+    if vectors_limit is not None:
+        nlp.vocab.prune_vectors(vectors_limit)
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
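With both patches applied, a vectors model can be loaded and truncated in one
step at training time. A hypothetical invocation is shown below; the vectors
package name and paths are placeholders, and the exact long-flag spelling
depends on how plac renders the vectors_limit parameter on the command line.

    python -m spacy train en /output train.json dev.json \
        --vectors en_vectors_web_lg --vectors-limit 20000

Internally this is equivalent to calling util.load_model(vectors,
vocab=nlp.vocab) followed by nlp.vocab.prune_vectors(vectors_limit), as in the
last hunk above.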