Merge branch 'develop' of https://github.com/explosion/spaCy into develop

ines 2017-10-30 18:39:51 +01:00
commit 803e41bc66
2 changed files with 45 additions and 3 deletions

View File

@@ -32,6 +32,7 @@ numpy.random.seed(0)
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     vectors=("Model to load vectors from", "option", "v"),
+    vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
@@ -40,9 +41,9 @@ numpy.random.seed(0)
     meta_path=("Optional path to meta.json. All relevant properties will be "
                "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
-          no_entities=False, gold_preproc=False, version="0.0.0",
-          meta_path=None):
+          use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
+          no_parser=False, no_entities=False, gold_preproc=False,
+          version="0.0.0", meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -94,6 +95,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        if vectors_limit is not None:
+            nlp.vocab.prune_vectors(vectors_limit)
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
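The new vectors_limit option only has an effect together with the vectors option (-v): the vectors model is loaded into the vocab first, then pruned to the requested number of rows before the pipeline components are created. A minimal sketch of the equivalent API calls (the vectors package name and the row count are placeholder assumptions, not taken from this diff):

    import spacy
    from spacy import util

    nlp = spacy.blank('en')
    # What the -v/--vectors option does: copy vectors from an installed model
    # into the vocab. 'en_vectors_web_lg' is just an assumed example name.
    util.load_model('en_vectors_web_lg', vocab=nlp.vocab)
    # What the new vectors_limit option does: keep only the first 20000 rows
    # and remap every other word to its nearest remaining vector.
    nlp.vocab.prune_vectors(20000)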

View File

@@ -5,6 +5,7 @@ import numpy
 import dill
 from collections import OrderedDict
+from thinc.neural.util import get_array_module
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
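For reference, the newly imported helper is what lets the method below run unchanged on CPU or GPU: thinc's get_array_module returns numpy for a numpy array and cupy for a cupy array, so all the xp.* calls dispatch to whichever backend holds the vectors. A tiny CPU-only sketch (the GPU case additionally requires cupy to be installed):

    import numpy
    from thinc.neural.util import get_array_module

    arr = numpy.zeros((2, 3), dtype='f')
    xp = get_array_module(arr)     # numpy here; cupy for GPU-resident arrays
    assert xp is numpy
    print(xp.linalg.norm(arr, axis=1))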
@@ -247,6 +248,44 @@ cdef class Vocab:
         width = self.vectors.data.shape[1]
         self.vectors = Vectors(self.strings, width=width)
 
+    def prune_vectors(self, nr_row, batch_size=1024):
+        """Reduce the current vector table to `nr_row` unique entries. Words
+        mapped to the discarded vectors will be remapped to the closest vector
+        among those remaining.
+
+        For example, suppose the original table had vectors for the words:
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
+        two rows, we would discard the vectors for 'feline' and 'reclined'.
+        These words would then be remapped to the closest remaining vector
+        -- so "feline" would have the same vector as "cat", and "reclined"
+        would have the same vector as "sat".
+
+        The similarities are judged by cosine. The original vectors may
+        be large, so the cosines are calculated in minibatches, to reduce
+        memory usage.
+        """
+        xp = get_array_module(self.vectors.data)
+        # Work in batches, to avoid memory problems.
+        keep = self.vectors.data[:nr_row]
+        toss = self.vectors.data[nr_row:]
+        # Normalize the rows, so cosine similarity is just the dot product.
+        # Note we can't modify the ones we're keeping in-place...
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True) + 1e-8)
+        keep = xp.ascontiguousarray(keep.T)
+        neighbours = xp.zeros((toss.shape[0],), dtype='i')
+        for i in range(0, toss.shape[0], batch_size):
+            batch = toss[i : i + batch_size]
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True) + 1e-8
+            neighbours[i : i + batch_size] = xp.dot(batch, keep).argmax(axis=1)
+        for lex in self:
+            # If we're losing the vector for this word, map it to the nearest
+            # vector we're keeping.
+            if lex.rank >= nr_row:
+                lex.rank = neighbours[lex.rank - nr_row]
+                self.vectors.add(lex.orth, row=lex.rank)
+        # Make a copy, to encourage the original table to be garbage collected.
+        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
         up by string or int ID. If no vectors data is loaded, ValueError is
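Direct use of the new method is a one-liner once a vectors model is loaded. A short usage sketch, assuming some installed vectors package (the package name and the target size of 20000 rows are assumptions for illustration, not part of the commit):

    import spacy

    nlp = spacy.load('en_vectors_web_lg')    # assumed vectors package name
    print(nlp.vocab.vectors.data.shape)      # (original_rows, width)
    nlp.vocab.prune_vectors(20000)
    print(nlp.vocab.vectors.data.shape)      # (20000, width)
    # Words whose rows were discarded still resolve to a vector: they are
    # remapped to their nearest remaining neighbour by cosine similarity.
    print(nlp.vocab['feline'].vector.shape)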