Update vocab docs and document Vocab.prune_vectors

ines 2017-10-30 19:35:41 +01:00
parent 12343e23fd
commit ec657c1ddc
2 changed files with 58 additions and 5 deletions


@@ -252,7 +252,7 @@ cdef class Vocab:
"""Reduce the current vector table to `nr_row` unique entries. Words
mapped to the discarded vectors will be remapped to the closest vector
among those remaining.
For example, suppose the original table had vectors for the words:
['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
two rows, we would discard the vectors for 'feline' and 'reclined'.
@@ -263,6 +263,15 @@ cdef class Vocab:
The similarities are judged by cosine. The original vectors may
be large, so the cosines are calculated in minibatches, to reduce
memory usage.
nr_row (int): The number of rows to keep in the vector table.
batch_size (int): The number of vectors per batch when calculating the
similarities. Larger batch sizes might be faster, while temporarily
requiring more memory.
RETURNS (dict): A dictionary mapping each removed word to a
`(string, score)` tuple, where `string` is the entry the removed
word was remapped to, and `score` is the similarity score between
the two words.
""" """
xp = get_array_module(self.vectors.data) xp = get_array_module(self.vectors.data)
# Work in batches, to avoid memory problems. # Work in batches, to avoid memory problems.
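The comment above marks where the batched cosine computation happens: similarities between the discarded vectors and the kept vectors are computed one chunk at a time, so the full similarity matrix never has to be held in memory. Below is a minimal NumPy sketch of that idea, not spaCy's actual implementation; the names `keep`, `discard` and `remap_discarded` are illustrative only.

    import numpy as np

    def remap_discarded(keep, discard, batch_size=1024):
        """For every discarded vector, find the nearest kept vector by cosine
        similarity, processing the discarded vectors in batches."""
        # Normalise once so a plain dot product equals the cosine similarity.
        keep_norm = keep / (np.linalg.norm(keep, axis=1, keepdims=True) + 1e-8)
        best_rows = np.zeros(discard.shape[0], dtype="int64")
        best_scores = np.zeros(discard.shape[0], dtype="float64")
        for start in range(0, discard.shape[0], batch_size):
            batch = discard[start:start + batch_size]
            batch_norm = batch / (np.linalg.norm(batch, axis=1, keepdims=True) + 1e-8)
            sims = batch_norm @ keep_norm.T  # shape: (len(batch), len(keep))
            best_rows[start:start + batch_size] = sims.argmax(axis=1)
            best_scores[start:start + batch_size] = sims.max(axis=1)
        return best_rows, best_scores

    # Example with random data: 20 discarded vectors remapped onto 100 kept rows.
    rows, scores = remap_discarded(np.random.rand(100, 64), np.random.rand(20, 64))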
@@ -285,6 +294,7 @@ cdef class Vocab:
self.vectors.add(lex.orth, row=lex.rank)
# Make copy, to encourage the original table to be garbage collected.
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
# TODO: return new mapping
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. Words can be looked


@@ -162,7 +162,7 @@ p
+cell int
+cell The integer ID by which the flag value can be checked.
+h(2, "add_flag") Vocab.clear_vectors
+h(2, "clear_vectors") Vocab.clear_vectors
+tag method
+tag-new(2)
@@ -181,7 +181,50 @@ p
| Number of dimensions of the new vectors. If #[code None], size
| is not changed.
+h(2, "add_flag") Vocab.get_vector
+h(2, "prune_vectors") Vocab.prune_vectors
+tag method
+tag-new(2)
p
| Reduce the current vector table to #[code nr_row] unique entries. Words
| mapped to the discarded vectors will be remapped to the closest vector
| among those remaining. For example, suppose the original table had
| vectors for the words:
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
| vector table to two rows, we would discard the vectors for "feline"
| and "reclined". These words would then be remapped to the closest
| remaining vector so "feline" would have the same vector as "cat",
| and "reclined" would have the same vector as "sat". The similarities are
| judged by cosine. The original vectors may be large, so the cosines are
| calculated in minibatches, to reduce memory usage.
+aside-code("Example").
nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) <= 10000
+table(["Name", "Type", "Description"])
+row
+cell #[code nr_row]
+cell int
+cell The number of rows to keep in the vector table.
+row
+cell #[code batch_size]
+cell int
+cell
| The number of vectors per batch when calculating the similarities.
| Larger batch sizes might be faster, while temporarily requiring
| more memory.
+row("foot")
+cell returns
+cell dict
+cell
| A dictionary mapping each removed word to a
| #[code (string, score)] tuple, where #[code string] is the entry
| the removed word was remapped to, and #[code score] is the
| similarity score between the two words.
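For orientation, a hypothetical usage sketch of the return value documented in this table. The model name and the loop are illustrative only, and the `# TODO: return new mapping` left in the vocab source above suggests the mapping may not actually be returned yet at this commit.

    import spacy

    nlp = spacy.load("en_core_web_md")  # assumes a model that ships word vectors
    remap = nlp.vocab.prune_vectors(10000)
    assert len(nlp.vocab.vectors) <= 10000
    # Each removed word maps to a (kept_entry, score) tuple, as described above.
    for removed, (kept, score) in list(remap.items())[:5]:
        print(removed, "->", kept, round(score, 2))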
+h(2, "get_vector") Vocab.get_vector
+tag method
+tag-new(2)
@@ -206,7 +249,7 @@ p
| A word vector. Size and shape are determined by the
| #[code Vocab.vectors] instance.
+h(2, "add_flag") Vocab.set_vector
+h(2, "set_vector") Vocab.set_vector
+tag method
+tag-new(2)
@@ -228,7 +271,7 @@ p
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector to set.
+h(2, "add_flag") Vocab.has_vector
+h(2, "has_vector") Vocab.has_vector
+tag method
+tag-new(2)