mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Update vocab docs and document Vocab.prune_vectors
This commit is contained in:
parent
12343e23fd
commit
ec657c1ddc
|
@ -263,6 +263,15 @@ cdef class Vocab:
|
||||||
The similarities are judged by cosine. The original vectors may
|
The similarities are judged by cosine. The original vectors may
|
||||||
be large, so the cosines are calculated in minibatches, to reduce
|
be large, so the cosines are calculated in minibatches, to reduce
|
||||||
memory usage.
|
memory usage.
|
||||||
|
|
||||||
|
nr_row (int): The number of rows to keep in the vector table.
|
||||||
|
batch_size (int): Batch of vectors for calculating the similarities.
|
||||||
|
Larger batch sizes might be faster, while temporarily requiring
|
||||||
|
more memory.
|
||||||
|
RETURNS (dict): A dictionary keyed by removed words mapped to
|
||||||
|
`(string, score)` tuples, where `string` is the entry the removed
|
||||||
|
word was mapped to, and `score` the similarity score between the
|
||||||
|
two words.
|
||||||
"""
|
"""
|
||||||
xp = get_array_module(self.vectors.data)
|
xp = get_array_module(self.vectors.data)
|
||||||
# Work in batches, to avoid memory problems.
|
# Work in batches, to avoid memory problems.
|
||||||
|
@ -285,6 +294,7 @@ cdef class Vocab:
|
||||||
self.vectors.add(lex.orth, row=lex.rank)
|
self.vectors.add(lex.orth, row=lex.rank)
|
||||||
# Make copy, to encourage the original table to be garbage collected.
|
# Make copy, to encourage the original table to be garbage collected.
|
||||||
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
|
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
|
||||||
|
# TODO: return new mapping
|
||||||
|
|
||||||
def get_vector(self, orth):
|
def get_vector(self, orth):
|
||||||
"""Retrieve a vector for a word in the vocabulary. Words can be looked
|
"""Retrieve a vector for a word in the vocabulary. Words can be looked
|
||||||
|
|
|
@ -162,7 +162,7 @@ p
|
||||||
+cell int
|
+cell int
|
||||||
+cell The integer ID by which the flag value can be checked.
|
+cell The integer ID by which the flag value can be checked.
|
||||||
|
|
||||||
+h(2, "add_flag") Vocab.clear_vectors
|
+h(2, "clear_vectors") Vocab.clear_vectors
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -181,7 +181,50 @@ p
|
||||||
| Number of dimensions of the new vectors. If #[code None], size
|
| Number of dimensions of the new vectors. If #[code None], size
|
||||||
| is not changed.
|
| is not changed.
|
||||||
|
|
||||||
+h(2, "add_flag") Vocab.get_vector
|
+h(2, "prune_vectors") Vocab.prune_vectors
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Reduce the current vector table to #[code nr_row] unique entries. Words
|
||||||
|
| mapped to the discarded vectors will be remapped to the closest vector
|
||||||
|
| among those remaining. For example, suppose the original table had
|
||||||
|
| vectors for the words:
|
||||||
|
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
|
||||||
|
| vector table to two rows, we would discard the vectors for "feline"
|
||||||
|
| and "reclined". These words would then be remapped to the closest
|
||||||
|
| remaining vector – so "feline" would have the same vector as "cat",
|
||||||
|
| and "reclined" would have the same vector as "sat". The similarities are
|
||||||
|
| judged by cosine. The original vectors may be large, so the cosines are
|
||||||
|
| calculated in minibatches, to reduce memory usage.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp.vocab.prune_vectors(10000)
|
||||||
|
assert len(nlp.vocab.vectors) <= 10000
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code nr_row]
|
||||||
|
+cell int
|
||||||
|
+cell The number of rows to keep in the vector table.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code batch_size]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| Batch of vectors for calculating the similarities. Larger batch
|
||||||
|
| sizes might be faster, while temporarily requiring more memory.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell dict
|
||||||
|
+cell
|
||||||
|
| A dictionary keyed by removed words mapped to
|
||||||
|
| #[code (string, score)] tuples, where #[code string] is the entry
|
||||||
|
| the removed word was mapped to, and #[code score] the similarity
|
||||||
|
| score between the two words.
|
||||||
|
|
||||||
|
+h(2, "get_vector") Vocab.get_vector
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -206,7 +249,7 @@ p
|
||||||
| A word vector. Size and shape are determined by the
|
| A word vector. Size and shape are determined by the
|
||||||
| #[code Vocab.vectors] instance.
|
| #[code Vocab.vectors] instance.
|
||||||
|
|
||||||
+h(2, "add_flag") Vocab.set_vector
|
+h(2, "set_vector") Vocab.set_vector
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -228,7 +271,7 @@ p
|
||||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell The vector to set.
|
+cell The vector to set.
|
||||||
|
|
||||||
+h(2, "add_flag") Vocab.has_vector
|
+h(2, "has_vector") Vocab.has_vector
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user