From 0d8f4a534b8966083c7e6e938f5b5cfec39ada65 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 00:56:54 +0100 Subject: [PATCH] Update Vectors API docs --- website/api/vectors.jade | 243 +++++++++++++++++++++++++++------------ 1 file changed, 172 insertions(+), 71 deletions(-) diff --git a/website/api/vectors.jade b/website/api/vectors.jade index 692bd1ca8..9685188c5 100644 --- a/website/api/vectors.jade +++ b/website/api/vectors.jade @@ -5,46 +5,47 @@ include ../_includes/_mixins p | Vectors data is kept in the #[code Vectors.data] attribute, which should | be an instance of #[code numpy.ndarray] (for CPU vectors) or - | #[code cupy.ndarray] (for GPU vectors). + | #[code cupy.ndarray] (for GPU vectors). Multiple keys can be mapped to + | the same vector, and not all of the rows in the table need to be + | assigned – so #[code vectors.n_keys] may be greater or smaller than + | #[code vectors.shape[0]]. +h(2, "init") Vectors.__init__ +tag method p - | Create a new vector store. To keep the vector table empty, pass - | #[code width=0]. You can also create the vector table and add - | vectors one by one, or set the vector values directly on initialisation. + | Create a new vector store. You can set the vector values and keys + | directly on initialisation, or supply a #[code shape] keyword argument + | to create an empty table you can add vectors to later. +aside-code("Example"). 
from spacy.vectors import Vectors - from spacy.strings import StringStore - empty_vectors = Vectors(StringStore()) + empty_vectors = Vectors(shape=(10000, 300)) - vectors = Vectors([u'cat'], width=300) - vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) - - vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors(StringStore(), data=vector_table) + data = numpy.zeros((3, 300), dtype='f') + keys = [u'cat', u'dog', u'rat'] + vectors = Vectors(data=data, keys=keys) +table(["Name", "Type", "Description"]) - +row - +cell #[code strings] - +cell #[code StringStore] or list - +cell - | List of strings, or a #[+api("stringstore") #[code StringStore]] - | that maps strings to hash values, and vice versa. - - +row - +cell #[code width] - +cell int - +cell Number of dimensions. - +row +cell #[code data] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] +cell The vector data. + +row + +cell #[code keys] + +cell iterable + +cell A sequence of keys aligned with the data. + + +row + +cell #[code shape] + +cell tuple + +cell + | Size of the table as #[code (n_entries, n_columns)], the number + | of entries and number of columns. Not required if you're + | initialising the object with #[code data] and #[code keys]. + +row("foot") +cell returns +cell #[code Vectors] @@ -54,97 +55,92 @@ p +tag method p - | Get a vector by key. If key is a string, it is hashed to an integer ID - | using the #[code Vectors.strings] table. If the integer key is not found - | in the table, a #[code KeyError] is raised. + | Get a vector by key. If the key is not found in the table, a + | #[code KeyError] is raised. +aside-code("Example"). 
- vectors = Vectors(StringStore(), 300) - vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) - cat_vector = vectors[u'cat'] + cat_id = nlp.vocab.strings[u'cat'] + cat_vector = nlp.vocab.vectors[cat_id] + assert numpy.array_equal(cat_vector, nlp.vocab[u'cat'].vector) +table(["Name", "Type", "Description"]) +row +cell #[code key] - +cell unicode / int + +cell int +cell The key to get the vector for. +row +cell returns - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] +cell The vector for the key. +h(2, "setitem") Vectors.__setitem__ +tag method p - | Set a vector for the given key. If key is a string, it is hashed to an - | integer ID using the #[code Vectors.strings] table. + | Set a vector for the given key. +aside-code("Example"). - vectors = Vectors(StringStore(), 300) - vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) + cat_id = nlp.vocab.strings[u'cat'] + vector = numpy.random.uniform(-1, 1, (300,)) + nlp.vocab.vectors[cat_id] = vector +table(["Name", "Type", "Description"]) +row +cell #[code key] - +cell unicode / int + +cell int +cell The key to set the vector for. +row +cell #[code vector] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] +cell The vector to set. +h(2, "iter") Vectors.__iter__ +tag method -p Yield vectors from the table. +p Iterate over the keys in the table. +aside-code("Example"). - vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors(StringStore(), vector_table) - for vector in vectors: - print(vector) + for key in nlp.vocab.vectors: + print(key, nlp.vocab.strings[key]) +table(["Name", "Type", "Description"]) +row("foot") +cell yields - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] - +cell A vector from the table. + +cell int + +cell A key in the table. +h(2, "len") Vectors.__len__ +tag method -p Return the number of vectors that have been assigned. 
+p Return the number of vectors in the table. +aside-code("Example"). - vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors(StringStore(), vector_table) + vectors = Vectors(shape=(3, 300)) assert len(vectors) == 3 +table(["Name", "Type", "Description"]) +row("foot") +cell returns +cell int - +cell The number of vectors in the data. + +cell The number of vectors in the table. +h(2, "contains") Vectors.__contains__ +tag method p - | Check whether a key has a vector entry in the table. If key is a string, - | it is hashed to an integer ID using the #[code Vectors.strings] table. + | Check whether a key has been mapped to a vector entry in the table. +aside-code("Example"). - vectors = Vectors(StringStore(), 300) - vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) - assert u'cat' in vectors + cat_id = nlp.vocab.strings[u'cat'] + nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) + assert cat_id in nlp.vocab.vectors +table(["Name", "Type", "Description"]) +row +cell #[code key] - +cell unicode / int + +cell int +cell The key to check. +row("foot") @@ -156,13 +152,20 @@ p +tag method p - | Add a key to the table, optionally setting a vector value as well. If - | key is a string, it is hashed to an integer ID using the - | #[code Vectors.strings] table. + | Add a key to the table, optionally setting a vector value as well. Keys + | can be mapped to an existing vector by setting #[code row], or a new + | vector can be added. When adding unicode keys, keep in mind that the + | #[code Vectors] class itself has no + | #[+api("stringstore") #[code StringStore]], so you have to store the + | hash-to-string mapping separately. If you need to manage the strings, + | you should use the #[code Vectors] via the + | #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. +aside-code("Example"). 
- vectors = Vectors(StringStore(), 300) - vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + vector = numpy.random.uniform(-1, 1, (300,)) + cat_id = nlp.vocab.strings[u'cat'] + nlp.vocab.vectors.add(cat_id, vector=vector) + nlp.vocab.vectors.add(u'dog', row=0) +table(["Name", "Type", "Description"]) +row @@ -172,25 +175,66 @@ p +row +cell #[code vector] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] - +cell An optional vector to add. + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] + +cell An optional vector to add for the key. + + +row + +cell #[code row] + +cell int + +cell An optional row number of a vector to map the key to. + + +row("foot") + +cell returns + +cell int + +cell The row the vector was added to. + ++h(2, "keys") Vectors.keys + +tag method + +p A sequence of the keys in the table. + ++aside-code("Example"). + for key in nlp.vocab.vectors.keys(): + print(key, nlp.vocab.strings[key]) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell iterable + +cell The keys. + ++h(2, "values") Vectors.values + +tag method + +p + | Iterate over vectors that have been assigned to at least one key. Note + | that some vectors may be unassigned, so the number of vectors returned + | may be less than the length of the vectors table. + ++aside-code("Example"). + for vector in nlp.vocab.vectors.values(): + print(vector) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] + +cell A vector in the table. +h(2, "items") Vectors.items +tag method -p Iterate over #[code (string key, vector)] pairs, in order. +p Iterate over #[code (key, vector)] pairs, in order. +aside-code("Example"). 
- vectors = Vectors(StringStore(), 300) - vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) - for key, vector in vectors.items(): - print(key, vector) + for key, vector in nlp.vocab.vectors.items(): + print(key, nlp.vocab.strings[key], vector) +table(["Name", "Type", "Description"]) +row("foot") +cell yields +cell tuple - +cell #[code (string key, vector)] pairs, in order. + +cell #[code (key, vector)] pairs, in order. +h(2, "shape") Vectors.shape +tag property @@ -200,7 +244,7 @@ p | dimensions in the vector table. +aside-code("Example"). - vectors = Vectors(StringStore(), 300) + vectors = Vectors(shape=(1, 300)) vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) rows, dims = vectors.shape assert rows == 1 @@ -212,6 +256,59 @@ p +cell tuple +cell A #[code (rows, dims)] pair. ++h(2, "size") Vectors.size + +tag property + +p The vector size, i.e. #[code rows * dims]. + ++aside-code("Example"). + vectors = Vectors(shape=(500, 300)) + assert vectors.size == 150000 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The vector size. + ++h(2, "is_full") Vectors.is_full + +tag property + +p + | Whether the vectors table is full and has no slots available for new + | keys. If a table is full, it can be resized using + | #[+api("vectors#resize") #[code Vectors.resize]]. + ++aside-code("Example"). + vectors = Vectors(shape=(1, 300)) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + assert vectors.is_full + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell bool + +cell Whether the vectors table is full. + ++h(2, "n_keys") Vectors.n_keys + +tag property + +p + | Get the number of keys in the table. Note that this is the number of + | #[em all] keys, not just unique vectors. If several keys + | are mapped to the same vector, they will be counted individually. + ++aside-code("Example"). 
+ vectors = Vectors(shape=(10, 300)) + assert len(vectors) == 10 + assert vectors.n_keys == 0 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of all keys in the table. + +h(2, "from_glove") Vectors.from_glove +tag method @@ -223,6 +320,10 @@ p | float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double) | vectors, etc. By default GloVe outputs 64-bit vectors. ++aside-code("Example"). + vectors = Vectors() + vectors.from_glove('/path/to/glove_vectors') + +table(["Name", "Type", "Description"]) +row +cell #[code path] @@ -323,7 +424,7 @@ p Load state from a binary string. +table(["Name", "Type", "Description"]) +row +cell #[code data] - +cell #[code numpy.ndarray] / #[code cupy.ndarray] + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] +cell | Stored vectors data. #[code numpy] is used for CPU vectors, | #[code cupy] for GPU vectors. @@ -337,7 +438,7 @@ p Load state from a binary string. +row +cell #[code keys] - +cell #[code numpy.ndarray] + +cell #[code.u-break ndarray[ndim=1, dtype='float32']] +cell | Array keeping the keys in order, such that | #[code keys[vectors.key2row[key]] == key]