spaCy/website/api/vectors.jade

477 lines
13 KiB
Plaintext
Raw Normal View History

2017-10-03 15:27:22 +03:00
//- 💫 DOCS > API > VECTORS
include ../_includes/_mixins
p
| Vectors data is kept in the #[code Vectors.data] attribute, which should
| be an instance of #[code numpy.ndarray] (for CPU vectors) or
2017-11-01 02:56:54 +03:00
| #[code cupy.ndarray] (for GPU vectors). Multiple keys can be mapped to
| the same vector, and not all of the rows in the table need to be
| assigned, so #[code vectors.n_keys] may be greater or smaller than
| #[code vectors.shape[0]].
2017-10-03 15:27:22 +03:00
+h(2, "init") Vectors.__init__
+tag method
p
2017-11-01 02:56:54 +03:00
| Create a new vector store. You can set the vector values and keys
| directly on initialisation, or supply a #[code shape] keyword argument
| to create an empty table you can add vectors to later.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
from spacy.vectors import Vectors
2017-11-01 02:56:54 +03:00
empty_vectors = Vectors(shape=(10000, 300))
2017-10-03 15:27:22 +03:00
2017-11-01 02:56:54 +03:00
data = numpy.zeros((3, 300), dtype='f')
keys = [u'cat', u'dog', u'rat']
vectors = Vectors(data=data, keys=keys)
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
2017-11-01 02:56:54 +03:00
+cell #[code data]
+cell #[code.u-break ndarray[ndim=2, dtype='float32']]
+cell The vector data.
2017-10-03 15:27:22 +03:00
+row
2017-11-01 02:56:54 +03:00
+cell #[code keys]
+cell iterable
+cell A sequence of keys aligned with the data.
2017-10-03 15:27:22 +03:00
2017-10-27 20:45:19 +03:00
+row
2017-11-01 02:56:54 +03:00
+cell #[code shape]
+cell tuple
+cell
| Size of the table as #[code (n_entries, n_columns)], the number
| of entries and number of columns. Not required if you're
| initialising the object with #[code data] and #[code keys].
2017-10-27 20:45:19 +03:00
2017-10-03 15:27:22 +03:00
+row("foot")
+cell returns
+cell #[code Vectors]
+cell The newly created object.
+h(2, "getitem") Vectors.__getitem__
+tag method
p
2017-11-01 02:56:54 +03:00
| Get a vector by key. If the key is not found in the table, a
| #[code KeyError] is raised.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
cat_id = nlp.vocab.strings[u'cat']
cat_vector = nlp.vocab.vectors[cat_id]
assert (cat_vector == nlp.vocab[u'cat'].vector).all()
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
2017-11-01 02:56:54 +03:00
+cell int
2017-10-03 15:27:22 +03:00
+cell The key to get the vector for.
+row
+cell returns
2017-11-01 02:56:54 +03:00
+cell #[code.u-break ndarray[ndim=1, dtype='float32']]
2017-10-03 15:27:22 +03:00
+cell The vector for the key.
+h(2, "setitem") Vectors.__setitem__
+tag method
p
2017-11-01 02:56:54 +03:00
| Set a vector for the given key.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
cat_id = nlp.vocab.strings[u'cat']
vector = numpy.random.uniform(-1, 1, (300,))
nlp.vocab.vectors[cat_id] = vector
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
2017-11-01 02:56:54 +03:00
+cell int
2017-10-03 15:27:22 +03:00
+cell The key to set the vector for.
+row
+cell #[code vector]
2017-11-01 02:56:54 +03:00
+cell #[code.u-break ndarray[ndim=1, dtype='float32']]
2017-10-03 15:27:22 +03:00
+cell The vector to set.
+h(2, "iter") Vectors.__iter__
+tag method
2017-11-01 02:56:54 +03:00
p Iterate over the keys in the table.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
for key in nlp.vocab.vectors:
print(key, nlp.vocab.strings[key])
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
2017-11-01 02:56:54 +03:00
+cell int
+cell A key in the table.
2017-10-03 15:27:22 +03:00
+h(2, "len") Vectors.__len__
+tag method
2017-11-01 02:56:54 +03:00
p Return the number of vectors in the table.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
vectors = Vectors(shape=(3, 300))
2017-10-03 15:27:22 +03:00
assert len(vectors) == 3
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
2017-11-01 02:56:54 +03:00
+cell The number of vectors in the table.
2017-10-03 15:27:22 +03:00
+h(2, "contains") Vectors.__contains__
+tag method
p
2017-11-01 02:56:54 +03:00
| Check whether a key has been mapped to a vector entry in the table.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
cat_id = nlp.vocab.strings[u'cat']
nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,)))
assert cat_id in nlp.vocab.vectors
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
2017-11-01 02:56:54 +03:00
+cell int
2017-10-03 15:27:22 +03:00
+cell The key to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the key has a vector entry.
+h(2, "add") Vectors.add
+tag method
p
2017-11-01 02:56:54 +03:00
| Add a key to the table, optionally setting a vector value as well. Keys
| can be mapped to an existing vector by setting #[code row], or a new
| vector can be added. When adding unicode keys, keep in mind that the
| #[code Vectors] class itself has no
| #[+api("stringstore") #[code StringStore]], so you have to store the
| hash-to-string mapping separately. If you need to manage the strings,
| you should use the #[code Vectors] via the
| #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors].
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
vector = numpy.random.uniform(-1, 1, (300,))
cat_id = nlp.vocab.strings[u'cat']
nlp.vocab.vectors.add(cat_id, vector=vector)
nlp.vocab.vectors.add(u'dog', row=0)
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
+cell unicode / int
+cell The key to add.
+row
+cell #[code vector]
2017-11-01 02:56:54 +03:00
+cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+cell An optional vector to add for the key.
+row
+cell #[code row]
+cell int
+cell An optional row number of a vector to map the key to.
+row("foot")
+cell returns
+cell int
+cell The row the vector was added to.
+h(2, "resize") Vectors.resize
+tag method
p
| Resize the underlying vectors array. If #[code inplace=True], the memory
| is reallocated. This may cause other references to the data to become
| invalid, so only use #[code inplace=True] if you're sure that's what you
| want. If the number of vectors is reduced, keys mapped to rows that have
| been deleted are removed. These removed items are returned as a list of
| #[code (key, row)] tuples.
+aside-code("Example").
removed = nlp.vocab.vectors.resize((10000, 300))
+table(["Name", "Type", "Description"])
+row
+cell #[code shape]
+cell tuple
+cell
| A #[code (rows, dims)] tuple describing the number of rows and
| dimensions.
+row
+cell #[code inplace]
+cell bool
+cell Reallocate the memory.
+row("foot")
+cell returns
+cell list
+cell The removed items as a list of #[code (key, row)] tuples.
2017-11-01 02:56:54 +03:00
+h(2, "keys") Vectors.keys
+tag method
p A sequence of the keys in the table.
+aside-code("Example").
for key in nlp.vocab.vectors.keys():
print(key, nlp.vocab.strings[key])
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell iterable
+cell The keys.
+h(2, "values") Vectors.values
+tag method
p
| Iterate over vectors that have been assigned to at least one key. Note
| that some vectors may be unassigned, so the number of vectors returned
| may be less than the length of the vectors table.
+aside-code("Example").
for vector in nlp.vocab.vectors.values():
print(vector)
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+cell A vector in the table.
2017-10-03 15:27:22 +03:00
+h(2, "items") Vectors.items
+tag method
2017-11-01 02:56:54 +03:00
p Iterate over #[code (key, vector)] pairs, in order.
2017-10-03 15:27:22 +03:00
+aside-code("Example").
2017-11-01 02:56:54 +03:00
for key, vector in nlp.vocab.vectors.items():
print(key, nlp.vocab.strings[key], vector)
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell tuple
2017-11-01 02:56:54 +03:00
+cell #[code (key, vector)] pairs, in order.
2017-10-03 15:27:22 +03:00
+h(2, "shape") Vectors.shape
+tag property
p
| Get a #[code (rows, dims)] tuple of the number of rows and number of
| dimensions in the vector table.
+aside-code("Example").
2017-11-01 02:56:54 +03:00
vectors = Vectors(shape=(1, 300))
2017-10-03 15:27:22 +03:00
vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
rows, dims = vectors.shape
assert rows == 1
assert dims == 300
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell tuple
2017-10-27 20:45:19 +03:00
+cell A #[code (rows, dims)] pair.
2017-10-03 15:27:22 +03:00
2017-11-01 02:56:54 +03:00
+h(2, "size") Vectors.size
+tag property
p The vector size, i.e. #[code rows * dims].
+aside-code("Example").
vectors = Vectors(shape=(500, 300))
assert vectors.size == 150000
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The vector size.
+h(2, "is_full") Vectors.is_full
+tag property
p
| Whether the vectors table is full and has no slots available for new
| keys. If a table is full, it can be resized using
| #[+api("vectors#resize") #[code Vectors.resize]].
+aside-code("Example").
vectors = Vectors(shape=(1, 300))
vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
assert vectors.is_full
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell bool
+cell Whether the vectors table is full.
+h(2, "n_keys") Vectors.n_keys
+tag property
p
| Get the number of keys in the table. Note that this is the number of
| #[em all] keys, not just unique vectors. If several keys are mapped
| to the same vectors, they will be counted individually.
+aside-code("Example").
vectors = Vectors(shape=(10, 300))
assert len(vectors) == 10
assert vectors.n_keys == 0
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of all keys in the table.
2017-10-03 15:27:22 +03:00
+h(2, "from_glove") Vectors.from_glove
+tag method
p
| Load #[+a("https://nlp.stanford.edu/projects/glove/") GloVe] vectors from
| a directory. Assumes binary format, that the vocab is in a
| #[code vocab.txt], and that vectors are named
| #[code vectors.{size}.[fd].bin], e.g. #[code vectors.128.f.bin] for 128d
| float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double)
| vectors, etc. By default GloVe outputs 64-bit vectors.
2017-11-01 02:56:54 +03:00
+aside-code("Example").
vectors = Vectors()
vectors.from_glove('/path/to/glove_vectors')
2017-10-03 15:27:22 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode / #[code Path]
+cell The path to load the GloVe vectors from.
+h(2, "to_disk") Vectors.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
vectors.to_disk('/path/to/vectors')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
2017-10-27 20:45:19 +03:00
+cell unicode / #[code Path]
2017-10-03 15:27:22 +03:00
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
2017-10-27 20:45:19 +03:00
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
2017-10-03 15:27:22 +03:00
+h(2, "from_disk") Vectors.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
vectors = Vectors(StringStore())
vectors.from_disk('/path/to/vectors')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
2017-10-27 20:45:19 +03:00
+cell unicode / #[code Path]
2017-10-03 15:27:22 +03:00
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row("foot")
+cell returns
+cell #[code Vectors]
+cell The modified #[code Vectors] object.
+h(2, "to_bytes") Vectors.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vectors_bytes = vectors.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+row("foot")
+cell returns
+cell bytes
+cell The serialized form of the #[code Vectors] object.
+h(2, "from_bytes") Vectors.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.vectors import Vectors
vectors_bytes = vectors.to_bytes()
new_vectors = Vectors(StringStore())
new_vectors.from_bytes(vectors_bytes)
+table(["Name", "Type", "Description"])
+row
2017-10-27 20:45:19 +03:00
+cell #[code data]
2017-10-03 15:27:22 +03:00
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+row("foot")
+cell returns
+cell #[code Vectors]
+cell The #[code Vectors] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
2017-11-01 02:56:54 +03:00
+cell #[code.u-break ndarray[ndim=2, dtype='float32']]
2017-10-03 15:27:22 +03:00
+cell
| Stored vectors data. #[code numpy] is used for CPU vectors,
| #[code cupy] for GPU vectors.
+row
+cell #[code key2row]
+cell dict
+cell
| Dictionary mapping word hashes to rows in the
| #[code Vectors.data] table.
+row
+cell #[code keys]
2017-11-01 02:56:54 +03:00
+cell #[code.u-break ndarray[ndim=1, dtype='uint64']]
2017-10-03 15:27:22 +03:00
+cell
| Array keeping the keys in order, such that
| #[code keys[vectors.key2row[key]] == key]