spaCy/website/api/vocab.jade

412 lines
11 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//- 💫 DOCS > API > VOCAB
include ../_includes/_mixins
p
| The #[code Vocab] object provides a lookup table that allows you to
| access #[+api("lexeme") #[code Lexeme]] objects, as well as the
| #[+api("stringstore") #[code StringStore]]. It also owns underlying
| C-data that is shared between #[code Doc] objects.
+h(2, "init") Vocab.__init__
+tag method
p Create the vocabulary.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab(strings=[u'hello', u'world'])
+table(["Name", "Type", "Description"])
+row
+cell #[code lex_attr_getters]
+cell dict
+cell
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
+cell
| A dictionary mapping fine-grained tags to coarse-grained
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code lemmatizer]
+cell object
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code strings]
+cell #[code StringStore] or list
+cell
| A #[+api("stringstore") #[code StringStore]] that maps
| strings to hash values, and vice versa, or a list of strings.
+row("foot")
+cell returns
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "len") Vocab.__len__
+tag method
p Get the current number of lexemes in the vocabulary.
+aside-code("Example").
doc = nlp(u'This is a sentence.')
assert len(nlp.vocab) > 0
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of lexems in the vocabulary.
+h(2, "getitem") Vocab.__getitem__
+tag method
p
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
| unseen unicode string is given, a new lexeme is created and stored.
+aside-code("Example").
apple = nlp.vocab.strings['apple']
assert nlp.vocab[apple] == nlp.vocab[u'apple']
+table(["Name", "Type", "Description"])
+row
+cell #[code id_or_string]
+cell int / unicode
+cell The hash value of a word, or its unicode string.
+row("foot")
+cell returns
+cell #[code Lexeme]
+cell The lexeme indicated by the given ID.
+h(2, "iter") Vocab.__iter__
+tag method
p Iterate over the lexemes in the vocabulary.
+aside-code("Example").
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Lexeme]
+cell An entry in the vocabulary.
+h(2, "contains") Vocab.__contains__
+tag method
p
| Check whether the string has an entry in the vocabulary. To get the ID
| for a given string, you need to look it up in
| #[+api("vocab#attributes") #[code vocab.strings]].
+aside-code("Example").
apple = nlp.vocab.strings['apple']
oov = nlp.vocab.strings['dskfodkfos']
assert apple in nlp.vocab
assert oov not in nlp.vocab
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The ID string.
+row("foot")
+cell returns
+cell bool
+cell Whether the string has an entry in the vocabulary.
+h(2, "add_flag") Vocab.add_flag
+tag method
p
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
| function will be called over the words currently in the vocab, and then
| applied to new words as they occur. You'll then be able to access the flag
| value on each token, using #[code token.check_flag(flag_id)].
+aside-code("Example").
def is_my_product(text):
products = [u'spaCy', u'Thinc', u'displaCy']
return text in products
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp(u'I like spaCy')
assert doc[2].check_flag(MY_PRODUCT) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_getter]
+cell dict
+cell A function #[code f(unicode) -> bool], to get the flag value.
+row
+cell #[code flag_id]
+cell int
+cell
| An integer between 1 and 63 (inclusive), specifying the bit at
| which the flag will be stored. If #[code -1], the lowest
| available bit will be chosen.
+row("foot")
+cell returns
+cell int
+cell The integer ID by which the flag value can be checked.
+h(2, "reset_vectors") Vocab.reset_vectors
+tag method
+tag-new(2)
p
| Drop the current vector table. Because all vectors must be the same
| width, you have to call this to change the size of the vectors. Only
| one of the #[code width] and #[code shape] keyword arguments can be
| specified.
+aside-code("Example").
nlp.vocab.reset_vectors(width=300)
+table(["Name", "Type", "Description"])
+row
+cell #[code width]
+cell int
+cell The new width (keyword argument only).
+row
+cell #[code shape]
+cell int
+cell The new shape (keyword argument only).
+h(2, "prune_vectors") Vocab.prune_vectors
+tag method
+tag-new(2)
p
| Reduce the current vector table to #[code nr_row] unique entries. Words
| mapped to the discarded vectors will be remapped to the closest vector
| among those remaining. For example, suppose the original table had
| vectors for the words:
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
| vector table to, two rows, we would discard the vectors for "feline"
| and "reclined". These words would then be remapped to the closest
| remaining vector so "feline" would have the same vector as "cat",
| and "reclined" would have the same vector as "sat". The similarities are
| judged by cosine. The original vectors may be large, so the cosines are
| calculated in minibatches, to reduce memory usage.
+aside-code("Example").
nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) <= 1000
+table(["Name", "Type", "Description"])
+row
+cell #[code nr_row]
+cell int
+cell The number of rows to keep in the vector table.
+row
+cell #[code batch_size]
+cell int
+cell
| Batch of vectors for calculating the similarities. Larger batch
| sizes might be faster, while temporarily requiring more memory.
+row("foot")
+cell returns
+cell dict
+cell
| A dictionary keyed by removed words mapped to
| #[code (string, score)] tuples, where #[code string] is the entry
| the removed word was mapped to, and #[code score] the similarity
| score between the two words.
+h(2, "get_vector") Vocab.get_vector
+tag method
+tag-new(2)
p
| Retrieve a vector for a word in the vocabulary. Words can be looked up
| by string or hash value. If no vectors data is loaded, a
| #[code ValueError] is raised.
+aside-code("Example").
nlp.vocab.get_vector(u'apple')
+table(["Name", "Type", "Description"])
+row
+cell #[code orth]
+cell int / unicode
+cell The hash value of a word, or its unicode string.
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell
| A word vector. Size and shape are determined by the
| #[code Vocab.vectors] instance.
+h(2, "set_vector") Vocab.set_vector
+tag method
+tag-new(2)
p
| Set a vector for a word in the vocabulary. Words can be referenced by
| by string or hash value.
+aside-code("Example").
nlp.vocab.set_vector(u'apple', array([...]))
+table(["Name", "Type", "Description"])
+row
+cell #[code orth]
+cell int / unicode
+cell The hash value of a word, or its unicode string.
+row
+cell #[code vector]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector to set.
+h(2, "has_vector") Vocab.has_vector
+tag method
+tag-new(2)
p
| Check whether a word has a vector. Returns #[code False] if no vectors
| are loaded. Words can be looked up by string or hash value.
+aside-code("Example").
if nlp.vocab.has_vector(u'apple'):
vector = nlp.vocab.get_vector(u'apple')
+table(["Name", "Type", "Description"])
+row
+cell #[code orth]
+cell int / unicode
+cell The hash value of a word, or its unicode string.
+row("foot")
+cell returns
+cell bool
+cell Whether the word has a vector.
+h(2, "to_disk") Vocab.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
+aside-code("Example").
nlp.vocab.to_disk('/path/to/vocab')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Vocab.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab().from_disk('/path/to/vocab')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row("foot")
+cell returns
+cell #[code Vocab]
+cell The modified #[code Vocab] object.
+h(2, "to_bytes") Vocab.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vocab_bytes = nlp.vocab.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+row("foot")
+cell returns
+cell bytes
+cell The serialized form of the #[code Vocab] object.
+h(2, "from_bytes") Vocab.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
fron spacy.vocab import Vocab
vocab_bytes = nlp.vocab.to_bytes()
vocab = Vocab()
vocab.from_bytes(vocab_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+row("foot")
+cell returns
+cell #[code Vocab]
+cell The #[code Vocab] object.
+h(2, "attributes") Attributes
+aside-code("Example").
apple_id = nlp.vocab.strings['apple']
assert type(apple_id) == int
PERSON = nlp.vocab.strings['PERSON']
assert type(PERSON) == int
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.
+row
+cell #[code vectors]
+tag-new(2)
+cell #[code Vectors]
+cell A table associating word IDs to word vectors.
+row
+cell #[code vectors_length]
+cell int
+cell Number of dimensions for each word vector.