//- 💫 DOCS > API > VOCAB

include ../_includes/_mixins

p
    | The #[code Vocab] object provides a lookup table that allows you to
    | access #[+api("lexeme") #[code Lexeme]] objects, as well as the
    | #[+api("stringstore") #[code StringStore]]. It also owns underlying
    | C-data that is shared between #[code Doc] objects.
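
p
    | As a quick illustration (assuming a loaded pipeline is available as
    | #[code nlp]), the shared vocab can be used to look up a lexeme and the
    | hash of its string:

+aside-code("Usage sketch").
    # look up a Lexeme and a string hash via the shared vocab
    coffee_lex = nlp.vocab[u'coffee']
    coffee_hash = nlp.vocab.strings[u'coffee']
    assert coffee_lex.orth == coffee_hash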

+h(2, "init") Vocab.__init__
    +tag method

p Create the vocabulary.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab = Vocab(strings=[u'hello', u'world'])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code lex_attr_getters]
        +cell dict
        +cell
            | A dictionary mapping attribute IDs to functions to compute them.
            | Defaults to #[code None].

    +row
        +cell #[code tag_map]
        +cell dict
        +cell
            | A dictionary mapping fine-grained tags to coarse-grained
            | parts-of-speech, and optionally morphological attributes.

    +row
        +cell #[code lemmatizer]
        +cell object
        +cell A lemmatizer. Defaults to #[code None].

    +row
        +cell #[code strings]
        +cell #[code StringStore] or list
        +cell
            | A #[+api("stringstore") #[code StringStore]] that maps
            | strings to hash values, and vice versa, or a list of strings.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The newly constructed object.

+h(2, "len") Vocab.__len__
    +tag method

p Get the current number of lexemes in the vocabulary.

+aside-code("Example").
    doc = nlp(u'This is a sentence.')
    assert len(nlp.vocab) > 0

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of lexemes in the vocabulary.

+h(2, "getitem") Vocab.__getitem__
    +tag method

p
    | Retrieve a lexeme, given an int ID or a unicode string. If a previously
    | unseen unicode string is given, a new lexeme is created and stored.

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    assert nlp.vocab[apple] == nlp.vocab[u'apple']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code id_or_string]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell #[code Lexeme]
        +cell The lexeme indicated by the given ID.

+h(2, "iter") Vocab.__iter__
    +tag method

p Iterate over the lexemes in the vocabulary.

+aside-code("Example").
    stop_words = (lex for lex in nlp.vocab if lex.is_stop)

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Lexeme]
        +cell An entry in the vocabulary.

+h(2, "contains") Vocab.__contains__
    +tag method

p
    | Check whether the string has an entry in the vocabulary. To get the ID
    | for a given string, you need to look it up in
    | #[+api("vocab#attributes") #[code vocab.strings]].

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    oov = nlp.vocab.strings['dskfodkfos']
    assert apple in nlp.vocab
    assert oov not in nlp.vocab

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The ID string.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the string has an entry in the vocabulary.

+h(2, "add_flag") Vocab.add_flag
    +tag method

p
    | Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
    | function will be called over the words currently in the vocab, and then
    | applied to new words as they occur. You'll then be able to access the flag
    | value on each token, using #[code token.check_flag(flag_id)].

+aside-code("Example").
    def is_my_product(text):
        products = [u'spaCy', u'Thinc', u'displaCy']
        return text in products

    MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
    doc = nlp(u'I like spaCy')
    assert doc[2].check_flag(MY_PRODUCT) == True

+table(["Name", "Type", "Description"])
    +row
        +cell #[code flag_getter]
        +cell callable
        +cell A function #[code f(unicode) -> bool], to get the flag value.

    +row
        +cell #[code flag_id]
        +cell int
        +cell
            | An integer between 1 and 63 (inclusive), specifying the bit at
            | which the flag will be stored. If #[code -1], the lowest
            | available bit will be chosen.

    +row("foot")
        +cell returns
        +cell int
        +cell The integer ID by which the flag value can be checked.

+h(2, "clear_vectors") Vocab.clear_vectors
    +tag method
    +tag-new(2)

p
    | Drop the current vector table. Because all vectors must be the same
    | width, you have to call this to change the size of the vectors.

+aside-code("Example").
    nlp.vocab.clear_vectors(new_dim=300)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code new_dim]
        +cell int
        +cell
            | Number of dimensions of the new vectors. If #[code None], size
            | is not changed.

+h(2, "prune_vectors") Vocab.prune_vectors
    +tag method
    +tag-new(2)

p
    | Reduce the current vector table to #[code nr_row] unique entries. Words
    | mapped to the discarded vectors will be remapped to the closest vector
    | among those remaining. For example, suppose the original table had
    | vectors for the words:
    | #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
    | vector table to two rows, we would discard the vectors for "feline"
    | and "reclined". These words would then be remapped to the closest
    | remaining vector – so "feline" would have the same vector as "cat",
    | and "reclined" would have the same vector as "sat". The similarities
    | are judged by cosine. The original vectors may be large, so the
    | cosines are calculated in minibatches to reduce memory usage.

+aside-code("Example").
    nlp.vocab.prune_vectors(10000)
    assert len(nlp.vocab.vectors) <= 10000

+table(["Name", "Type", "Description"])
    +row
        +cell #[code nr_row]
        +cell int
        +cell The number of rows to keep in the vector table.

    +row
        +cell #[code batch_size]
        +cell int
        +cell
            | Batch size to use when calculating the similarities. Larger
            | batch sizes might be faster, while temporarily requiring more
            | memory.

    +row("foot")
        +cell returns
        +cell dict
        +cell
            | A dictionary keyed by removed words mapped to
            | #[code (string, score)] tuples, where #[code string] is the
            | entry the removed word was mapped to, and #[code score] the
            | similarity score between the two words.
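
p
    | As a rough usage sketch (assuming a pipeline with vectors is loaded as
    | #[code nlp]), the returned dictionary can be inspected to see where
    | each discarded word ended up:

+aside-code("Usage sketch").
    remap = nlp.vocab.prune_vectors(10000)
    # each removed word maps to a (kept_entry, similarity_score) tuple
    for removed, (kept, score) in remap.items():
        print(removed, kept, score)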

+h(2, "get_vector") Vocab.get_vector
    +tag method
    +tag-new(2)

p
    | Retrieve a vector for a word in the vocabulary. Words can be looked up
    | by string or hash value. If no vectors data is loaded, a
    | #[code ValueError] is raised.

+aside-code("Example").
    nlp.vocab.get_vector(u'apple')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell
            | A word vector. Size and shape are determined by the
            | #[code Vocab.vectors] instance.

+h(2, "set_vector") Vocab.set_vector
    +tag method
    +tag-new(2)

p
    | Set a vector for a word in the vocabulary. Words can be referenced by
    | string or hash value.

+aside-code("Example").
    nlp.vocab.set_vector(u'apple', array([...]))

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row
        +cell #[code vector]
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell The vector to set.
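
p
    | A more complete sketch (assuming #[code numpy] is installed and a
    | vector width of 300, which should match the width of
    | #[code Vocab.vectors]):

+aside-code("Usage sketch").
    import numpy
    # a dummy 300-dimensional vector; real code would use meaningful values
    vector = numpy.zeros((300,), dtype='float32')
    nlp.vocab.set_vector(u'apple', vector)
    assert nlp.vocab.has_vector(u'apple')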

+h(2, "has_vector") Vocab.has_vector
    +tag method
    +tag-new(2)

p
    | Check whether a word has a vector. Returns #[code False] if no vectors
    | are loaded. Words can be looked up by string or hash value.

+aside-code("Example").
    if nlp.vocab.has_vector(u'apple'):
        vector = nlp.vocab.get_vector(u'apple')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the word has a vector.

+h(2, "to_disk") Vocab.to_disk
    +tag method
    +tag-new(2)

p Save the current state to a directory.

+aside-code("Example").
    nlp.vocab.to_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            | A path to a directory, which will be created if it doesn't exist.
            | Paths may be either strings or #[code Path]-like objects.

+h(2, "from_disk") Vocab.from_disk
    +tag method
    +tag-new(2)

p Load state from a directory. Modifies the object in place and returns it.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab = Vocab().from_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            | A path to a directory. Paths may be either strings or
            | #[code Path]-like objects.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The modified #[code Vocab] object.

+h(2, "to_bytes") Vocab.to_bytes
    +tag method

p Serialize the current state to a binary string.

+aside-code("Example").
    vocab_bytes = nlp.vocab.to_bytes()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being serialized.

    +row("foot")
        +cell returns
        +cell bytes
        +cell The serialized form of the #[code Vocab] object.

+h(2, "from_bytes") Vocab.from_bytes
    +tag method

p Load state from a binary string.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab_bytes = nlp.vocab.to_bytes()
    vocab = Vocab()
    vocab.from_bytes(vocab_bytes)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code bytes_data]
        +cell bytes
        +cell The data to load from.

    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The #[code Vocab] object.

+h(2, "attributes") Attributes

+aside-code("Example").
    apple_id = nlp.vocab.strings['apple']
    assert type(apple_id) == int
    PERSON = nlp.vocab.strings['PERSON']
    assert type(PERSON) == int

+table(["Name", "Type", "Description"])
    +row
        +cell #[code strings]
        +cell #[code StringStore]
        +cell A table managing the string-to-int mapping.

    +row
        +cell #[code vectors]
            +tag-new(2)
        +cell #[code Vectors]
        +cell A table associating word IDs to word vectors.

    +row
        +cell #[code vectors_length]
        +cell int
        +cell Number of dimensions for each word vector.