//- 💫 DOCS > API > VOCAB include ../_includes/_mixins p | The #[code Vocab] object provides a lookup table that allows you to | access #[+api("lexeme") #[code Lexeme]] objects, as well as the | #[+api("stringstore") #[code StringStore]]. It also owns underlying | C-data that is shared between #[code Doc] objects. +h(2, "init") Vocab.__init__ +tag method p Create the vocabulary. +aside-code("Example"). from spacy.vocab import Vocab vocab = Vocab(strings=[u'hello', u'world']) +table(["Name", "Type", "Description"]) +row +cell #[code lex_attr_getters] +cell dict +cell | A dictionary mapping attribute IDs to functions to compute them. | Defaults to #[code None]. +row +cell #[code tag_map] +cell dict +cell | A dictionary mapping fine-grained tags to coarse-grained | parts-of-speech, and optionally morphological attributes. +row +cell #[code lemmatizer] +cell object +cell A lemmatizer. Defaults to #[code None]. +row +cell #[code strings] +cell #[code StringStore] or list +cell | A #[+api("stringstore") #[code StringStore]] that maps | strings to hash values, and vice versa, or a list of strings. +row("foot") +cell returns +cell #[code Vocab] +cell The newly constructed object. +h(2, "len") Vocab.__len__ +tag method p Get the current number of lexemes in the vocabulary. +aside-code("Example"). doc = nlp(u'This is a sentence.') assert len(nlp.vocab) > 0 +table(["Name", "Type", "Description"]) +row("foot") +cell returns +cell int +cell The number of lexemes in the vocabulary. +h(2, "getitem") Vocab.__getitem__ +tag method p | Retrieve a lexeme, given an int ID or a unicode string. If a previously | unseen unicode string is given, a new lexeme is created and stored. +aside-code("Example"). apple = nlp.vocab.strings['apple'] assert nlp.vocab[apple] == nlp.vocab[u'apple'] +table(["Name", "Type", "Description"]) +row +cell #[code id_or_string] +cell int / unicode +cell The hash value of a word, or its unicode string. 
+row("foot") +cell returns +cell #[code Lexeme] +cell The lexeme indicated by the given ID. +h(2, "iter") Vocab.__iter__ +tag method p Iterate over the lexemes in the vocabulary. +aside-code("Example"). stop_words = (lex for lex in nlp.vocab if lex.is_stop) +table(["Name", "Type", "Description"]) +row("foot") +cell yields +cell #[code Lexeme] +cell An entry in the vocabulary. +h(2, "contains") Vocab.__contains__ +tag method p | Check whether the string has an entry in the vocabulary. To get the ID | for a given string, you need to look it up in | #[+api("vocab#attributes") #[code vocab.strings]]. +aside-code("Example"). apple = nlp.vocab.strings['apple'] oov = nlp.vocab.strings['dskfodkfos'] assert apple in nlp.vocab assert oov not in nlp.vocab +table(["Name", "Type", "Description"]) +row +cell #[code string] +cell unicode +cell The ID string. +row("foot") +cell returns +cell bool +cell Whether the string has an entry in the vocabulary. +h(2, "add_flag") Vocab.add_flag +tag method p | Set a new boolean flag to words in the vocabulary. The #[code flag_getter] | function will be called over the words currently in the vocab, and then | applied to new words as they occur. You'll then be able to access the flag | value on each token, using #[code token.check_flag(flag_id)]. +aside-code("Example"). def is_my_product(text): products = [u'spaCy', u'Thinc', u'displaCy'] return text in products MY_PRODUCT = nlp.vocab.add_flag(is_my_product) doc = nlp(u'I like spaCy') assert doc[2].check_flag(MY_PRODUCT) == True +table(["Name", "Type", "Description"]) +row +cell #[code flag_getter] +cell callable +cell A function #[code f(unicode) -> bool], to get the flag value. +row +cell #[code flag_id] +cell int +cell | An integer between 1 and 63 (inclusive), specifying the bit at | which the flag will be stored. If #[code -1], the lowest | available bit will be chosen. +row("foot") +cell returns +cell int +cell The integer ID by which the flag value can be checked. 
+h(2, "clear_vectors") Vocab.clear_vectors +tag method +tag-new(2) p | Drop the current vector table. Because all vectors must be the same | width, you have to call this to change the size of the vectors. +aside-code("Example"). nlp.vocab.clear_vectors(new_dim=300) +table(["Name", "Type", "Description"]) +row +cell #[code new_dim] +cell int +cell | Number of dimensions of the new vectors. If #[code None], size | is not changed. +h(2, "get_vector") Vocab.get_vector +tag method +tag-new(2) p | Retrieve a vector for a word in the vocabulary. Words can be looked up | by string or hash value. If no vectors data is loaded, a | #[code ValueError] is raised. +aside-code("Example"). nlp.vocab.get_vector(u'apple') +table(["Name", "Type", "Description"]) +row +cell #[code orth] +cell int / unicode +cell The hash value of a word, or its unicode string. +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell | A word vector. Size and shape are determined by the | #[code Vocab.vectors] instance. +h(2, "set_vector") Vocab.set_vector +tag method +tag-new(2) p | Set a vector for a word in the vocabulary. Words can be referenced | by string or hash value. +aside-code("Example"). nlp.vocab.set_vector(u'apple', array([...])) +table(["Name", "Type", "Description"]) +row +cell #[code orth] +cell int / unicode +cell The hash value of a word, or its unicode string. +row +cell #[code vector] +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell The vector to set. +h(2, "has_vector") Vocab.has_vector +tag method +tag-new(2) p | Check whether a word has a vector. Returns #[code False] if no vectors | are loaded. Words can be looked up by string or hash value. +aside-code("Example"). if nlp.vocab.has_vector(u'apple'): vector = nlp.vocab.get_vector(u'apple') +table(["Name", "Type", "Description"]) +row +cell #[code orth] +cell int / unicode +cell The hash value of a word, or its unicode string. 
+row("foot") +cell returns +cell bool +cell Whether the word has a vector. +h(2, "to_disk") Vocab.to_disk +tag method +tag-new(2) p Save the current state to a directory. +aside-code("Example"). nlp.vocab.to_disk('/path/to/vocab') +table(["Name", "Type", "Description"]) +row +cell #[code path] +cell unicode or #[code Path] +cell | A path to a directory, which will be created if it doesn't exist. | Paths may be either strings or #[code Path]-like objects. +h(2, "from_disk") Vocab.from_disk +tag method +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. +aside-code("Example"). from spacy.vocab import Vocab vocab = Vocab().from_disk('/path/to/vocab') +table(["Name", "Type", "Description"]) +row +cell #[code path] +cell unicode or #[code Path] +cell | A path to a directory. Paths may be either strings or | #[code Path]-like objects. +row("foot") +cell returns +cell #[code Vocab] +cell The modified #[code Vocab] object. +h(2, "to_bytes") Vocab.to_bytes +tag method p Serialize the current state to a binary string. +aside-code("Example"). vocab_bytes = nlp.vocab.to_bytes() +table(["Name", "Type", "Description"]) +row +cell #[code **exclude] +cell - +cell Named attributes to prevent from being serialized. +row("foot") +cell returns +cell bytes +cell The serialized form of the #[code Vocab] object. +h(2, "from_bytes") Vocab.from_bytes +tag method p Load state from a binary string. +aside-code("Example"). from spacy.vocab import Vocab vocab_bytes = nlp.vocab.to_bytes() vocab = Vocab() vocab.from_bytes(vocab_bytes) +table(["Name", "Type", "Description"]) +row +cell #[code bytes_data] +cell bytes +cell The data to load from. +row +cell #[code **exclude] +cell - +cell Named attributes to prevent from being loaded. +row("foot") +cell returns +cell #[code Vocab] +cell The #[code Vocab] object. +h(2, "attributes") Attributes +aside-code("Example"). 
apple_id = nlp.vocab.strings['apple'] assert type(apple_id) == int PERSON = nlp.vocab.strings['PERSON'] assert type(PERSON) == int +table(["Name", "Type", "Description"]) +row +cell #[code strings] +cell #[code StringStore] +cell A table managing the string-to-int mapping. +row +cell #[code vectors] +tag-new(2) +cell #[code Vectors] +cell A table associating word IDs to word vectors. +row +cell #[code vectors_length] +cell int +cell Number of dimensions for each word vector.