//- ---------------------------------- //- 💫 DOCS > API > VOCAB //- ---------------------------------- +section("vocab") +h(2, "vocab", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/vocab.pyx") | #[+tag class] Vocab p | A look-up table that allows you to access #[code.lang-python Lexeme] | objects. The #[code.lang-python Vocab] instance also provides access to | the #[code.lang-python StringStore], and owns underlying C-data that | is shared between #[code.lang-python Doc] objects. +aside('Caveat'). You should avoid working with #[code Doc], #[code Token] or #[code Span] objects backed by multiple different #[code Vocab] instances, as they may assume inconsistent string-to-integer encodings. All #[code Doc] objects produced by the same #[code Language] instance will hold a reference to the same #[code Vocab] instance. +code("python", "Overview"). class Vocab: StringStore strings Morphology morphology dict get_lex_attr int vectors_length def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None): return self @classmethod def load(cls, data_dir, get_lex_attr): return Vocab() @classmethod def from_package(cls, package, get_lex_attr=None, vectors_package=None): return Vocab() property serializer: return Packer() def __len__(self): return int def __contains__(self, string): return bool def __getitem__(self, id_or_string): return Lexeme() def dump(self, loc): return None def load_lexemes(self, loc): return None def dump_vectors(self, out_loc): return None def load_vectors(self, file_): return int def load_vectors_from_bin_loc(self, loc): return int +table(["Example", "Description"]) +row +cell #[code.lang-python lexeme = vocab[integer_id]] +cell. Get a lexeme by its orth ID. +row +cell #[code.lang-python lexeme = vocab[string]] +cell. Get a lexeme by the string corresponding to its orth ID. +row +cell #[code.lang-python for lexeme in vocab] +cell. Iterate over #[code Lexeme] objects. 
+row +cell #[code.lang-python int_id = vocab.strings[u'dog']] +cell. Access the #[code StringStore] via #[code vocab.strings] +row +cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab] +cell. The same #[code.lang-python Vocab] instance is shared by the #[code Language] instance, its tokenizer and every #[code.lang-python Doc] they produce. +section("vocab-dump") +h(3, "vocab-dump") | #[+tag method] Vocab.dump +code("python", "Definition"). def dump(self, loc): return None +table(["Name", "Type", "Description"]) +row +cell loc +cell #[+a(link_unicode) unicode] +cell Path where the vocabulary should be saved. +section("vocab-load_lexemes") +h(3, "vocab-load_lexemes") | #[+tag method] Vocab.load_lexemes +code("python", "Definition"). def load_lexemes(self, loc): return None +table(["Name", "Type", "Description"]) +row +cell loc +cell #[+a(link_unicode) unicode] +cell Path to load the lexemes.bin file from. +section("vocab-dump_vectors") +h(3, "vocab-dump_vectors") | #[+tag method] Vocab.dump_vectors +code("python", "Definition"). def dump_vectors(self, out_loc): return None +section("vocab-loadvectors") +h(3, "vocab-loadvectors") | #[+tag method] Vocab.load_vectors +code("python", "Definition"). def load_vectors(self, file_): return int +table(["Name", "Type", "Description"]) +row +cell file_ +cell #[+a(link_unicode) unicode] +cell A file-like object, to load word vectors from. +section("vocab-loadvectorsfrombinloc") +h(3, "vocab-saveload-loadvectorsfrom") | #[+tag method] Vocab.load_vectors_from_bin_loc +code("python", "Definition"). def load_vectors_from_bin_loc(self, loc): return int +table(["Name", "Type", "Description"]) +row +cell loc +cell #[+a(link_unicode) unicode] +cell. A path to a file, in spaCy's binary word-vectors file format.