spaCy/website/docs/_api-vocab.jade

//-  Docs > API > Vocab
//- ============================================================================

+section('vocab')
    +h2('vocab', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/vocab.pyx#L47')
        | #[+label('tag') class] Vocab

    p
        | A look-up table that allows you to access #[code.lang-python Lexeme]
        | objects. The #[code.lang-python Vocab] instance also provides access to
        | the #[code.lang-python StringStore], and owns underlying C-data that
        | is shared between #[code.lang-python Doc] objects.

        +aside('Caveat').
            You should avoid working with #[code Doc], #[code Token] or #[code Span]
            objects backed by multiple different #[code Vocab] instances, as
            they may assume inconsistent string-to-integer encodings. All #[code Doc]
            objects produced by the same #[code Language] instance will hold
            a reference to the same #[code Vocab] instance.

    +code('python', 'Overview').
        class Vocab:
            StringStore strings
            Morphology morphology
            dict get_lex_attr
            int vectors_length

            def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
                return self

            @classmethod
            def load(cls, data_dir, get_lex_attr):
                return Vocab()

            @classmethod
            def from_package(cls, package, get_lex_attr=None, vectors_package=None):
                return Vocab()

            property serializer:
                return Packer()

            def __len__(self):
                return int

            def __contains__(self, string):
                return bool

            def __getitem__(self, id_or_string):
                return Lexeme()

            def dump(self, loc):
                return None

            def load_lexemes(self, loc):
                return None

            def dump_vectors(self, out_loc):
                return None

            def load_vectors(self, file_):
                return int

            def load_vectors_from_bin_loc(self, loc):
                return int
    
    +table(['Example', 'Description'], 'code')
        +row
            +cell #[code.lang-python lexeme = vocab[integer_id]]
            +cell.
                Get a lexeme by its orth ID.

        +row
            +cell #[code.lang-python lexeme = vocab[string]]
            +cell.
                Get a lexeme by the string corresponding to its orth ID.

        +row
            +cell #[code.lang-python for lexeme in vocab]
            +cell.
                Iterate over #[code Lexeme] objects.
        +row
            +cell #[code.lang-python int_id = vocab.strings[u'dog']]
            +cell.
                Access the #[code StringStore] via #[code vocab.strings].
        +row
            +cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
            +cell.
                The same #[code.lang-python Vocab] instance is shared by the pipeline, its tokenizer and every #[code.lang-python Doc] it produces.

    +section('vocab-dump')
        +h3('vocab-dump')
            | #[+label('tag') method] Vocab.dump

        +code('python', 'definition').
            def dump(self, loc):
                return None

        +table(['Name', 'Type', 'Description'], 'params')
            +row
                +cell loc
                +cell #[a(href=link_unicode target='_blank') unicode]
                +cell.
                    Path where the vocabulary should be saved.

    +section('vocab-load_lexemes')
        +h3('vocab-load_lexemes')
            | #[+label('tag') method] Vocab.load_lexemes

        +code('python', 'definition').
            def load_lexemes(self, loc):
                return None

        +table(['Name', 'Type', 'Description'], 'params')
            +row
                +cell loc
                +cell #[a(href=link_unicode target='_blank') unicode]
                +cell.
                    Path to load the lexemes.bin file from.

    +section('vocab-dump_vectors')
        +h3('vocab-dump_vectors')
            | #[+label('tag') method] Vocab.dump_vectors

        +code('python', 'definition').
            def dump_vectors(self, out_loc):
                return None

    +section('vocab-loadvectors')
        +h3('vocab-loadvectors')
            | #[+label('tag') method] Vocab.load_vectors

        +code('python', 'definition').
            def load_vectors(self, file_):
                return int

        +table(['Name', 'Type', 'Description'], 'params')
            +row
                +cell file_
                +cell file
                +cell.
                    A file-like object, to load word vectors from.


    +section('vocab-loadvectorsfrombinloc')
        +h3('vocab-loadvectorsfrombinloc')
            | #[+label('tag') method] Vocab.load_vectors_from_bin_loc

        +code('python', 'definition').
            def load_vectors_from_bin_loc(self, loc):
                return int

        +table(['Name', 'Type', 'Description'], 'params')
            +row
                +cell loc
                +cell #[a(href=link_unicode target='_blank') unicode]
                +cell.
                    A path to a file, in spaCy's binary word-vectors file format.
Replace website with new version 2016-03-31 17:24:48 +03:00			`//- Docs > API > Vocab`
			`//- ============================================================================`

			`+section('vocab')`
			`+h2('vocab', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/vocab.pyx#L47')`
			`\| #[+label('tag') class] Vocab`

			`p`
			`\| A look-up table that allows you to access #[code.lang-python Lexeme]`
			`\| objects. The #[code.lang-python Vocab] instance also provides access to`
			`\| the #[code.lang-python StringStore], and owns underlying C-data that`
			`\| is shared between #[code.lang-python Doc] objects.`

			`+aside('Caveat').`
			`You should avoid working with #[code Doc], #[code Token] or #[code Span]`
			`objects backed by multiple different #[code Vocab] instances, as`
			`they may assume inconsistent string-to-integer encodings. All #[code Doc]`
			`objects produced by the same #[code Language] instance will hold`
			`a reference to the same #[code Vocab] instance.`

			`+code('python', 'Overview').`
			`class Vocab:`
			`StringStore strings`
			`Morphology morphology`
			`dict get_lex_attr`
			`int vectors_length`

			`def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):`
			`return self`

			`@classmethod`
			`def load(cls, data_dir, get_lex_attr):`
			`return Vocab()`

			`@classmethod`
			`def from_package(cls, package, get_lx_attr=None, vectors_package=None):`
			`return Vocab()`

			`property serializer:`
			`return Packer()`

			`def __len__(self):`
			`return int`

			`def __contains__(self, string):`
			`return bool`

			`def __getitem__(self, id_or_string):`
			`return Lexeme()`

			`def dump(self, loc):`
			`return None`

			`def load_lexemes(self, loc):`
			`return None`

			`def dump_vectors(self, out_loc):`
			`return None`

			`def load_vectors(self, file_):`
			`return int`

			`def load_vectors_from_bin_loc(self, loc):`
			`return int`

			`+table(['Example', 'Description'], 'code')`
			`+row`
			`+cell #[code.lang-python lexeme = vocab[integer_id]]`
			`+cell.`
			`Get a lexeme by its orth ID.`

			`+row`
			`+cell #[code.lang-python lexeme = vocab[string]]`
			`+cell.`
			`Get a lexeme by the string corresponding to its orth ID.`

			`+row`
			`+cell #[code.lang-python for lexeme in vocab]`
			`+cell.`
			`Iterate over #[code Lexeme] objects.`
			`+row`
			`+cell #[code.lang-python int_id = vocab.strings[u'dog']]`
			`+cell.`
			`Access the #[code StringStore] via #[code vocab.strings]`
			`+row`
			`+cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]`
			`+cell.`
			`Access the from #[code.lang-python Doc]`

			`+section('vocab-dump')`
			`+h3('vocab-dump')`
			`\| #[+label('tag') method] Vocab.dump`

			`+code('python', 'definition').`
			`def dump(self, loc):`
			`return None`

			`+table(['Name', 'Type', 'Description'], 'params')`
			`+row`
			`+cell loc`
			`+cell #[a(href=link_unicode target='_blank') unicode]`
			`+cell.`
			`Path where the vocabulary should be saved.`

			`+section('vocab-load_lexemes')`
			`+h3('vocab-load_lexemes')`
			`\| #[+label('tag') method] Vocab.load_lexemes`

			`+code('python', 'definition').`
			`def load_lexemes(self, loc):`
			`return None`

			`+table(['Name', 'Type', 'Description'], 'params')`
			`+row`
			`+cell loc`
			`+cell #[a(href=link_unicode target='_blank') unicode]`
			`+cell.`
			`Path to load the lexemes.bin file from.`

			`+section('vocab-dump_vectors')`
			`+h3('vocab-dump_vectors')`
			`\| #[+label('tag') method] Vocab.dump_vectors`

			`+code('python', 'definition').`
			`def dump_vectors(self, loc):`
			`return None`

			`+section('vocab-loadvectors')`
			`+h3('vocab-loadvectors')`
			`\| #[+label('tag') method] Vocab.load_vectors`

			`+code('python', 'definition').`
			`def load_vectors(self, file_):`
			`return None`

			`+table(['Name', 'Type', 'Description'], 'params')`
			`+row`
			`+cell file`
			`+cell #[a(href=link_unicode target='_blank') unicode]`
			`+cell.`
			`A file-like object, to load word vectors from.`


			`+section('vocab-loadvectorsfrombinloc')`
			`+h3('vocab-saveload-loadvectorsfrom')`
			`\| #[+label('tag') method] Vocab.load_vectors_from_bin_loc`

			`+code('python', 'definition').`
			`def load_vectors_from_bin_loc(self, loc):`
			`return None`

			`+table(['Name', 'Type', 'Description'], 'params')`
			`+row`
			`+cell loc`
			`+cell #[a(href=link_unicode target='_blank') unicode]`
			`+cell.`
			`A path to a file, in spaCy's binary word-vectors file format.`