mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
279 lines
6.7 KiB
Plaintext
279 lines
6.7 KiB
Plaintext
//- 💫 DOCS > API > VOCAB
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
p
|
|
| A look-up table that allows you to access #[code Lexeme] objects. The
|
|
| #[code Vocab] instance also provides access to the #[code StringStore],
|
|
| and owns underlying C-data that is shared between #[code Doc] objects.
|
|
|
|
+h(2, "attributes") Attributes
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code strings]
|
|
+cell #[code StringStore]
|
|
+cell A table managing the string-to-int mapping.
|
|
|
|
+row
|
|
+cell #[code vectors_length]
|
|
+cell int
|
|
+cell The dimensionality of the word vectors, if present.
|
|
|
|
+h(2, "load") Vocab.load
|
|
+tag classmethod
|
|
|
|
p Load the vocabulary from a path.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code path]
|
|
+cell #[code Path]
|
|
+cell The path to load from.
|
|
|
|
+row
|
|
+cell #[code lex_attr_getters]
|
|
+cell dict
|
|
+cell
|
|
| A dictionary mapping attribute IDs to functions to compute them.
|
|
| Defaults to #[code None].
|
|
|
|
+row
|
|
+cell #[code lemmatizer]
|
|
+cell -
|
|
+cell A lemmatizer. Defaults to #[code None].
|
|
|
|
+row
|
|
+cell #[code tag_map]
|
|
+cell dict
|
|
+cell
|
|
| A dictionary mapping fine-grained tags to coarse-grained
|
|
| parts-of-speech, and optionally morphological attributes.
|
|
|
|
+row
|
|
+cell #[code oov_prob]
|
|
+cell float
|
|
+cell The default probability for out-of-vocabulary words.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code Vocab]
|
|
+cell The newly constructed object.
|
|
|
|
+h(2, "init") Vocab.__init__
|
|
+tag method
|
|
|
|
p Create the vocabulary.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code lex_attr_getters]
|
|
+cell dict
|
|
+cell
|
|
| A dictionary mapping attribute IDs to functions to compute them.
|
|
| Defaults to #[code None].
|
|
|
|
+row
|
|
+cell #[code lemmatizer]
|
|
+cell -
|
|
+cell A lemmatizer. Defaults to #[code None].
|
|
|
|
+row
|
|
+cell #[code tag_map]
|
|
+cell dict
|
|
+cell
|
|
| A dictionary mapping fine-grained tags to coarse-grained
|
|
| parts-of-speech, and optionally morphological attributes.
|
|
|
|
+row
|
|
+cell #[code oov_prob]
|
|
+cell float
|
|
+cell The default probability for out-of-vocabulary words.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code Vocab]
|
|
+cell The newly constructed object.
|
|
|
|
+h(2, "len") Vocab.__len__
|
|
+tag method
|
|
|
|
p Get the number of lexemes in the vocabulary.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+footrow
|
|
+cell return
|
|
+cell int
|
|
+cell The number of lexemes in the vocabulary.
|
|
|
|
+h(2, "getitem") Vocab.__getitem__
|
|
+tag method
|
|
|
|
p
|
|
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
|
| unseen unicode string is given, a new lexeme is created and stored.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code id_or_string]
|
|
+cell int / unicode
|
|
+cell The integer ID of a word, or its unicode string.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code Lexeme]
|
|
+cell The lexeme indicated by the given ID.
|
|
|
|
+h(2, "iter") Vocab.__iter__
|
|
+tag method
|
|
|
|
p Iterate over the lexemes in the vocabulary.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+footrow
|
|
+cell yield
|
|
+cell #[code Lexeme]
|
|
+cell An entry in the vocabulary.
|
|
|
|
+h(2, "contains") Vocab.__contains__
|
|
+tag method
|
|
|
|
p Check whether the string has an entry in the vocabulary.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code string]
|
|
+cell unicode
|
|
+cell The ID string.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell bool
|
|
+cell Whether the string has an entry in the vocabulary.
|
|
|
|
+h(2, "resize_vectors") Vocab.resize_vectors
|
|
+tag method
|
|
|
|
p
|
|
| Set #[code vectors_length] to a new size, and allocate more memory for
|
|
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code new_size]
|
|
+cell int
|
|
+cell The new size of the vectors.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code None]
|
|
+cell -
|
|
|
|
+h(2, "add_flag") Vocab.add_flag
|
|
+tag method
|
|
|
|
p Set a new boolean flag to words in the vocabulary.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code flag_getter]
|
|
+cell callable
|
|
+cell A function #[code f(unicode) -> bool], to get the flag value.
|
|
|
|
+row
|
|
+cell #[code flag_id]
|
|
+cell int
|
|
+cell
|
|
| An integer between 1 and 63 (inclusive), specifying the bit at
|
|
| which the flag will be stored. If #[code -1], the lowest
|
|
| available bit will be chosen.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell int
|
|
+cell The integer ID by which the flag value can be checked.
|
|
|
|
+h(2, "dump") Vocab.dump
|
|
+tag method
|
|
|
|
p Save the lexemes binary data to the given location.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code loc]
|
|
+cell #[code Path]
|
|
+cell The path to save to.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code None]
|
|
+cell -
|
|
|
|
+h(2, "load_lexemes") Vocab.load_lexemes
|
|
+tag method
|
|
|
|
p Load the lexemes from the #[code lexemes.bin] file at the given location.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code loc]
|
|
+cell unicode
|
|
+cell Path to load the lexemes.bin file from.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code None]
|
|
+cell -
|
|
|
|
+h(2, "dump_vectors") Vocab.dump_vectors
|
|
+tag method
|
|
|
|
p Save the word vectors to a binary file.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code loc]
|
|
+cell #[code Path]
|
|
+cell The path to save to.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell #[code None]
|
|
+cell -
|
|
|
|
+h(2, "load_vectors") Vocab.load_vectors
|
|
+tag method
|
|
|
|
p Load vectors from a text-based file.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code file_]
|
|
+cell buffer
|
|
+cell
|
|
| The file to read from. Entries should be separated by newlines,
|
|
| and each entry should be whitespace delimited. The first value
|
|
| of the entry should be the word string, and subsequent entries
|
|
| should be the values of the vector.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell int
|
|
+cell The length of the vectors loaded.
|
|
|
|
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
|
|
+tag method
|
|
|
|
p Load vectors from the location of a binary file.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code loc]
|
|
+cell unicode
|
|
+cell The path of the binary file to load from.
|
|
|
|
+footrow
|
|
+cell return
|
|
+cell int
|
|
+cell The length of the vectors loaded.
|