mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			155 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			155 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- ----------------------------------
 | |
| //- 💫 DOCS > API > VOCAB
 | |
| //- ----------------------------------
 | |
| 
 | |
| +section("vocab")
 | |
|     +h(2, "vocab", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/vocab.pyx")
 | |
|         | #[+tag class] Vocab
 | |
| 
 | |
|     p
 | |
|         | A look-up table that allows you to access #[code.lang-python Lexeme]
 | |
|         | objects. The #[code.lang-python Vocab] instance also provides access to
 | |
|         | the #[code.lang-python StringStore], and owns underlying C-data that
 | |
|         | is shared between #[code.lang-python Doc] objects.
 | |
| 
 | |
|         +aside('Caveat').
 | |
|             You should avoid working with #[code Doc], #[code Token] or #[code Span]
 | |
|             objects backed by multiple different #[code Vocab] instances, as
 | |
|             they may assume inconsistent string-to-integer encodings. All #[code Doc]
 | |
|             objects produced by the same #[code Language] instance will hold
 | |
|             a reference to the same #[code Vocab] instance.
 | |
| 
 | |
|     +code("python", "Overview").
 | |
|         class Vocab:
 | |
|             StringStore strings
 | |
|             Morphology morphology
 | |
|             dict get_lex_attr
 | |
|             int vectors_length
 | |
| 
 | |
|             def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
 | |
|                 return self
 | |
| 
 | |
|             @classmethod
 | |
|             def load(cls, data_dir, get_lex_attr):
 | |
|                 return Vocab()
 | |
| 
 | |
|             @classmethod
 | |
|             def from_package(cls, package, get_lex_attr=None, vectors_package=None):
 | |
|                 return Vocab()
 | |
| 
 | |
|             property serializer:
 | |
|                 return Packer()
 | |
| 
 | |
|             def __len__(self):
 | |
|                 return int
 | |
| 
 | |
|             def __contains__(self, string):
 | |
|                 return bool
 | |
| 
 | |
|             def __getitem__(self, id_or_string):
 | |
|                 return Lexeme()
 | |
| 
 | |
|             def dump(self, loc):
 | |
|                 return None
 | |
| 
 | |
|             def load_lexemes(self, loc):
 | |
|                 return None
 | |
| 
 | |
|             def dump_vectors(self, out_loc):
 | |
|                 return None
 | |
| 
 | |
|             def load_vectors(self, file_):
 | |
|                 return int
 | |
| 
 | |
|             def load_vectors_from_bin_loc(self, loc):
 | |
|                 return int
 | |
| 
 | |
|     +table(["Example", "Description"])
 | |
|         +row
 | |
|             +cell #[code.lang-python lexeme = vocab[integer_id]]
 | |
|             +cell.
 | |
|                 Get a lexeme by its orth ID.
 | |
| 
 | |
|         +row
 | |
|             +cell #[code.lang-python lexeme = vocab[string]]
 | |
|             +cell.
 | |
|                 Get a lexeme by the string corresponding to its orth ID.
 | |
| 
 | |
|         +row
 | |
|             +cell #[code.lang-python for lexeme in vocab]
 | |
|             +cell.
 | |
|                 Iterate over #[code Lexeme] objects.
 | |
|         +row
 | |
|             +cell #[code.lang-python int_id = vocab.strings[u'dog']]
 | |
|             +cell.
 | |
|                 Access the #[code StringStore] via #[code vocab.strings].
 | |
|         +row
 | |
|             +cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
 | |
|             +cell.
 | |
|                 Access the shared #[code.lang-python Vocab] from the pipeline or its tokenizer; both hold the same instance.
 | |
| 
 | |
|     +section("vocab-dump")
 | |
|         +h(3, "vocab-dump")
 | |
|             | #[+tag method] Vocab.dump
 | |
| 
 | |
|         +code("python", "Definition").
 | |
|             def dump(self, loc):
 | |
|                 return None
 | |
| 
 | |
|         +table(["Name", "Type", "Description"])
 | |
|             +row
 | |
|                 +cell loc
 | |
|                 +cell #[+a(link_unicode) unicode]
 | |
|                 +cell Path where the vocabulary should be saved.
 | |
| 
 | |
|     +section("vocab-load_lexemes")
 | |
|         +h(3, "vocab-load_lexemes")
 | |
|             | #[+tag method] Vocab.load_lexemes
 | |
| 
 | |
|         +code("python", "Definition").
 | |
|             def load_lexemes(self, loc):
 | |
|                 return None
 | |
| 
 | |
|         +table(["Name", "Type", "Description"])
 | |
|             +row
 | |
|                 +cell loc
 | |
|                 +cell #[+a(link_unicode) unicode]
 | |
|                 +cell Path to load the lexemes.bin file from.
 | |
| 
 | |
|     +section("vocab-dump_vectors")
 | |
|         +h(3, "vocab-dump_vectors")
 | |
|             | #[+tag method] Vocab.dump_vectors
 | |
| 
 | |
|         +code("python", "Definition").
 | |
|             def dump_vectors(self, out_loc):
 | |
|                 return None
 | |
| 
 | |
|     +section("vocab-loadvectors")
 | |
|         +h(3, "vocab-loadvectors")
 | |
|             | #[+tag method] Vocab.load_vectors
 | |
| 
 | |
|         +code("python", "Definition").
 | |
|             def load_vectors(self, file_):
 | |
|                 return int
 | |
| 
 | |
|         +table(["Name", "Type", "Description"])
 | |
|             +row
 | |
|                 +cell file_
 | |
|                 +cell #[+a(link_unicode) unicode]
 | |
|                 +cell A file-like object, to load word vectors from.
 | |
| 
 | |
|     +section("vocab-loadvectorsfrombinloc")
 | |
|         +h(3, "vocab-loadvectorsfrombinloc")
 | |
|             | #[+tag method] Vocab.load_vectors_from_bin_loc
 | |
| 
 | |
|         +code("python", "Definition").
 | |
|             def load_vectors_from_bin_loc(self, loc):
 | |
|                 return int
 | |
| 
 | |
|         +table(["Name", "Type", "Description"])
 | |
|             +row
 | |
|                 +cell loc
 | |
|                 +cell #[+a(link_unicode) unicode]
 | |
|                 +cell.
 | |
|                     A path to a file, in spaCy's binary word-vectors file format.
 |