mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			412 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			412 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > VOCAB
 | ||
| 
 | ||
| include ../_includes/_mixins
 | ||
| 
 | ||
| p
 | ||
|     |  The #[code Vocab] object provides a lookup table that allows you to
 | ||
|     |  access #[+api("lexeme") #[code Lexeme]] objects, as well as the
 | ||
|     |  #[+api("stringstore") #[code StringStore]]. It also owns underlying
 | ||
|     |  C-data that is shared between #[code Doc] objects.
 | ||
| 
 | ||
| +h(2, "init") Vocab.__init__
 | ||
|     +tag method
 | ||
| 
 | ||
| p Create the vocabulary.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     from spacy.vocab import Vocab
 | ||
|     vocab = Vocab(strings=[u'hello', u'world'])
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code lex_attr_getters]
 | ||
|         +cell dict
 | ||
|         +cell
 | ||
|             |  A dictionary mapping attribute IDs to functions to compute them.
 | ||
|             |  Defaults to #[code None].
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code tag_map]
 | ||
|         +cell dict
 | ||
|         +cell
 | ||
|             |  A dictionary mapping fine-grained tags to coarse-grained
 | ||
|             |  parts-of-speech, and optionally morphological attributes.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code lemmatizer]
 | ||
|         +cell object
 | ||
|         +cell A lemmatizer. Defaults to #[code None].
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code strings]
 | ||
|         +cell #[code StringStore] or list
 | ||
|         +cell
 | ||
|             |  A #[+api("stringstore") #[code StringStore]] that maps
 | ||
|             |  strings to hash values, and vice versa, or a list of strings.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code Vocab]
 | ||
|         +cell The newly constructed object.
 | ||
| 
 | ||
| +h(2, "len") Vocab.__len__
 | ||
|     +tag method
 | ||
| 
 | ||
| p Get the current number of lexemes in the vocabulary.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     doc = nlp(u'This is a sentence.')
 | ||
|     assert len(nlp.vocab) > 0
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell int
 | ||
|         +cell The number of lexemes in the vocabulary.
 | ||
| 
 | ||
| +h(2, "getitem") Vocab.__getitem__
 | ||
|     +tag method
 | ||
| 
 | ||
| p
 | ||
|     |  Retrieve a lexeme, given an int ID or a unicode string. If a previously
 | ||
|     |  unseen unicode string is given, a new lexeme is created and stored.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     apple = nlp.vocab.strings['apple']
 | ||
|     assert nlp.vocab[apple] == nlp.vocab[u'apple']
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code id_or_string]
 | ||
|         +cell int / unicode
 | ||
|         +cell The hash value of a word, or its unicode string.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code Lexeme]
 | ||
|         +cell The lexeme indicated by the given ID.
 | ||
| 
 | ||
| +h(2, "iter") Vocab.__iter__
 | ||
|     +tag method
 | ||
| 
 | ||
| p Iterate over the lexemes in the vocabulary.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     stop_words = (lex for lex in nlp.vocab if lex.is_stop)
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row("foot")
 | ||
|         +cell yields
 | ||
|         +cell #[code Lexeme]
 | ||
|         +cell An entry in the vocabulary.
 | ||
| 
 | ||
| +h(2, "contains") Vocab.__contains__
 | ||
|     +tag method
 | ||
| 
 | ||
| p
 | ||
|     |  Check whether the string has an entry in the vocabulary. To get the ID
 | ||
|     |  for a given string, you need to look it up in
 | ||
|     |  #[+api("vocab#attributes") #[code vocab.strings]].
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     apple = nlp.vocab.strings['apple']
 | ||
|     oov = nlp.vocab.strings['dskfodkfos']
 | ||
|     assert apple in nlp.vocab
 | ||
|     assert oov not in nlp.vocab
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code string]
 | ||
|         +cell unicode
 | ||
|         +cell The ID string.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell bool
 | ||
|         +cell Whether the string has an entry in the vocabulary.
 | ||
| 
 | ||
| +h(2, "add_flag") Vocab.add_flag
 | ||
|     +tag method
 | ||
| 
 | ||
| p
 | ||
|     |  Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
 | ||
|     |  function will be called over the words currently in the vocab, and then
 | ||
|     |  applied to new words as they occur. You'll then be able to access the flag
 | ||
|     |  value on each token, using #[code token.check_flag(flag_id)].
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     def is_my_product(text):
 | ||
|         products = [u'spaCy', u'Thinc', u'displaCy']
 | ||
|         return text in products
 | ||
| 
 | ||
|     MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
 | ||
|     doc = nlp(u'I like spaCy')
 | ||
|     assert doc[2].check_flag(MY_PRODUCT) == True
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code flag_getter]
 | ||
|         +cell callable
 | ||
|         +cell A function #[code f(unicode) -> bool], to get the flag value.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code flag_id]
 | ||
|         +cell int
 | ||
|         +cell
 | ||
|             |  An integer between 1 and 63 (inclusive), specifying the bit at
 | ||
|             |  which the flag will be stored. If #[code -1], the lowest
 | ||
|             |  available bit will be chosen.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell int
 | ||
|         +cell The integer ID by which the flag value can be checked.
 | ||
| 
 | ||
| +h(2, "reset_vectors") Vocab.reset_vectors
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p
 | ||
|     |  Drop the current vector table. Because all vectors must be the same
 | ||
|     |  width, you have to call this to change the size of the vectors. Only
 | ||
|     |  one of the #[code width] and #[code shape] keyword arguments can be
 | ||
|     |  specified.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     nlp.vocab.reset_vectors(width=300)
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code width]
 | ||
|         +cell int
 | ||
|         +cell The new width (keyword argument only).
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code shape]
 | ||
|         +cell int
 | ||
|         +cell The new shape (keyword argument only).
 | ||
| 
 | ||
| +h(2, "prune_vectors") Vocab.prune_vectors
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p
 | ||
|     |  Reduce the current vector table to #[code nr_row] unique entries. Words
 | ||
|     |  mapped to the discarded vectors will be remapped to the closest vector
 | ||
|     |  among those remaining. For example, suppose the original table had
 | ||
|     |  vectors for the words:
 | ||
|     |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
 | ||
|     |  vector table to two rows, we would discard the vectors for "feline"
 | ||
|     |  and "reclined". These words would then be remapped to the closest
 | ||
|     |  remaining vector – so "feline" would have the same vector as "cat",
 | ||
|     |  and "reclined" would have the same vector as "sat". The similarities are
 | ||
|     |  judged by cosine. The original vectors may be large, so the cosines are
 | ||
|     |  calculated in minibatches, to reduce memory usage.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     nlp.vocab.prune_vectors(10000)
 | ||
|     assert len(nlp.vocab.vectors) <= 10000
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code nr_row]
 | ||
|         +cell int
 | ||
|         +cell The number of rows to keep in the vector table.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code batch_size]
 | ||
|         +cell int
 | ||
|         +cell
 | ||
|             |  Batch of vectors for calculating the similarities. Larger batch
 | ||
|             |  sizes might be faster, while temporarily requiring more memory.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell dict
 | ||
|         +cell
 | ||
|             |  A dictionary keyed by removed words mapped to
 | ||
|             |  #[code (string, score)] tuples, where #[code string] is the entry
 | ||
|             |  the removed word was mapped to, and #[code score] the similarity
 | ||
|             |  score between the two words.
 | ||
| 
 | ||
| +h(2, "get_vector") Vocab.get_vector
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p
 | ||
|     |  Retrieve a vector for a word in the vocabulary. Words can be looked up
 | ||
|     |  by string or hash value. If no vectors data is loaded, a
 | ||
|     |  #[code ValueError] is raised.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     nlp.vocab.get_vector(u'apple')
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code orth]
 | ||
|         +cell int / unicode
 | ||
|         +cell The hash value of a word, or its unicode string.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
 | ||
|         +cell
 | ||
|             |  A word vector. Size and shape are determined by the
 | ||
|             |  #[code Vocab.vectors] instance.
 | ||
| 
 | ||
| +h(2, "set_vector") Vocab.set_vector
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p
 | ||
|     |  Set a vector for a word in the vocabulary. Words can be referenced by
 | ||
|     |  string or hash value.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     nlp.vocab.set_vector(u'apple', array([...]))
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code orth]
 | ||
|         +cell int / unicode
 | ||
|         +cell The hash value of a word, or its unicode string.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code vector]
 | ||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
 | ||
|         +cell The vector to set.
 | ||
| 
 | ||
| +h(2, "has_vector") Vocab.has_vector
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p
 | ||
|     |  Check whether a word has a vector. Returns #[code False] if no vectors
 | ||
|     |  are loaded. Words can be looked up by string or hash value.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     if nlp.vocab.has_vector(u'apple'):
 | ||
|         vector = nlp.vocab.get_vector(u'apple')
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code orth]
 | ||
|         +cell int / unicode
 | ||
|         +cell The hash value of a word, or its unicode string.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell bool
 | ||
|         +cell Whether the word has a vector.
 | ||
| 
 | ||
| +h(2, "to_disk") Vocab.to_disk
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p Save the current state to a directory.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     nlp.vocab.to_disk('/path/to/vocab')
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code path]
 | ||
|         +cell unicode or #[code Path]
 | ||
|         +cell
 | ||
|             |  A path to a directory, which will be created if it doesn't exist.
 | ||
|             |  Paths may be either strings or #[code Path]-like objects.
 | ||
| 
 | ||
| +h(2, "from_disk") Vocab.from_disk
 | ||
|     +tag method
 | ||
|     +tag-new(2)
 | ||
| 
 | ||
| p Loads state from a directory. Modifies the object in place and returns it.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     from spacy.vocab import Vocab
 | ||
|     vocab = Vocab().from_disk('/path/to/vocab')
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code path]
 | ||
|         +cell unicode or #[code Path]
 | ||
|         +cell
 | ||
|             |  A path to a directory. Paths may be either strings or
 | ||
|             |  #[code Path]-like objects.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code Vocab]
 | ||
|         +cell The modified #[code Vocab] object.
 | ||
| 
 | ||
| +h(2, "to_bytes") Vocab.to_bytes
 | ||
|     +tag method
 | ||
| 
 | ||
| p Serialize the current state to a binary string.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     vocab_bytes = nlp.vocab.to_bytes()
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code **exclude]
 | ||
|         +cell -
 | ||
|         +cell Named attributes to prevent from being serialized.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell bytes
 | ||
|         +cell The serialized form of the #[code Vocab] object.
 | ||
| 
 | ||
| +h(2, "from_bytes") Vocab.from_bytes
 | ||
|     +tag method
 | ||
| 
 | ||
| p Load state from a binary string.
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     from spacy.vocab import Vocab
 | ||
|     vocab_bytes = nlp.vocab.to_bytes()
 | ||
|     vocab = Vocab()
 | ||
|     vocab.from_bytes(vocab_bytes)
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code bytes_data]
 | ||
|         +cell bytes
 | ||
|         +cell The data to load from.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code **exclude]
 | ||
|         +cell -
 | ||
|         +cell Named attributes to prevent from being loaded.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code Vocab]
 | ||
|         +cell The #[code Vocab] object.
 | ||
| 
 | ||
| +h(2, "attributes") Attributes
 | ||
| 
 | ||
| +aside-code("Example").
 | ||
|     apple_id = nlp.vocab.strings['apple']
 | ||
|     assert type(apple_id) == int
 | ||
|     PERSON = nlp.vocab.strings['PERSON']
 | ||
|     assert type(PERSON) == int
 | ||
| 
 | ||
| +table(["Name", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code strings]
 | ||
|         +cell #[code StringStore]
 | ||
|         +cell A table managing the string-to-int mapping.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code vectors]
 | ||
|             +tag-new(2)
 | ||
|         +cell #[code Vectors]
 | ||
|         +cell A table associating word IDs to word vectors.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code vectors_length]
 | ||
|         +cell int
 | ||
|         +cell Number of dimensions for each word vector.
 |