mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			257 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			257 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
//- 💫 DOCS > API > VOCAB
 | 
						|
 | 
						|
include ../../_includes/_mixins
 | 
						|
 | 
						|
p
 | 
						|
    |  A look-up table that allows you to access #[code Lexeme] objects. The
 | 
						|
    |  #[code Vocab] instance also provides access to the #[code StringStore],
 | 
						|
    |  and owns underlying C-data that is shared between #[code Doc] objects.
 | 
						|
 | 
						|
+h(2, "init") Vocab.__init__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Create the vocabulary.
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code lex_attr_getters]
 | 
						|
        +cell dict
 | 
						|
        +cell
 | 
						|
            |  A dictionary mapping attribute IDs to functions to compute them.
 | 
						|
            |  Defaults to #[code None].
 | 
						|
 | 
						|
    +row
 | 
						|
        +cell #[code tag_map]
 | 
						|
        +cell dict
 | 
						|
        +cell
 | 
						|
            |  A dictionary mapping fine-grained tags to coarse-grained
 | 
						|
            |  parts-of-speech, and optionally morphological attributes.
 | 
						|
 | 
						|
    +row
 | 
						|
        +cell #[code lemmatizer]
 | 
						|
        +cell object
 | 
						|
        +cell A lemmatizer. Defaults to #[code None].
 | 
						|
 | 
						|
    +row
 | 
						|
        +cell #[code strings]
 | 
						|
        +cell #[code StringStore]
 | 
						|
        +cell
 | 
						|
            |  A #[code StringStore] that maps strings to integers, and vice
 | 
						|
            |  versa.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell #[code Vocab]
 | 
						|
        +cell The newly constructed object.
 | 
						|
 | 
						|
+h(2, "len") Vocab.__len__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Get the current number of lexemes in the vocabulary.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    doc = nlp(u'This is a sentence.')
 | 
						|
    assert len(nlp.vocab) > 0
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell int
 | 
						|
        +cell The number of lexemes in the vocabulary.
 | 
						|
 | 
						|
+h(2, "getitem") Vocab.__getitem__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p
 | 
						|
    |  Retrieve a lexeme, given an int ID or a unicode string. If a previously
 | 
						|
    |  unseen unicode string is given, a new lexeme is created and stored.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    apple = nlp.vocab.strings['apple']
 | 
						|
    assert nlp.vocab[apple] == nlp.vocab[u'apple']
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code id_or_string]
 | 
						|
        +cell int / unicode
 | 
						|
        +cell The integer ID of a word, or its unicode string.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell #[code Lexeme]
 | 
						|
        +cell The lexeme indicated by the given ID.
 | 
						|
 | 
						|
+h(2, "iter") Vocab.__iter__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Iterate over the lexemes in the vocabulary.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    stop_words = (lex for lex in nlp.vocab if lex.is_stop)
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +footrow
 | 
						|
        +cell yields
 | 
						|
        +cell #[code Lexeme]
 | 
						|
        +cell An entry in the vocabulary.
 | 
						|
 | 
						|
+h(2, "contains") Vocab.__contains__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p
 | 
						|
    |  Check whether the string has an entry in the vocabulary. To get the ID
 | 
						|
    |  for a given string, you need to look it up in
 | 
						|
    |  #[+api("vocab#attributes") #[code vocab.strings]].
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    apple = nlp.vocab.strings['apple']
 | 
						|
    oov = nlp.vocab.strings['dskfodkfos']
 | 
						|
    assert apple in nlp.vocab
 | 
						|
    assert oov not in nlp.vocab
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code string]
 | 
						|
        +cell unicode
 | 
						|
        +cell The ID string.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell bool
 | 
						|
        +cell Whether the string has an entry in the vocabulary.
 | 
						|
 | 
						|
+h(2, "add_flag") Vocab.add_flag
 | 
						|
    +tag method
 | 
						|
 | 
						|
p
 | 
						|
    |  Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
 | 
						|
    |  function will be called over the words currently in the vocab, and then
 | 
						|
    |  applied to new words as they occur. You'll then be able to access the flag
 | 
						|
    |  value on each token, using #[code token.check_flag(flag_id)].
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    def is_my_product(text):
 | 
						|
        products = [u'spaCy', u'Thinc', u'displaCy']
 | 
						|
        return text in products
 | 
						|
 | 
						|
    MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
 | 
						|
    doc = nlp(u'I like spaCy')
 | 
						|
    assert doc[2].check_flag(MY_PRODUCT) == True
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code flag_getter]
 | 
						|
        +cell callable
 | 
						|
        +cell A function #[code f(unicode) -> bool], to get the flag value.
 | 
						|
 | 
						|
    +row
 | 
						|
        +cell #[code flag_id]
 | 
						|
        +cell int
 | 
						|
        +cell
 | 
						|
            |  An integer between 1 and 63 (inclusive), specifying the bit at
 | 
						|
            |  which the flag will be stored. If #[code -1], the lowest
 | 
						|
            |  available bit will be chosen.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell int
 | 
						|
        +cell The integer ID by which the flag value can be checked.
 | 
						|
 | 
						|
+h(2, "to_disk") Vocab.to_disk
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Save the current state to a directory.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    nlp.vocab.to_disk('/path/to/vocab')
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code path]
 | 
						|
        +cell unicode or #[code Path]
 | 
						|
        +cell
 | 
						|
            |  A path to a directory, which will be created if it doesn't exist.
 | 
						|
            |  Paths may be either strings or #[code Path]-like objects.
 | 
						|
 | 
						|
+h(2, "from_disk") Vocab.from_disk
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Load state from a directory. Modifies the object in place and returns it.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    from spacy.vocab import Vocab
 | 
						|
    vocab = Vocab().from_disk('/path/to/vocab')
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code path]
 | 
						|
        +cell unicode or #[code Path]
 | 
						|
        +cell
 | 
						|
            |  A path to a directory. Paths may be either strings or
 | 
						|
            |  #[code Path]-like objects.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell #[code Vocab]
 | 
						|
        +cell The modified #[code Vocab] object.
 | 
						|
 | 
						|
+h(2, "to_bytes") Vocab.to_bytes
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Serialize the current state to a binary string.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    vocab_bytes = nlp.vocab.to_bytes()
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code **exclude]
 | 
						|
        +cell -
 | 
						|
        +cell Named attributes to prevent from being serialized.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell bytes
 | 
						|
        +cell The serialized form of the #[code Vocab] object.
 | 
						|
 | 
						|
+h(2, "from_bytes") Vocab.from_bytes
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Load state from a binary string.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    from spacy.vocab import Vocab
 | 
						|
    vocab_bytes = nlp.vocab.to_bytes()
 | 
						|
    vocab = Vocab()
 | 
						|
    vocab.from_bytes(vocab_bytes)
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code bytes_data]
 | 
						|
        +cell bytes
 | 
						|
        +cell The data to load from.
 | 
						|
 | 
						|
    +row
 | 
						|
        +cell #[code **exclude]
 | 
						|
        +cell -
 | 
						|
        +cell Named attributes to prevent from being loaded.
 | 
						|
 | 
						|
    +footrow
 | 
						|
        +cell returns
 | 
						|
        +cell #[code Vocab]
 | 
						|
        +cell The #[code Vocab] object.
 | 
						|
 | 
						|
+h(2, "attributes") Attributes
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    apple_id = nlp.vocab.strings['apple']
 | 
						|
    assert type(apple_id) == int
 | 
						|
    PERSON = nlp.vocab.strings['PERSON']
 | 
						|
    assert type(PERSON) == int
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
 | 
						|
    +row
 | 
						|
        +cell #[code strings]
 | 
						|
        +cell #[code StringStore]
 | 
						|
        +cell A table managing the string-to-int mapping.
 |