spaCy/website/docs/api/vocab.jade

//- 💫 DOCS > API > VOCAB

include ../../_includes/_mixins

p
    |  A lookup table that allows you to access #[code Lexeme] objects. The
    |  #[code Vocab] instance also provides access to the #[code StringStore],
    |  and owns underlying C-data that is shared between #[code Doc] objects.

+h(2, "init") Vocab.__init__
    +tag method

p Create the vocabulary.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code lex_attr_getters]
        +cell dict
        +cell
            |  A dictionary mapping attribute IDs to functions to compute them.
            |  Defaults to #[code None].

    +row
        +cell #[code tag_map]
        +cell dict
        +cell
            |  A dictionary mapping fine-grained tags to coarse-grained
            |  parts-of-speech, and optionally morphological attributes.

    +row
        +cell #[code lemmatizer]
        +cell object
        +cell A lemmatizer. Defaults to #[code None].

    +row
        +cell #[code strings]
        +cell #[code StringStore] or list
        +cell
            |  A #[+api("stringstore") #[code StringStore]] that maps
            |  strings to hash values, and vice versa, or a list of strings.

    +footrow
        +cell returns
        +cell #[code Vocab]
        +cell The newly constructed object.

+h(2, "len") Vocab.__len__
    +tag method

p Get the current number of lexemes in the vocabulary.

+aside-code("Example").
    doc = nlp(u'This is a sentence.')
    assert len(nlp.vocab) > 0

+table(["Name", "Type", "Description"])
    +footrow
        +cell returns
        +cell int
        +cell The number of lexemes in the vocabulary.

+h(2, "getitem") Vocab.__getitem__
    +tag method

p
    |  Retrieve a lexeme, given an int ID or a unicode string. If a previously
    |  unseen unicode string is given, a new lexeme is created and stored.

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    assert nlp.vocab[apple] == nlp.vocab[u'apple']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code id_or_string]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +footrow
        +cell returns
        +cell #[code Lexeme]
        +cell The lexeme indicated by the given ID.

+h(2, "iter") Vocab.__iter__
    +tag method

p Iterate over the lexemes in the vocabulary.

+aside-code("Example").
    stop_words = (lex for lex in nlp.vocab if lex.is_stop)

+table(["Name", "Type", "Description"])
    +footrow
        +cell yields
        +cell #[code Lexeme]
        +cell An entry in the vocabulary.

+h(2, "contains") Vocab.__contains__
    +tag method

p
    |  Check whether the string has an entry in the vocabulary. To get the ID
    |  for a given string, you need to look it up in
    |  #[+api("vocab#attributes") #[code vocab.strings]].

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    oov = nlp.vocab.strings['dskfodkfos']
    assert apple in nlp.vocab
    assert oov not in nlp.vocab

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The ID string.

    +footrow
        +cell returns
        +cell bool
        +cell Whether the string has an entry in the vocabulary.

+h(2, "add_flag") Vocab.add_flag
    +tag method

p
    |  Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
    |  function will be called over the words currently in the vocab, and then
    |  applied to new words as they occur. You'll then be able to access the flag
    |  value on each token, using #[code token.check_flag(flag_id)].

+aside-code("Example").
    def is_my_product(text):
        products = [u'spaCy', u'Thinc', u'displaCy']
        return text in products

    MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
    doc = nlp(u'I like spaCy')
    assert doc[2].check_flag(MY_PRODUCT) == True

+table(["Name", "Type", "Description"])
    +row
        +cell #[code flag_getter]
        +cell callable
        +cell A function #[code f(unicode) -> bool], to get the flag value.

    +row
        +cell #[code flag_id]
        +cell int
        +cell
            |  An integer between 1 and 63 (inclusive), specifying the bit at
            |  which the flag will be stored. If #[code -1], the lowest
            |  available bit will be chosen.

    +footrow
        +cell returns
        +cell int
        +cell The integer ID by which the flag value can be checked.

+h(2, "to_disk") Vocab.to_disk
    +tag method
    +tag-new(2)

p Save the current state to a directory.

+aside-code("Example").
    nlp.vocab.to_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory, which will be created if it doesn't exist.
            |  Paths may be either strings or #[code Path]-like objects.

+h(2, "from_disk") Vocab.from_disk
    +tag method
    +tag-new(2)

p Load state from a directory. Modifies the object in place and returns it.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab = Vocab().from_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory. Paths may be either strings or
            |  #[code Path]-like objects.

    +footrow
        +cell returns
        +cell #[code Vocab]
        +cell The modified #[code Vocab] object.

+h(2, "to_bytes") Vocab.to_bytes
    +tag method

p Serialize the current state to a binary string.

+aside-code("Example").
    vocab_bytes = nlp.vocab.to_bytes()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being serialized.

    +footrow
        +cell returns
        +cell bytes
        +cell The serialized form of the #[code Vocab] object.

+h(2, "from_bytes") Vocab.from_bytes
    +tag method

p Load state from a binary string.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab_bytes = nlp.vocab.to_bytes()
    vocab = Vocab()
    vocab.from_bytes(vocab_bytes)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code bytes_data]
        +cell bytes
        +cell The data to load from.

    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.

    +footrow
        +cell returns
        +cell #[code Vocab]
        +cell The #[code Vocab] object.

+h(2, "attributes") Attributes

+aside-code("Example").
    apple_id = nlp.vocab.strings['apple']
    assert type(apple_id) == int
    PERSON = nlp.vocab.strings['PERSON']
    assert type(PERSON) == int

+table(["Name", "Type", "Description"])
    +row
        +cell #[code strings]
        +cell #[code StringStore]
        +cell A table managing the string-to-int mapping.
Update to new website 2016-10-31 21:04:15 +03:00			`//- 💫 DOCS > API > VOCAB`

			`include ../../_includes/_mixins`

			`p`
Update formatting 2017-05-23 12:32:25 +03:00			`\| A lookup table that allows you to access #[code Lexeme] objects. The`
Update to new website 2016-10-31 21:04:15 +03:00			`\| #[code Vocab] instance also provides access to the #[code StringStore],`
			`\| and owns underlying C-data that is shared between #[code Doc] objects.`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "init") Vocab.__init__`
			`+tag method`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Create the vocabulary.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code lex_attr_getters]`
			`+cell dict`
			`+cell`
			`\| A dictionary mapping attribute IDs to functions to compute them.`
			`\| Defaults to #[code None].`

			`+row`
			`+cell #[code tag_map]`
			`+cell dict`
			`+cell`
			`\| A dictionary mapping fine-grained tags to coarse-grained`
			`\| parts-of-speech, and optionally morphological attributes.`

			`+row`
			`+cell #[code lemmatizer]`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell object`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell A lemmatizer. Defaults to #[code None].`

			`+row`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code strings]`
Fix typos, long integers and tests 2017-05-29 02:06:49 +03:00			`+cell #[code StringStore] or list`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell`
Fix typos, long integers and tests 2017-05-29 02:06:49 +03:00			`\| A #[+api("stringstore") #[code StringStore]] that maps`
			`\| strings to hash values, and vice versa, or a list of strings.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell #[code Vocab]`
			`+cell The newly constructed object.`

			`+h(2, "len") Vocab.__len__`
			`+tag method`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Get the current number of lexemes in the vocabulary.`

			`+aside-code("Example").`
			`doc = nlp(u'This is a sentence.')`
			`assert len(nlp.vocab) > 0`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell int`
			`+cell The number of lexemes in the vocabulary.`

			`+h(2, "getitem") Vocab.__getitem__`
			`+tag method`

			`p`
			`\| Retrieve a lexeme, given an int ID or a unicode string. If a previously`
			`\| unseen unicode string is given, a new lexeme is created and stored.`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+aside-code("Example").`
			`apple = nlp.vocab.strings['apple']`
			`assert nlp.vocab[apple] == nlp.vocab[u'apple']`

Update to new website 2016-10-31 21:04:15 +03:00			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code id_or_string]`
			`+cell int / unicode`
Update docs and change integer IDs to hash values 2017-05-28 20:25:34 +03:00			`+cell The hash value of a word, or its unicode string.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell #[code Lexeme]`
			`+cell The lexeme indicated by the given ID.`

Fix typo 2017-05-20 14:00:13 +03:00			`+h(2, "iter") Vocab.__iter__`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`

			`p Iterate over the lexemes in the vocabulary.`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+aside-code("Example").`
			`stop_words = (lex for lex in nlp.vocab if lex.is_stop)`

Update to new website 2016-10-31 21:04:15 +03:00			`+table(["Name", "Type", "Description"])`
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell yields`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell #[code Lexeme]`
			`+cell An entry in the vocabulary.`

			`+h(2, "contains") Vocab.__contains__`
			`+tag method`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p`
			`\| Check whether the string has an entry in the vocabulary. To get the ID`
			`\| for a given string, you need to look it up in`
			`\| #[+api("vocab#attributes") #[code vocab.strings]].`

			`+aside-code("Example").`
			`apple = nlp.vocab.strings['apple']`
			`oov = nlp.vocab.strings['dskfodkfos']`
			`assert apple in nlp.vocab`
			`assert oov not in nlp.vocab`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The ID string.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell bool`
			`+cell Whether the string has an entry in the vocabulary.`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "add_flag") Vocab.add_flag`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`

			`p`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`\| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]`
			`\| function will be called over the words currently in the vocab, and then`
			`\| applied to new words as they occur. You'll then be able to access the flag`
			`\| value on each token, using #[code token.check_flag(flag_id)].`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+aside-code("Example").`
			`def is_my_product(text):`
			`products = [u'spaCy', u'Thinc', u'displaCy']`
			`return text in products`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`MY_PRODUCT = nlp.vocab.add_flag(is_my_product)`
			`doc = nlp(u'I like spaCy')`
			`assert doc[2].check_flag(MY_PRODUCT) == True`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code flag_getter]`
			`+cell callable`
			`+cell A function #[code f(unicode) -> bool], to get the flag value.`

			`+row`
			`+cell #[code flag_id]`
			`+cell int`
			`+cell`
			`\| An integer between 1 and 63 (inclusive), specifying the bit at`
			`\| which the flag will be stored. If #[code -1], the lowest`
			`\| available bit will be chosen.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update to new website 2016-10-31 21:04:15 +03:00			`+cell int`
			`+cell The integer ID by which the flag value can be checked.`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "to_disk") Vocab.to_disk`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`
Add version tag mixin to label new features 2017-05-26 13:42:36 +03:00			`+tag-new(2)`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Save the current state to a directory.`

			`+aside-code("Example").`
			`nlp.vocab.to_disk('/path/to/vocab')`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code path]`
			`+cell unicode or #[code Path]`
			`+cell`
			`\| A path to a directory, which will be created if it doesn't exist.`
			`\| Paths may be either strings or #[code Path]-like objects.`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "from_disk") Vocab.from_disk`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`
Add version tag mixin to label new features 2017-05-26 13:42:36 +03:00			`+tag-new(2)`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Loads state from a directory. Modifies the object in place and returns it.`

			`+aside-code("Example").`
			`from spacy.vocab import Vocab`
			`vocab = Vocab().from_disk('/path/to/vocab')`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code path]`
			`+cell unicode or #[code Path]`
			`+cell`
			`\| A path to a directory. Paths may be either strings or`
			`\| #[code Path]-like objects.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code Vocab]`
			`+cell The modified #[code Vocab] object.`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "to_bytes") Vocab.to_bytes`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Serialize the current state to a binary string.`

			`+aside-code("Example").`
			`vocab_bytes = nlp.vocab.to_bytes()`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code **exclude]`
			`+cell -`
			`+cell Named attributes to prevent from being serialized.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell bytes`
			`+cell The serialized form of the #[code Vocab] object.`
Update to new website 2016-10-31 21:04:15 +03:00
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+h(2, "from_bytes") Vocab.from_bytes`
Update to new website 2016-10-31 21:04:15 +03:00			`+tag method`

Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`p Load state from a binary string.`

			`+aside-code("Example").`
			`from spacy.vocab import Vocab`
			`vocab_bytes = nlp.vocab.to_bytes()`
			`vocab = Vocab()`
			`vocab.from_bytes(vocab_bytes)`
Update to new website 2016-10-31 21:04:15 +03:00
			`+table(["Name", "Type", "Description"])`
			`+row`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00			`+cell #[code bytes_data]`
			`+cell bytes`
			`+cell The data to load from.`

			`+row`
			`+cell #[code **exclude]`
			`+cell -`
			`+cell Named attributes to prevent from being loaded.`
Update to new website 2016-10-31 21:04:15 +03:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-19 01:02:34 +03:00			`+cell returns`
Fix typo and use consistent description for from_bytes 2017-05-21 14:18:39 +03:00			`+cell #[code Vocab]`
			`+cell The #[code Vocab] object.`
Update docstrings and API docs for Vocab 2017-05-20 14:59:31 +03:00
			`+h(2, "attributes") Attributes`

			`+aside-code("Example").`
			`apple_id = nlp.vocab.strings['apple']`
			`assert type(apple_id) == int`
			`PERSON = nlp.vocab.strings['PERSON']`
			`assert type(PERSON) == int`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code strings]`
			`+cell #[code StringStore]`
			`+cell A table managing the string-to-int mapping.`