mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Update docstrings and API docs for Vocab
This commit is contained in:
parent
a93276bb78
commit
f0cc642bb9
460
spacy/vocab.pyx
460
spacy/vocab.pyx
|
@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
|
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||||
|
instance also provides access to the `StringStore`, and owns underlying
|
||||||
|
C-data that is shared between `Doc` objects.
|
||||||
"""
|
"""
|
||||||
A map container for a language's LexemeC structs.
|
|
||||||
"""
|
|
||||||
@classmethod
|
|
||||||
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
|
||||||
tag_map=True, oov_prob=True, **deprecated_kwargs):
|
|
||||||
"""
|
|
||||||
Deprecated --- replace in spaCy 2
|
|
||||||
Load the vocabulary from a path.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
path (Path):
|
|
||||||
The path to load from.
|
|
||||||
lex_attr_getters (dict):
|
|
||||||
A dictionary mapping attribute IDs to functions to compute them.
|
|
||||||
Defaults to None.
|
|
||||||
lemmatizer (object):
|
|
||||||
A lemmatizer. Defaults to None.
|
|
||||||
tag_map (dict):
|
|
||||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
|
||||||
and optionally morphological attributes.
|
|
||||||
oov_prob (float):
|
|
||||||
The default probability for out-of-vocabulary words.
|
|
||||||
Returns:
|
|
||||||
Vocab: The newly constructed vocab object.
|
|
||||||
"""
|
|
||||||
path = util.ensure_path(path)
|
|
||||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
|
||||||
if 'vectors' in deprecated_kwargs:
|
|
||||||
raise AttributeError(
|
|
||||||
"vectors argument to Vocab.load() deprecated. "
|
|
||||||
"Install vectors after loading.")
|
|
||||||
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
|
|
||||||
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
|
|
||||||
tag_map = ujson.load(file_)
|
|
||||||
elif tag_map is True:
|
|
||||||
tag_map = None
|
|
||||||
if lex_attr_getters is not None \
|
|
||||||
and oov_prob is True \
|
|
||||||
and (path / 'vocab' / 'oov_prob').exists():
|
|
||||||
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
|
|
||||||
oov_prob = float(file_.read())
|
|
||||||
lex_attr_getters[PROB] = lambda text: oov_prob
|
|
||||||
if lemmatizer is True:
|
|
||||||
lemmatizer = Lemmatizer.load(path)
|
|
||||||
|
|
||||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
|
||||||
strings_list = ujson.load(file_)
|
|
||||||
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
|
|
||||||
lemmatizer=lemmatizer,
|
|
||||||
strings=strings_list)
|
|
||||||
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||||
strings=tuple(), **deprecated_kwargs):
|
strings=tuple(), **deprecated_kwargs):
|
||||||
"""
|
"""Create the vocabulary.
|
||||||
Create the vocabulary.
|
|
||||||
|
|
||||||
lex_attr_getters (dict):
|
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
|
||||||
A dictionary mapping attribute IDs to functions to compute them.
|
to compute them. Defaults to `None`.
|
||||||
Defaults to None.
|
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
|
||||||
lemmatizer (object):
|
parts-of-speech, and optionally morphological attributes.
|
||||||
A lemmatizer. Defaults to None.
|
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||||
tag_map (dict):
|
strings (StringStore): StringStore that maps strings to integers, and
|
||||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
vice versa.
|
||||||
and optionally morphological attributes.
|
RETURNS (Vocab): The newly constructed vocab object.
|
||||||
oov_prob (float):
|
|
||||||
The default probability for out-of-vocabulary words.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Vocab: The newly constructed vocab object.
|
|
||||||
"""
|
"""
|
||||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||||
|
|
||||||
|
@ -148,33 +91,32 @@ cdef class Vocab:
|
||||||
return langfunc('_') if langfunc else ''
|
return langfunc('_') if langfunc else ''
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""
|
"""The current number of lexemes stored.
|
||||||
The current number of lexemes stored.
|
|
||||||
|
RETURNS (int): The current number of lexemes stored.
|
||||||
"""
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
def add_flag(self, flag_getter, int flag_id=-1):
|
def add_flag(self, flag_getter, int flag_id=-1):
|
||||||
"""
|
"""Set a new boolean flag to words in the vocabulary.
|
||||||
Set a new boolean flag to words in the vocabulary.
|
|
||||||
|
|
||||||
The flag_setter function will be called over the words currently in the
|
The flag_getter function will be called over the words currently in the
|
||||||
vocab, and then applied to new words as they occur. You'll then be able
|
vocab, and then applied to new words as they occur. You'll then be able
|
||||||
to access the flag value on each token, using token.check_flag(flag_id).
|
to access the flag value on each token, using token.check_flag(flag_id).
|
||||||
|
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||||
|
`Token.check_flag`.
|
||||||
|
|
||||||
See also:
|
flag_getter (function): A function `f(unicode) -> bool`, to get the flag
|
||||||
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
|
value.
|
||||||
|
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||||
|
the bit at which the flag will be stored. If -1, the lowest
|
||||||
|
available bit will be chosen.
|
||||||
|
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||||
|
|
||||||
Arguments:
|
EXAMPLE:
|
||||||
flag_getter:
|
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'displaCy'])
|
||||||
A function f(unicode) -> bool, to get the flag value.
|
>>> doc = nlp(u'I like spaCy')
|
||||||
|
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||||
flag_id (int):
|
|
||||||
An integer between 1 and 63 (inclusive), specifying the bit at which the
|
|
||||||
flag will be stored. If -1, the lowest available bit will be
|
|
||||||
chosen.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
flag_id (int): The integer ID by which the flag value can be checked.
|
|
||||||
"""
|
"""
|
||||||
if flag_id == -1:
|
if flag_id == -1:
|
||||||
for bit in range(1, 64):
|
for bit in range(1, 64):
|
||||||
|
@ -196,8 +138,7 @@ cdef class Vocab:
|
||||||
return flag_id
|
return flag_id
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||||
"""
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
"""
|
"""
|
||||||
|
@ -216,8 +157,7 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(mem, string)
|
||||||
|
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||||
"""
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
"""
|
"""
|
||||||
|
@ -263,24 +203,19 @@ cdef class Vocab:
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
|
||||||
def __contains__(self, unicode string):
|
def __contains__(self, unicode string):
|
||||||
"""
|
"""Check whether the string has an entry in the vocabulary.
|
||||||
Check whether the string has an entry in the vocabulary.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
string (unicode): The ID string.
|
string (unicode): The ID string.
|
||||||
|
RETURNS (bool): Whether the string has an entry in the vocabulary.
|
||||||
Returns:
|
|
||||||
bool Whether the string has an entry in the vocabulary.
|
|
||||||
"""
|
"""
|
||||||
key = hash_string(string)
|
key = hash_string(string)
|
||||||
lex = self._by_hash.get(key)
|
lex = self._by_hash.get(key)
|
||||||
return lex is not NULL
|
return lex is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""
|
"""Iterate over the lexemes in the vocabulary.
|
||||||
Iterate over the lexemes in the vocabulary.
|
|
||||||
|
|
||||||
Yields: Lexeme An entry in the vocabulary.
|
YIELDS (Lexeme): An entry in the vocabulary.
|
||||||
"""
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
|
@ -288,19 +223,19 @@ cdef class Vocab:
|
||||||
yield Lexeme(self, orth)
|
yield Lexeme(self, orth)
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
"""
|
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||||
Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
previously unseen unicode string is given, a new lexeme is created and
|
||||||
unseen unicode string is given, a new lexeme is created and stored.
|
stored.
|
||||||
|
|
||||||
Arguments:
|
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||||
id_or_string (int or unicode):
|
string. If `int >= Lexicon.size`, `IndexError` is raised. If
|
||||||
The integer ID of a word, or its unicode string.
|
`id_or_string` is neither an int nor a unicode string, `ValueError`
|
||||||
|
is raised.
|
||||||
|
RETURNS (Lexeme): The lexeme indicated by the given ID.
|
||||||
|
|
||||||
If an int >= Lexicon.size, IndexError is raised. If id_or_string
|
EXAMPLE:
|
||||||
is neither an int nor a unicode string, ValueError is raised.
|
>>> apple = nlp.vocab.strings['apple']
|
||||||
|
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||||
Returns:
|
|
||||||
lexeme (Lexeme): The lexeme indicated by the given ID.
|
|
||||||
"""
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if type(id_or_string) == unicode:
|
if type(id_or_string) == unicode:
|
||||||
|
@ -324,15 +259,29 @@ cdef class Vocab:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path):
|
||||||
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
|
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||||
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
path.mkdir()
|
path.mkdir()
|
||||||
strings_loc = path / 'strings.json'
|
strings_loc = path / 'strings.json'
|
||||||
with strings_loc.open('w', encoding='utf8') as file_:
|
with strings_loc.open('w', encoding='utf8') as file_:
|
||||||
self.strings.dump(file_)
|
self.strings.dump(file_)
|
||||||
self.dump(path / 'lexemes.bin')
|
|
||||||
|
# TODO: pickle
|
||||||
|
# self.dump(path / 'lexemes.bin')
|
||||||
|
|
||||||
def from_disk(self, path):
|
def from_disk(self, path):
|
||||||
|
"""Load state from a directory. Modifies the object in place and
|
||||||
|
returns it.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory. Paths may be either
|
||||||
|
strings or `Path`-like objects.
|
||||||
|
RETURNS (Vocab): The modified `Vocab` object.
|
||||||
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||||
strings_list = ujson.load(file_)
|
strings_list = ujson.load(file_)
|
||||||
|
@ -340,6 +289,23 @@ cdef class Vocab:
|
||||||
self.strings[string]
|
self.strings[string]
|
||||||
self.load_lexemes(path / 'lexemes.bin')
|
self.load_lexemes(path / 'lexemes.bin')
|
||||||
|
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
|
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def from_bytes(self, bytest_data, **exclude):
|
||||||
|
"""Load state from a binary string.
|
||||||
|
|
||||||
|
bytes_data (bytes): The data to load from.
|
||||||
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
|
RETURNS (Vocab): The `Vocab` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def lexemes_to_bytes(self, **exclude):
|
def lexemes_to_bytes(self, **exclude):
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
|
@ -365,9 +331,7 @@ cdef class Vocab:
|
||||||
return byte_string
|
return byte_string
|
||||||
|
|
||||||
def lexemes_from_bytes(self, bytes bytes_data):
|
def lexemes_from_bytes(self, bytes bytes_data):
|
||||||
"""
|
"""Load the binary vocabulary data from the given string."""
|
||||||
Load the binary vocabulary data from the given string.
|
|
||||||
"""
|
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
cdef unicode py_str
|
cdef unicode py_str
|
||||||
|
@ -393,14 +357,10 @@ cdef class Vocab:
|
||||||
# Deprecated --- delete these once stable
|
# Deprecated --- delete these once stable
|
||||||
|
|
||||||
def dump_vectors(self, out_loc):
|
def dump_vectors(self, out_loc):
|
||||||
"""
|
"""Save the word vectors to a binary file.
|
||||||
Save the word vectors to a binary file.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
loc (Path): The path to save to.
|
out_loc (Path): The path to save to.
|
||||||
Returns:
|
"""
|
||||||
None
|
|
||||||
#"""
|
|
||||||
cdef int32_t vec_len = self.vectors_length
|
cdef int32_t vec_len = self.vectors_length
|
||||||
cdef int32_t word_len
|
cdef int32_t word_len
|
||||||
cdef bytes word_str
|
cdef bytes word_str
|
||||||
|
@ -424,17 +384,14 @@ cdef class Vocab:
|
||||||
|
|
||||||
|
|
||||||
def load_vectors(self, file_):
|
def load_vectors(self, file_):
|
||||||
"""
|
"""Load vectors from a text-based file.
|
||||||
Load vectors from a text-based file.
|
|
||||||
|
|
||||||
Arguments:
|
file_ (buffer): The file to read from. Entries should be separated by
|
||||||
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
newlines, and each entry should be whitespace delimited. The first value of the entry
|
||||||
and each entry should be whitespace delimited. The first value of the entry
|
|
||||||
should be the word string, and subsequent entries should be the values of the
|
should be the word string, and subsequent entries should be the values of the
|
||||||
vector.
|
vector.
|
||||||
|
|
||||||
Returns:
|
RETURNS (int): The length of the vectors loaded.
|
||||||
vec_len (int): The length of the vectors loaded.
|
|
||||||
"""
|
"""
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
|
@ -464,14 +421,11 @@ cdef class Vocab:
|
||||||
return vec_len
|
return vec_len
|
||||||
|
|
||||||
def load_vectors_from_bin_loc(self, loc):
|
def load_vectors_from_bin_loc(self, loc):
|
||||||
"""
|
"""Load vectors from the location of a binary file.
|
||||||
Load vectors from the location of a binary file.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
loc (unicode): The path of the binary file to load from.
|
loc (unicode): The path of the binary file to load from.
|
||||||
|
|
||||||
Returns:
|
RETURNS (int): The length of the vectors loaded.
|
||||||
vec_len (int): The length of the vectors loaded.
|
|
||||||
"""
|
"""
|
||||||
cdef CFile file_ = CFile(loc, b'rb')
|
cdef CFile file_ = CFile(loc, b'rb')
|
||||||
cdef int32_t word_len
|
cdef int32_t word_len
|
||||||
|
@ -526,11 +480,9 @@ cdef class Vocab:
|
||||||
|
|
||||||
|
|
||||||
def resize_vectors(self, int new_size):
|
def resize_vectors(self, int new_size):
|
||||||
"""
|
"""Set vectors_length to a new size, and allocate more memory for the
|
||||||
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
`Lexeme` vectors if necessary. The memory will be zeroed.
|
||||||
vectors if necessary. The memory will be zeroed.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
new_size (int): The new size of the vectors.
|
new_size (int): The new size of the vectors.
|
||||||
"""
|
"""
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
|
@ -633,237 +585,3 @@ class VectorReadError(Exception):
|
||||||
"Vector size: %d\n"
|
"Vector size: %d\n"
|
||||||
"Max size: %d\n"
|
"Max size: %d\n"
|
||||||
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
#Deprecated --- delete these once stable
|
|
||||||
#
|
|
||||||
# def dump_vectors(self, out_loc):
|
|
||||||
# """
|
|
||||||
# Save the word vectors to a binary file.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# loc (Path): The path to save to.
|
|
||||||
# Returns:
|
|
||||||
# None
|
|
||||||
# #"""
|
|
||||||
# cdef int32_t vec_len = self.vectors_length
|
|
||||||
# cdef int32_t word_len
|
|
||||||
# cdef bytes word_str
|
|
||||||
# cdef char* chars
|
|
||||||
#
|
|
||||||
# cdef Lexeme lexeme
|
|
||||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
|
||||||
# for lexeme in self:
|
|
||||||
# word_str = lexeme.orth_.encode('utf8')
|
|
||||||
# vec = lexeme.c.vector
|
|
||||||
# word_len = len(word_str)
|
|
||||||
#
|
|
||||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
|
||||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
|
||||||
#
|
|
||||||
# chars = <char*>word_str
|
|
||||||
# out_file.write_from(chars, word_len, sizeof(char))
|
|
||||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
|
||||||
# out_file.close()
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# def load_vectors(self, file_):
|
|
||||||
# """
|
|
||||||
# Load vectors from a text-based file.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# file_ (buffer): The file to read from. Entries should be separated by newlines,
|
|
||||||
# and each entry should be whitespace delimited. The first value of the entry
|
|
||||||
# should be the word string, and subsequent entries should be the values of the
|
|
||||||
# vector.
|
|
||||||
#
|
|
||||||
# Returns:
|
|
||||||
# vec_len (int): The length of the vectors loaded.
|
|
||||||
# """
|
|
||||||
# cdef LexemeC* lexeme
|
|
||||||
# cdef attr_t orth
|
|
||||||
# cdef int32_t vec_len = -1
|
|
||||||
# cdef double norm = 0.0
|
|
||||||
#
|
|
||||||
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
|
|
||||||
#
|
|
||||||
# for line_num, line in enumerate(file_):
|
|
||||||
# pieces = line.split()
|
|
||||||
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
|
||||||
# if vec_len == -1:
|
|
||||||
# vec_len = len(pieces)
|
|
||||||
# elif vec_len != len(pieces):
|
|
||||||
# raise VectorReadError.mismatched_sizes(file_, line_num,
|
|
||||||
# vec_len, len(pieces))
|
|
||||||
# orth = self.strings[word_str]
|
|
||||||
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
|
||||||
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
|
|
||||||
# for i, val_str in enumerate(pieces):
|
|
||||||
# lexeme.vector[i] = float(val_str)
|
|
||||||
# norm = 0.0
|
|
||||||
# for i in range(vec_len):
|
|
||||||
# norm += lexeme.vector[i] * lexeme.vector[i]
|
|
||||||
# lexeme.l2_norm = sqrt(norm)
|
|
||||||
# self.vectors_length = vec_len
|
|
||||||
# return vec_len
|
|
||||||
#
|
|
||||||
# def load_vectors_from_bin_loc(self, loc):
|
|
||||||
# """
|
|
||||||
# Load vectors from the location of a binary file.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# loc (unicode): The path of the binary file to load from.
|
|
||||||
#
|
|
||||||
# Returns:
|
|
||||||
# vec_len (int): The length of the vectors loaded.
|
|
||||||
# """
|
|
||||||
# cdef CFile file_ = CFile(loc, b'rb')
|
|
||||||
# cdef int32_t word_len
|
|
||||||
# cdef int32_t vec_len = 0
|
|
||||||
# cdef int32_t prev_vec_len = 0
|
|
||||||
# cdef float* vec
|
|
||||||
# cdef Address mem
|
|
||||||
# cdef attr_t string_id
|
|
||||||
# cdef bytes py_word
|
|
||||||
# cdef vector[float*] vectors
|
|
||||||
# cdef int line_num = 0
|
|
||||||
# cdef Pool tmp_mem = Pool()
|
|
||||||
# while True:
|
|
||||||
# try:
|
|
||||||
# file_.read_into(&word_len, sizeof(word_len), 1)
|
|
||||||
# except IOError:
|
|
||||||
# break
|
|
||||||
# file_.read_into(&vec_len, sizeof(vec_len), 1)
|
|
||||||
# if prev_vec_len != 0 and vec_len != prev_vec_len:
|
|
||||||
# raise VectorReadError.mismatched_sizes(loc, line_num,
|
|
||||||
# vec_len, prev_vec_len)
|
|
||||||
# if 0 >= vec_len >= MAX_VEC_SIZE:
|
|
||||||
# raise VectorReadError.bad_size(loc, vec_len)
|
|
||||||
#
|
|
||||||
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
|
||||||
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
|
||||||
#
|
|
||||||
# string_id = self.strings[chars[:word_len]]
|
|
||||||
# # Insert words into vocab to add vector.
|
|
||||||
# self.get_by_orth(self.mem, string_id)
|
|
||||||
# while string_id >= vectors.size():
|
|
||||||
# vectors.push_back(EMPTY_VEC)
|
|
||||||
# assert vec != NULL
|
|
||||||
# vectors[string_id] = vec
|
|
||||||
# line_num += 1
|
|
||||||
# cdef LexemeC* lex
|
|
||||||
# cdef size_t lex_addr
|
|
||||||
# cdef double norm = 0.0
|
|
||||||
# cdef int i
|
|
||||||
# for orth, lex_addr in self._by_orth.items():
|
|
||||||
# lex = <LexemeC*>lex_addr
|
|
||||||
# if lex.lower < vectors.size():
|
|
||||||
# lex.vector = vectors[lex.lower]
|
|
||||||
# norm = 0.0
|
|
||||||
# for i in range(vec_len):
|
|
||||||
# norm += lex.vector[i] * lex.vector[i]
|
|
||||||
# lex.l2_norm = sqrt(norm)
|
|
||||||
# else:
|
|
||||||
# lex.vector = EMPTY_VEC
|
|
||||||
# self.vectors_length = vec_len
|
|
||||||
# return vec_len
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#def write_binary_vectors(in_loc, out_loc):
|
|
||||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
|
||||||
# cdef Address mem
|
|
||||||
# cdef int32_t word_len
|
|
||||||
# cdef int32_t vec_len
|
|
||||||
# cdef char* chars
|
|
||||||
# with bz2.BZ2File(in_loc, 'r') as file_:
|
|
||||||
# for line in file_:
|
|
||||||
# pieces = line.split()
|
|
||||||
# word = pieces.pop(0)
|
|
||||||
# mem = Address(len(pieces), sizeof(float))
|
|
||||||
# vec = <float*>mem.ptr
|
|
||||||
# for i, val_str in enumerate(pieces):
|
|
||||||
# vec[i] = float(val_str)
|
|
||||||
#
|
|
||||||
# word_len = len(word)
|
|
||||||
# vec_len = len(pieces)
|
|
||||||
#
|
|
||||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
|
||||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
|
||||||
#
|
|
||||||
# chars = <char*>word
|
|
||||||
# out_file.write_from(chars, len(word), sizeof(char))
|
|
||||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# def resize_vectors(self, int new_size):
|
|
||||||
# """
|
|
||||||
# Set vectors_length to a new size, and allocate more memory for the Lexeme
|
|
||||||
# vectors if necessary. The memory will be zeroed.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# new_size (int): The new size of the vectors.
|
|
||||||
# """
|
|
||||||
# cdef hash_t key
|
|
||||||
# cdef size_t addr
|
|
||||||
# if new_size > self.vectors_length:
|
|
||||||
# for key, addr in self._by_hash.items():
|
|
||||||
# lex = <LexemeC*>addr
|
|
||||||
# lex.vector = <float*>self.mem.realloc(lex.vector,
|
|
||||||
# new_size * sizeof(lex.vector[0]))
|
|
||||||
# self.vectors_length = new_size
|
|
||||||
#
|
|
||||||
#
|
|
||||||
|
|
||||||
#
|
|
||||||
# def dump(self, loc=None):
|
|
||||||
# """
|
|
||||||
# Save the lexemes binary data to the given location, or
|
|
||||||
# return a byte-string with the data if loc is None.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# loc (Path or None): The path to save to, or None.
|
|
||||||
# """
|
|
||||||
# if loc is None:
|
|
||||||
# return self.to_bytes()
|
|
||||||
# else:
|
|
||||||
# return self.to_disk(loc)
|
|
||||||
#
|
|
||||||
# def load_lexemes(self, loc):
|
|
||||||
# """
|
|
||||||
# Load the binary vocabulary data from the given location.
|
|
||||||
#
|
|
||||||
# Arguments:
|
|
||||||
# loc (Path): The path to load from.
|
|
||||||
#
|
|
||||||
# Returns:
|
|
||||||
# None
|
|
||||||
# """
|
|
||||||
# fp = CFile(loc, 'rb',
|
|
||||||
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
|
||||||
# cdef LexemeC* lexeme = NULL
|
|
||||||
# cdef SerializedLexemeC lex_data
|
|
||||||
# cdef hash_t key
|
|
||||||
# cdef unicode py_str
|
|
||||||
# cdef attr_t orth = 0
|
|
||||||
# assert sizeof(orth) == sizeof(lexeme.orth)
|
|
||||||
# i = 0
|
|
||||||
# while True:
|
|
||||||
# try:
|
|
||||||
# fp.read_into(&orth, 1, sizeof(orth))
|
|
||||||
# except IOError:
|
|
||||||
# break
|
|
||||||
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
|
||||||
# # Copy data from the file into the lexeme
|
|
||||||
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
|
|
||||||
# Lexeme.c_from_bytes(lexeme, lex_data)
|
|
||||||
#
|
|
||||||
# lexeme.vector = EMPTY_VEC
|
|
||||||
# py_str = self.strings[lexeme.orth]
|
|
||||||
# key = hash_string(py_str)
|
|
||||||
# self._by_hash.set(key, lexeme)
|
|
||||||
# self._by_orth.set(lexeme.orth, lexeme)
|
|
||||||
# self.length += 1
|
|
||||||
# i += 1
|
|
||||||
# fp.close()
|
|
||||||
|
|
|
@ -7,59 +7,6 @@ p
|
||||||
| #[code Vocab] instance also provides access to the #[code StringStore],
|
| #[code Vocab] instance also provides access to the #[code StringStore],
|
||||||
| and owns underlying C-data that is shared between #[code Doc] objects.
|
| and owns underlying C-data that is shared between #[code Doc] objects.
|
||||||
|
|
||||||
+h(2, "attributes") Attributes
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code strings]
|
|
||||||
+cell #[code StringStore]
|
|
||||||
+cell A table managing the string-to-int mapping.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code vectors_length]
|
|
||||||
+cell int
|
|
||||||
+cell The dimensionality of the word vectors, if present.
|
|
||||||
|
|
||||||
+h(2, "load") Vocab.load
|
|
||||||
+tag classmethod
|
|
||||||
|
|
||||||
p Load the vocabulary from a path.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code path]
|
|
||||||
+cell #[code Path]
|
|
||||||
+cell The path to load from.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code lex_attr_getters]
|
|
||||||
+cell dict
|
|
||||||
+cell
|
|
||||||
| A dictionary mapping attribute IDs to functions to compute them.
|
|
||||||
| Defaults to #[code None].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code lemmatizer]
|
|
||||||
+cell -
|
|
||||||
+cell A lemmatizer. Defaults to #[code None].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code tag_map]
|
|
||||||
+cell dict
|
|
||||||
+cell
|
|
||||||
| A dictionary mapping fine-grained tags to coarse-grained
|
|
||||||
| parts-of-speech, and optionally morphological attributes.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code oov_prob]
|
|
||||||
+cell float
|
|
||||||
+cell The default probability for out-of-vocabulary words.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code Vocab]
|
|
||||||
+cell The newly constructed object.
|
|
||||||
|
|
||||||
+h(2, "init") Vocab.__init__
|
+h(2, "init") Vocab.__init__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
|
@ -73,11 +20,6 @@ p Create the vocabulary.
|
||||||
| A dictionary mapping attribute IDs to functions to compute them.
|
| A dictionary mapping attribute IDs to functions to compute them.
|
||||||
| Defaults to #[code None].
|
| Defaults to #[code None].
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code lemmatizer]
|
|
||||||
+cell -
|
|
||||||
+cell A lemmatizer. Defaults to #[code None].
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code tag_map]
|
+cell #[code tag_map]
|
||||||
+cell dict
|
+cell dict
|
||||||
|
@ -86,9 +28,16 @@ p Create the vocabulary.
|
||||||
| parts-of-speech, and optionally morphological attributes.
|
| parts-of-speech, and optionally morphological attributes.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code oov_prob]
|
+cell #[code lemmatizer]
|
||||||
+cell float
|
+cell object
|
||||||
+cell The default probability for out-of-vocabulary words.
|
+cell A lemmatizer. Defaults to #[code None].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code strings]
|
||||||
|
+cell #[code StringStore]
|
||||||
|
+cell
|
||||||
|
| A #[code StringStore] that maps strings to integers, and vice
|
||||||
|
| versa.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
|
@ -98,7 +47,11 @@ p Create the vocabulary.
|
||||||
+h(2, "len") Vocab.__len__
|
+h(2, "len") Vocab.__len__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Get the number of lexemes in the vocabulary.
|
p Get the current number of lexemes in the vocabulary.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'This is a sentence.')
|
||||||
|
assert len(nlp.vocab) > 0
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
|
@ -113,6 +66,10 @@ p
|
||||||
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
| unseen unicode string is given, a new lexeme is created and stored.
|
| unseen unicode string is given, a new lexeme is created and stored.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
apple = nlp.vocab.strings['apple']
|
||||||
|
assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code id_or_string]
|
+cell #[code id_or_string]
|
||||||
|
@ -129,6 +86,9 @@ p
|
||||||
|
|
||||||
p Iterate over the lexemes in the vocabulary.
|
p Iterate over the lexemes in the vocabulary.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell yields
|
+cell yields
|
||||||
|
@ -138,7 +98,16 @@ p Iterate over the lexemes in the vocabulary.
|
||||||
+h(2, "contains") Vocab.__contains__
|
+h(2, "contains") Vocab.__contains__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Check whether the string has an entry in the vocabulary.
|
p
|
||||||
|
| Check whether the string has an entry in the vocabulary. To get the ID
|
||||||
|
| for a given string, you need to look it up in
|
||||||
|
| #[+api("vocab#attributes") #[code vocab.strings]].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
apple = nlp.vocab.strings['apple']
|
||||||
|
oov = nlp.vocab.strings['dskfodkfos']
|
||||||
|
assert apple in nlp.vocab
|
||||||
|
assert oov not in nlp.vocab
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -151,28 +120,23 @@ p Check whether the string has an entry in the vocabulary.
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell Whether the string has an entry in the vocabulary.
|
+cell Whether the string has an entry in the vocabulary.
|
||||||
|
|
||||||
+h(2, "resize_vectors") Vocab.resize_vectors
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p
|
|
||||||
| Set #[code vectors_length] to a new size, and allocate more memory for
|
|
||||||
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code new_size]
|
|
||||||
+cell int
|
|
||||||
+cell The new size of the vectors.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
||||||
+h(2, "add_flag") Vocab.add_flag
|
+h(2, "add_flag") Vocab.add_flag
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Set a new boolean flag to words in the vocabulary.
|
p
|
||||||
|
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
|
||||||
|
| function will be called over the words currently in the vocab, and then
|
||||||
|
| applied to new words as they occur. You'll then be able to access the flag
|
||||||
|
| value on each token, using #[code token.check_flag(flag_id)].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
def is_my_product(text):
|
||||||
|
products = [u'spaCy', u'Thinc', u'displaCy']
|
||||||
|
return text in products
|
||||||
|
|
||||||
|
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
|
||||||
|
doc = nlp(u'I like spaCy')
|
||||||
|
assert doc[2].check_flag(MY_PRODUCT) == True
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -193,86 +157,124 @@ p Set a new boolean flag to words in the vocabulary.
|
||||||
+cell int
|
+cell int
|
||||||
+cell The integer ID by which the flag value can be checked.
|
+cell The integer ID by which the flag value can be checked.
|
||||||
|
|
||||||
+h(2, "dump") Vocab.dump
|
+h(2, "resize_vectors") Vocab.resize_vectors
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Save the lexemes binary data to the given location.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code loc]
|
|
||||||
+cell #[code Path]
|
|
||||||
+cell The path to load from.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
||||||
+h(2, "load_lexemes") Vocab.load_lexemes
|
|
||||||
+tag method
|
+tag method
|
||||||
|
+tag-model("vectors")
|
||||||
|
|
||||||
p
|
p
|
||||||
|
| Set #[code vectors_length] to a new size, and allocate more memory for
|
||||||
|
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code loc]
|
+cell #[code new_size]
|
||||||
+cell unicode
|
+cell int
|
||||||
+cell Path to load the lexemes.bin file from.
|
+cell The new size of the vectors.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code None]
|
+cell #[code None]
|
||||||
+cell -
|
+cell -
|
||||||
|
|
||||||
+h(2, "dump_vectors") Vocab.dump_vectors
|
+h(2, "to_disk") Vocab.to_disk
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Save the word vectors to a binary file.
|
p Save the current state to a directory.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp.vocab.to_disk('/path/to/vocab')
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code loc]
|
+cell #[code path]
|
||||||
+cell #[code Path]
|
+cell unicode or #[code Path]
|
||||||
+cell The path to save to.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
||||||
+h(2, "load_vectors") Vocab.load_vectors
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Load vectors from a text-based file.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code file_]
|
|
||||||
+cell buffer
|
|
||||||
+cell
|
+cell
|
||||||
| The file to read from. Entries should be separated by newlines,
|
| A path to a directory, which will be created if it doesn't exist.
|
||||||
| and each entry should be whitespace delimited. The first value
|
| Paths may be either strings or #[code Path]-like objects.
|
||||||
| of the entry should be the word string, and subsequent entries
|
|
||||||
| should be the values of the vector.
|
|
||||||
|
|
||||||
+footrow
|
+h(2, "from_disk") Vocab.from_disk
|
||||||
+cell returns
|
|
||||||
+cell int
|
|
||||||
+cell The length of the vectors loaded.
|
|
||||||
|
|
||||||
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
|
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Load vectors from the location of a binary file.
|
p Load state from a directory. The object is modified in place and returned.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
vocab = Vocab().from_disk('/path/to/vocab')
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code loc]
|
+cell #[code path]
|
||||||
+cell unicode
|
+cell unicode or #[code Path]
|
||||||
+cell The path of the binary file to load from.
|
+cell
|
||||||
|
| A path to a directory. Paths may be either strings or
|
||||||
|
| #[code Path]-like objects.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
|
+cell #[code Vocab]
|
||||||
|
+cell The modified #[code Vocab] object.
|
||||||
|
|
||||||
|
+h(2, "to_bytes") Vocab.to_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
vocab_bytes = nlp.vocab.to_bytes()
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being serialized.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell bytes
|
||||||
|
+cell The serialized form of the #[code Vocab] object.
|
||||||
|
|
||||||
|
+h(2, "from_bytes") Vocab.from_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Load state from a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
vocab_bytes = nlp.vocab.to_bytes()
|
||||||
|
vocab = Vocab()
|
||||||
|
vocab.from_bytes(vocab_bytes)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code bytes_data]
|
||||||
|
+cell bytes
|
||||||
|
+cell The data to load from.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being loaded.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Vocab]
|
||||||
|
+cell The #[code Vocab] object.
|
||||||
|
|
||||||
|
+h(2, "attributes") Attributes
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
apple_id = nlp.vocab.strings['apple']
|
||||||
|
assert type(apple_id) == int
|
||||||
|
PERSON = nlp.vocab.strings['PERSON']
|
||||||
|
assert type(PERSON) == int
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code strings]
|
||||||
|
+cell #[code StringStore]
|
||||||
|
+cell A table managing the string-to-int mapping.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code vectors_length]
|
||||||
+cell int
|
+cell int
|
||||||
+cell The length of the vectors loaded.
|
+cell The dimensionality of the word vectors, if present.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user