diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index d7d27a3e4..b6418bc43 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -26,15 +26,6 @@ from . import attrs
 from . import symbols
 
 
-DEF MAX_VEC_SIZE = 100000
-
-
-cdef float[MAX_VEC_SIZE] EMPTY_VEC
-memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
-memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
-EMPTY_LEXEME.vector = EMPTY_VEC
-
-
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying
@@ -179,7 +170,6 @@ cdef class Vocab:
         lex.orth = self.strings[string]
         lex.length = len(string)
         lex.id = self.length
-        lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
@@ -258,6 +248,26 @@ cdef class Vocab:
             Token.set_struct_attr(token, attr_id, value)
         return tokens
 
+    def get_vector(self, orth):
+        """Retrieve a vector for a word in the vocabulary.
+
+        Words can be looked up by string or int ID.
+
+        RETURNS:
+            A word vector. Size and shape determined by the
+            vocab.vectors instance. Usually, a numpy ndarray
+            of shape (300,) and dtype float32.
+
+        RAISES: If no vectors data is loaded, ValueError is raised.
+        """
+        raise NotImplementedError
+
+    def has_vector(self, orth):
+        """Check whether a word has a vector. Returns False if no
+        vectors have been loaded. Words can be looked up by string
+        or int ID."""
+        raise NotImplementedError
+
     def to_disk(self, path):
         """Save the current state to a directory.
 
@@ -271,9 +281,6 @@ cdef class Vocab:
         with strings_loc.open('w', encoding='utf8') as file_:
             self.strings.dump(file_)
 
-        # TODO: pickle
-        # self.dump(path / 'lexemes.bin')
-
     def from_disk(self, path):
         """Loads state from a directory. Modifies the object in place and
         returns it.
@@ -346,7 +353,6 @@ cdef class Vocab:
                 lex_data.data[j] = bytes_ptr[i+j]
             Lexeme.c_from_bytes(lexeme, lex_data)
-            lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]
             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
             key = hash_string(py_str)
@@ -354,172 +360,6 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
-    # Deprecated --- delete these once stable
-
-    def dump_vectors(self, out_loc):
-        """Save the word vectors to a binary file.
-
-        loc (Path): The path to save to.
-        """
-        cdef int32_t vec_len = self.vectors_length
-        cdef int32_t word_len
-        cdef bytes word_str
-        cdef char* chars
-
-        cdef Lexeme lexeme
-        cdef CFile out_file = CFile(out_loc, 'wb')
-        for lexeme in self:
-            word_str = lexeme.orth_.encode('utf8')
-            vec = lexeme.c.vector
-            word_len = len(word_str)
-
-            out_file.write_from(&word_len, 1, sizeof(word_len))
-            out_file.write_from(&vec_len, 1, sizeof(vec_len))
-
-            chars = <char*>word_str
-            out_file.write_from(chars, word_len, sizeof(char))
-            out_file.write_from(vec, vec_len, sizeof(float))
-        out_file.close()
-
-
-
-    def load_vectors(self, file_):
-        """Load vectors from a text-based file.
-
-        file_ (buffer): The file to read from. Entries should be separated by
-            newlines, and each entry should be whitespace delimited. The first value of the entry
-            should be the word string, and subsequent entries should be the values of the
-            vector.
-
-        RETURNS (int): The length of the vectors loaded.
- """ - cdef LexemeC* lexeme - cdef attr_t orth - cdef int32_t vec_len = -1 - cdef double norm = 0.0 - - whitespace_pattern = re.compile(r'\s', re.UNICODE) - - for line_num, line in enumerate(file_): - pieces = line.split() - word_str = " " if whitespace_pattern.match(line) else pieces.pop(0) - if vec_len == -1: - vec_len = len(pieces) - elif vec_len != len(pieces): - raise VectorReadError.mismatched_sizes(file_, line_num, - vec_len, len(pieces)) - orth = self.strings[word_str] - lexeme = self.get_by_orth(self.mem, orth) - lexeme.vector = self.mem.alloc(vec_len, sizeof(float)) - for i, val_str in enumerate(pieces): - lexeme.vector[i] = float(val_str) - norm = 0.0 - for i in range(vec_len): - norm += lexeme.vector[i] * lexeme.vector[i] - lexeme.l2_norm = sqrt(norm) - self.vectors_length = vec_len - return vec_len - - def load_vectors_from_bin_loc(self, loc): - """Load vectors from the location of a binary file. - - loc (unicode): The path of the binary file to load from. - - RETURNS (int): The length of the vectors loaded. - """ - cdef CFile file_ = CFile(loc, b'rb') - cdef int32_t word_len - cdef int32_t vec_len = 0 - cdef int32_t prev_vec_len = 0 - cdef float* vec - cdef Address mem - cdef attr_t string_id - cdef bytes py_word - cdef vector[float*] vectors - cdef int line_num = 0 - cdef Pool tmp_mem = Pool() - while True: - try: - file_.read_into(&word_len, sizeof(word_len), 1) - except IOError: - break - file_.read_into(&vec_len, sizeof(vec_len), 1) - if prev_vec_len != 0 and vec_len != prev_vec_len: - raise VectorReadError.mismatched_sizes(loc, line_num, - vec_len, prev_vec_len) - if 0 >= vec_len >= MAX_VEC_SIZE: - raise VectorReadError.bad_size(loc, vec_len) - - chars = file_.alloc_read(tmp_mem, word_len, sizeof(char)) - vec = file_.alloc_read(self.mem, vec_len, sizeof(float)) - - string_id = self.strings[chars[:word_len]] - # Insert words into vocab to add vector. - self.get_by_orth(self.mem, string_id) - while string_id >= vectors.size(): - vectors.push_back(EMPTY_VEC) - assert vec != NULL - vectors[string_id] = vec - line_num += 1 - cdef LexemeC* lex - cdef size_t lex_addr - cdef double norm = 0.0 - cdef int i - for orth, lex_addr in self._by_orth.items(): - lex = lex_addr - if lex.lower < vectors.size(): - lex.vector = vectors[lex.lower] - norm = 0.0 - for i in range(vec_len): - norm += lex.vector[i] * lex.vector[i] - lex.l2_norm = sqrt(norm) - else: - lex.vector = EMPTY_VEC - self.vectors_length = vec_len - return vec_len - - - def resize_vectors(self, int new_size): - """Set vectors_length to a new size, and allocate more memory for the - `Lexeme` vectors if necessary. The memory will be zeroed. - - new_size (int): The new size of the vectors. 
- """ - cdef hash_t key - cdef size_t addr - if new_size > self.vectors_length: - for key, addr in self._by_hash.items(): - lex = addr - lex.vector = self.mem.realloc(lex.vector, - new_size * sizeof(lex.vector[0])) - self.vectors_length = new_size - - -def write_binary_vectors(in_loc, out_loc): - cdef CFile out_file = CFile(out_loc, 'wb') - cdef Address mem - cdef int32_t word_len - cdef int32_t vec_len - cdef char* chars - with bz2.BZ2File(in_loc, 'r') as file_: - for line in file_: - pieces = line.split() - word = pieces.pop(0) - mem = Address(len(pieces), sizeof(float)) - vec = mem.ptr - for i, val_str in enumerate(pieces): - vec[i] = float(val_str) - - word_len = len(word) - vec_len = len(pieces) - - out_file.write_from(&word_len, 1, sizeof(word_len)) - out_file.write_from(&vec_len, 1, sizeof(vec_len)) - - chars = word - out_file.write_from(chars, len(word), sizeof(char)) - out_file.write_from(vec, vec_len, sizeof(float)) - def pickle_vocab(vocab): sstore = vocab.strings @@ -567,21 +407,3 @@ class LookupError(Exception): "ID of orth: {orth_id}".format( query=repr(original_string), orth_str=repr(id_string), orth_id=id_) ) - - -class VectorReadError(Exception): - @classmethod - def mismatched_sizes(cls, loc, line_num, prev_size, curr_size): - return cls( - "Error reading word vectors from %s on line %d.\n" - "All vectors must be the same size.\n" - "Prev size: %d\n" - "Curr size: %d" % (loc, line_num, prev_size, curr_size)) - - @classmethod - def bad_size(cls, loc, size): - return cls( - "Error reading word vectors from %s.\n" - "Vector size: %d\n" - "Max size: %d\n" - "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))