From de32515bf895a89476b07b98da67691270f0f269 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:15:21 +0200 Subject: [PATCH] Allocate Doc before starting to add words --- spacy/tokens/doc.pyx | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f9e7c97dd..686f3be54 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -3,6 +3,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from libc.stdint cimport int32_t, uint64_t from collections import Counter import numpy @@ -186,7 +187,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#init """ self.vocab = vocab - size = 20 + size = max(20, (len(words) if words is not None else 0)) self.mem = Pool() # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can @@ -211,7 +212,6 @@ cdef class Doc: self.user_data = {} if user_data is None else user_data self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) - cdef unicode orth cdef bint has_space if orths_and_spaces is None and words is not None: if spaces is None: @@ -219,19 +219,22 @@ cdef class Doc: elif len(spaces) != len(words): raise ValueError(Errors.E027) orths_and_spaces = zip(words, spaces) + cdef const LexemeC* lexeme if orths_and_spaces is not None: + orths_and_spaces = list(orths_and_spaces) for orth_space in orths_and_spaces: if isinstance(orth_space, unicode): - orth = orth_space + lexeme = self.vocab.get(self.mem, orth_space) has_space = True elif isinstance(orth_space, bytes): raise ValueError(Errors.E028.format(value=orth_space)) + elif isinstance(orth_space[0], unicode): + lexeme = self.vocab.get(self.mem, orth_space[0]) + has_space = orth_space[1] else: - orth, has_space = orth_space - # Note that we pass self.mem here --- we have ownership, if LexemeC - # must be created. - self.push_back( - self.vocab.get(self.mem, orth), has_space) + lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) + has_space = orth_space[1] + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -753,6 +756,8 @@ cdef class Doc: return dict(counts) def _realloc(self, new_size): + if new_size < self.max_length: + return self.max_length = new_size n = new_size + (PADDING * 2) # What we're storing is a "padded" array. We've jumped forward PADDING