Allocate Doc before starting to add words

This commit is contained in:
Matthew Honnibal 2020-06-20 20:15:21 +02:00
parent be81577719
commit de32515bf8

View File

@@ -3,6 +3,7 @@ cimport cython
cimport numpy as np cimport numpy as np
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
from collections import Counter from collections import Counter
import numpy import numpy
@@ -186,7 +187,7 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#init DOCS: https://spacy.io/api/doc#init
""" """
self.vocab = vocab self.vocab = vocab
size = 20 size = max(20, (len(words) if words is not None else 0))
self.mem = Pool() self.mem = Pool()
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can # However, we need to remember the true starting places, so that we can
@@ -211,7 +212,6 @@ cdef class Doc:
self.user_data = {} if user_data is None else user_data self.user_data = {} if user_data is None else user_data
self._vector = None self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang) self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
cdef unicode orth
cdef bint has_space cdef bint has_space
if orths_and_spaces is None and words is not None: if orths_and_spaces is None and words is not None:
if spaces is None: if spaces is None:
@@ -219,19 +219,22 @@ cdef class Doc:
elif len(spaces) != len(words): elif len(spaces) != len(words):
raise ValueError(Errors.E027) raise ValueError(Errors.E027)
orths_and_spaces = zip(words, spaces) orths_and_spaces = zip(words, spaces)
cdef const LexemeC* lexeme
if orths_and_spaces is not None: if orths_and_spaces is not None:
orths_and_spaces = list(orths_and_spaces)
for orth_space in orths_and_spaces: for orth_space in orths_and_spaces:
if isinstance(orth_space, unicode): if isinstance(orth_space, unicode):
orth = orth_space lexeme = self.vocab.get(self.mem, orth_space)
has_space = True has_space = True
elif isinstance(orth_space, bytes): elif isinstance(orth_space, bytes):
raise ValueError(Errors.E028.format(value=orth_space)) raise ValueError(Errors.E028.format(value=orth_space))
elif isinstance(orth_space[0], unicode):
lexeme = self.vocab.get(self.mem, orth_space[0])
has_space = orth_space[1]
else: else:
orth, has_space = orth_space lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
# Note that we pass self.mem here --- we have ownership, if LexemeC has_space = orth_space[1]
# must be created. self.push_back(lexeme, has_space)
self.push_back(
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
# Tough to decide on policy for this. Is an empty doc tagged and parsed? # Tough to decide on policy for this. Is an empty doc tagged and parsed?
# There's no information we'd like to add to it, so I guess so? # There's no information we'd like to add to it, so I guess so?
if self.length == 0: if self.length == 0:
@@ -753,6 +756,8 @@ cdef class Doc:
return dict(counts) return dict(counts)
def _realloc(self, new_size): def _realloc(self, new_size):
if new_size < self.max_length:
return
self.max_length = new_size self.max_length = new_size
n = new_size + (PADDING * 2) n = new_size + (PADDING * 2)
# What we're storing is a "padded" array. We've jumped forward PADDING # What we're storing is a "padded" array. We've jumped forward PADDING