mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-12 07:15:48 +03:00
Allocate Doc before starting to add words
This commit is contained in:
parent
be81577719
commit
de32515bf8
|
@ -3,6 +3,7 @@ cimport cython
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
from libc.stdint cimport int32_t, uint64_t
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -186,7 +187,7 @@ cdef class Doc:
|
||||||
DOCS: https://spacy.io/api/doc#init
|
DOCS: https://spacy.io/api/doc#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
size = 20
|
size = max(20, (len(words) if words is not None else 0))
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
|
@ -211,7 +212,6 @@ cdef class Doc:
|
||||||
self.user_data = {} if user_data is None else user_data
|
self.user_data = {} if user_data is None else user_data
|
||||||
self._vector = None
|
self._vector = None
|
||||||
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||||
cdef unicode orth
|
|
||||||
cdef bint has_space
|
cdef bint has_space
|
||||||
if orths_and_spaces is None and words is not None:
|
if orths_and_spaces is None and words is not None:
|
||||||
if spaces is None:
|
if spaces is None:
|
||||||
|
@ -219,19 +219,22 @@ cdef class Doc:
|
||||||
elif len(spaces) != len(words):
|
elif len(spaces) != len(words):
|
||||||
raise ValueError(Errors.E027)
|
raise ValueError(Errors.E027)
|
||||||
orths_and_spaces = zip(words, spaces)
|
orths_and_spaces = zip(words, spaces)
|
||||||
|
cdef const LexemeC* lexeme
|
||||||
if orths_and_spaces is not None:
|
if orths_and_spaces is not None:
|
||||||
|
orths_and_spaces = list(orths_and_spaces)
|
||||||
for orth_space in orths_and_spaces:
|
for orth_space in orths_and_spaces:
|
||||||
if isinstance(orth_space, unicode):
|
if isinstance(orth_space, unicode):
|
||||||
orth = orth_space
|
lexeme = self.vocab.get(self.mem, orth_space)
|
||||||
has_space = True
|
has_space = True
|
||||||
elif isinstance(orth_space, bytes):
|
elif isinstance(orth_space, bytes):
|
||||||
raise ValueError(Errors.E028.format(value=orth_space))
|
raise ValueError(Errors.E028.format(value=orth_space))
|
||||||
|
elif isinstance(orth_space[0], unicode):
|
||||||
|
lexeme = self.vocab.get(self.mem, orth_space[0])
|
||||||
|
has_space = orth_space[1]
|
||||||
else:
|
else:
|
||||||
orth, has_space = orth_space
|
lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
|
||||||
# Note that we pass self.mem here --- we have ownership, if LexemeC
|
has_space = orth_space[1]
|
||||||
# must be created.
|
self.push_back(lexeme, has_space)
|
||||||
self.push_back(
|
|
||||||
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
|
|
||||||
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
|
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
|
||||||
# There's no information we'd like to add to it, so I guess so?
|
# There's no information we'd like to add to it, so I guess so?
|
||||||
if self.length == 0:
|
if self.length == 0:
|
||||||
|
@ -753,6 +756,8 @@ cdef class Doc:
|
||||||
return dict(counts)
|
return dict(counts)
|
||||||
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
|
if new_size < self.max_length:
|
||||||
|
return
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
# What we're storing is a "padded" array. We've jumped forward PADDING
|
# What we're storing is a "padded" array. We've jumped forward PADDING
|
||||||
|
|
Loading…
Reference in New Issue
Block a user