mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Fix Doc.copy bugs (#6809)
* Don't let the Doc own LexemeC, to fix Doc.copy * Copy doc.spans * Copy doc.spans
This commit is contained in:
parent
0f2de39efb
commit
42b117e561
|
@ -33,6 +33,9 @@ class SpanGroups(UserDict):
|
||||||
def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
|
def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
|
||||||
return SpanGroup(self.doc_ref(), name=name, spans=spans)
|
return SpanGroup(self.doc_ref(), name=name, spans=spans)
|
||||||
|
|
||||||
|
def copy(self) -> "SpanGroups":
|
||||||
|
return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes())
|
||||||
|
|
||||||
def to_bytes(self) -> bytes:
|
def to_bytes(self) -> bytes:
|
||||||
# We don't need to serialize this as a dict, because the groups
|
# We don't need to serialize this as a dict, because the groups
|
||||||
# know their names.
|
# know their names.
|
||||||
|
|
|
@ -1187,6 +1187,7 @@ cdef class Doc:
|
||||||
other.user_span_hooks = dict(self.user_span_hooks)
|
other.user_span_hooks = dict(self.user_span_hooks)
|
||||||
other.length = self.length
|
other.length = self.length
|
||||||
other.max_length = self.max_length
|
other.max_length = self.max_length
|
||||||
|
other.spans = self.spans.copy()
|
||||||
buff_size = other.max_length + (PADDING*2)
|
buff_size = other.max_length + (PADDING*2)
|
||||||
assert buff_size > 0
|
assert buff_size > 0
|
||||||
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
|
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
|
||||||
|
|
|
@ -161,8 +161,16 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, self.strings[orth])
|
return self._new_lexeme(mem, self.strings[orth])
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||||
if len(string) < 3 or self.length < 10000:
|
# I think this heuristic is bad, and the Vocab should always
|
||||||
mem = self.mem
|
# own the lexemes. It avoids weird bugs this way, as it's how the thing
|
||||||
|
# was originally supposed to work. The best solution to the growing
|
||||||
|
# memory use is to periodically reset the vocab, which is an action
|
||||||
|
# that should be up to the user to do (so we don't need to keep track
|
||||||
|
# of the doc ownership).
|
||||||
|
# TODO: Change the C API so that the mem isn't passed in here.
|
||||||
|
mem = self.mem
|
||||||
|
#if len(string) < 3 or self.length < 10000:
|
||||||
|
# mem = self.mem
|
||||||
cdef bint is_oov = mem is not self.mem
|
cdef bint is_oov = mem is not self.mem
|
||||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
||||||
lex.orth = self.strings.add(string)
|
lex.orth = self.strings.add(string)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user