mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.
This commit is contained in:
parent
db3f26a51b
commit
7572e31f5e
|
@ -58,6 +58,7 @@ cdef class Token:
|
|||
cdef const TokenC* c
|
||||
cdef readonly int i
|
||||
cdef int array_len
|
||||
cdef bint _owns_c_data
|
||||
|
||||
|
||||
cdef list _py
|
||||
|
@ -86,3 +87,5 @@ cdef class Token:
|
|||
self._dep_strings = dep_strings
|
||||
py_tokens[offset] = self
|
||||
return self
|
||||
|
||||
cdef int take_ownership_of_c_data(self) except -1
|
||||
|
|
328
spacy/tokens.pyx
328
spacy/tokens.pyx
|
@ -17,6 +17,9 @@ import numpy
|
|||
|
||||
cimport cython
|
||||
|
||||
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||
from libc.string cimport memcpy
|
||||
import sys
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
@ -92,6 +95,21 @@ cdef class Tokens:
|
|||
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
||||
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
||||
|
||||
def __dealloc__(self):
|
||||
# The Token object initially only gets a view of the underlying C
|
||||
# data --- it doesn't own it. But, if we have Token objects that are
|
||||
# going to outlive this instance, those objects need a copy of the C
|
||||
# data.
|
||||
cdef Token token
|
||||
if self._py_tokens is not None:
|
||||
for token in self._py_tokens:
|
||||
if token is not None:
|
||||
# Why 3? 1 for the entry in the _py_tokens list,
|
||||
# and 1 for this reference. If we have _another_ ref, then
|
||||
# the token will live, and needs to own its data.
|
||||
if sys.getrefcount(token) >= 3:
|
||||
token.take_ownership_of_c_data()
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Retrieve a token.
|
||||
|
||||
|
@ -139,8 +157,6 @@ cdef class Tokens:
|
|||
self._py_tokens.append(None)
|
||||
return idx + t.lex.length
|
||||
|
||||
|
||||
|
||||
@cython.boundscheck(False)
|
||||
cpdef long[:,:] to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||
|
@ -234,196 +250,208 @@ cdef class Tokens:
|
|||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token."""
|
||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||
via Tokens.__getitem__ and Tokens.__iter__.
|
||||
"""
|
||||
def __cinit__(self, Vocab vocab, unicode string):
|
||||
self.vocab = vocab
|
||||
self._string = string
|
||||
|
||||
def __dealloc__(self):
|
||||
if self._owns_c_data:
|
||||
# Cast through const, if we own the data
|
||||
PyMem_Free(<void*>self.c)
|
||||
|
||||
def __len__(self):
|
||||
return self.c.lex.length
|
||||
|
||||
def __unicode__(self):
|
||||
return self.string
|
||||
|
||||
cdef int take_ownership_of_c_data(self) except -1:
|
||||
owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
|
||||
memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
|
||||
self.c = owned_data
|
||||
self._owns_c_data = True
|
||||
|
||||
def nbor(self, int i=1):
|
||||
return Token.cinit(self.vocab, self._string,
|
||||
self.c, self.i, self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
|
||||
@property
|
||||
def string(self):
|
||||
cdef int next_idx = (self.c + 1).idx
|
||||
if next_idx < self.c.idx:
|
||||
next_idx = self.c.idx + self.c.lex.length
|
||||
return self._string[self.c.idx:next_idx]
|
||||
property string:
|
||||
def __get__(self):
|
||||
cdef int next_idx = (self.c + 1).idx
|
||||
if next_idx < self.c.idx:
|
||||
next_idx = self.c.idx + self.c.lex.length
|
||||
return self._string[self.c.idx:next_idx]
|
||||
|
||||
@property
|
||||
def prob(self):
|
||||
return self.c.lex.prob
|
||||
property prob:
|
||||
def __get__(self):
|
||||
return self.c.lex.prob
|
||||
|
||||
@property
|
||||
def idx(self):
|
||||
return self.c.idx
|
||||
property idx:
|
||||
def __get__(self):
|
||||
return self.c.idx
|
||||
|
||||
@property
|
||||
def cluster(self):
|
||||
return self.c.lex.cluster
|
||||
property cluster:
|
||||
def __get__(self):
|
||||
return self.c.lex.cluster
|
||||
|
||||
@property
|
||||
def cluster(self):
|
||||
return self.c.lex.cluster
|
||||
property orth:
|
||||
def __get__(self):
|
||||
return self.c.lex.orth
|
||||
|
||||
@property
|
||||
def orth(self):
|
||||
return self.c.lex.orth
|
||||
property lower:
|
||||
def __get__(self):
|
||||
return self.c.lex.lower
|
||||
|
||||
@property
|
||||
def lower(self):
|
||||
return self.c.lex.lower
|
||||
property norm:
|
||||
def __get__(self):
|
||||
return self.c.lex.norm
|
||||
|
||||
@property
|
||||
def norm(self):
|
||||
return self.c.lex.norm
|
||||
property shape:
|
||||
def __get__(self):
|
||||
return self.c.lex.shape
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
return self.c.lex.shape
|
||||
property prefix:
|
||||
def __get__(self):
|
||||
return self.c.lex.prefix
|
||||
|
||||
@property
|
||||
def prefix(self):
|
||||
return self.c.lex.prefix
|
||||
property suffix:
|
||||
def __get__(self):
|
||||
return self.c.lex.suffix
|
||||
|
||||
@property
|
||||
def suffix(self):
|
||||
return self.c.lex.suffix
|
||||
property lemma:
|
||||
def __get__(self):
|
||||
return self.c.lemma
|
||||
|
||||
@property
|
||||
def lemma(self):
|
||||
return self.c.lemma
|
||||
property pos:
|
||||
def __get__(self):
|
||||
return self.c.pos
|
||||
|
||||
@property
|
||||
def pos(self):
|
||||
return self.c.pos
|
||||
property tag:
|
||||
def __get__(self):
|
||||
return self.c.tag
|
||||
|
||||
@property
|
||||
def tag(self):
|
||||
return self.c.tag
|
||||
property dep:
|
||||
def __get__(self):
|
||||
return self.c.dep
|
||||
|
||||
@property
|
||||
def dep(self):
|
||||
return self.c.dep
|
||||
property repvec:
|
||||
def __get__(self):
|
||||
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
|
||||
|
||||
@property
|
||||
def repvec(self):
|
||||
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
|
||||
|
||||
@property
|
||||
def n_lefts(self):
|
||||
cdef int n = 0
|
||||
cdef const TokenC* ptr = self.c - self.i
|
||||
while ptr != self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
n += 1
|
||||
ptr += 1
|
||||
return n
|
||||
|
||||
@property
|
||||
def n_rights(self):
|
||||
cdef int n = 0
|
||||
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
|
||||
while ptr != self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
n += 1
|
||||
ptr -= 1
|
||||
return n
|
||||
|
||||
@property
|
||||
def lefts(self):
|
||||
"""The leftward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef const TokenC* ptr = self.c - self.i
|
||||
while ptr < self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
|
||||
ptr += ptr.head
|
||||
|
||||
elif ptr + ptr.head == self.c:
|
||||
yield Token.cinit(self.vocab, self._string,
|
||||
ptr, ptr - (self.c - self.i), self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
ptr += 1
|
||||
else:
|
||||
property n_lefts:
|
||||
def __get__(self):
|
||||
cdef int n = 0
|
||||
cdef const TokenC* ptr = self.c - self.i
|
||||
while ptr != self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
n += 1
|
||||
ptr += 1
|
||||
return n
|
||||
|
||||
@property
|
||||
def rights(self):
|
||||
"""The rightward immediate children of the word, in the syntactic
|
||||
dependency parse."""
|
||||
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
|
||||
while ptr > self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
|
||||
ptr += ptr.head
|
||||
elif ptr + ptr.head == self.c:
|
||||
yield Token.cinit(self.vocab, self._string,
|
||||
ptr, ptr - (self.c - self.i), self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
ptr -= 1
|
||||
else:
|
||||
property n_rights:
|
||||
def __get__(self):
|
||||
cdef int n = 0
|
||||
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
|
||||
while ptr != self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
n += 1
|
||||
ptr -= 1
|
||||
return n
|
||||
|
||||
@property
|
||||
def head(self):
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
return Token.cinit(self.vocab, self._string,
|
||||
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
property lefts:
|
||||
def __get__(self):
|
||||
"""The leftward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef const TokenC* ptr = self.c - self.i
|
||||
while ptr < self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
|
||||
ptr += ptr.head
|
||||
|
||||
@property
|
||||
def whitespace_(self):
|
||||
return self.string[self.c.lex.length:]
|
||||
elif ptr + ptr.head == self.c:
|
||||
yield Token.cinit(self.vocab, self._string,
|
||||
ptr, ptr - (self.c - self.i), self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
ptr += 1
|
||||
else:
|
||||
ptr += 1
|
||||
|
||||
@property
|
||||
def orth_(self):
|
||||
return self.vocab.strings[self.c.lex.orth]
|
||||
property rights:
|
||||
def __get__(self):
|
||||
"""The rightward immediate children of the word, in the syntactic
|
||||
dependency parse."""
|
||||
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
|
||||
while ptr > self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
|
||||
ptr += ptr.head
|
||||
elif ptr + ptr.head == self.c:
|
||||
yield Token.cinit(self.vocab, self._string,
|
||||
ptr, ptr - (self.c - self.i), self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
ptr -= 1
|
||||
else:
|
||||
ptr -= 1
|
||||
|
||||
@property
|
||||
def lower_(self):
|
||||
return self.vocab.strings[self.c.lex.lower]
|
||||
property head:
|
||||
def __get__(self):
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
return Token.cinit(self.vocab, self._string,
|
||||
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
||||
self._py, self._tag_strings, self._dep_strings)
|
||||
|
||||
@property
|
||||
def norm_(self):
|
||||
return self.vocab.strings[self.c.lex.norm]
|
||||
property whitespace_:
|
||||
def __get__(self):
|
||||
return self.string[self.c.lex.length:]
|
||||
|
||||
@property
|
||||
def shape_(self):
|
||||
return self.vocab.strings[self.c.lex.shape]
|
||||
property orth_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.orth]
|
||||
|
||||
@property
|
||||
def prefix_(self):
|
||||
return self.vocab.strings[self.c.lex.prefix]
|
||||
property lower_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.lower]
|
||||
|
||||
@property
|
||||
def suffix_(self):
|
||||
return self.vocab.strings[self.c.lex.suffix]
|
||||
property norm_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.norm]
|
||||
|
||||
@property
|
||||
def lemma_(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
property shape_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.shape]
|
||||
|
||||
@property
|
||||
def pos_(self):
|
||||
return _pos_id_to_string[self.c.pos]
|
||||
property prefix_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.prefix]
|
||||
|
||||
@property
|
||||
def tag_(self):
|
||||
return self._tag_strings[self.c.tag]
|
||||
property suffix_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.suffix]
|
||||
|
||||
@property
|
||||
def dep_(self):
|
||||
return self._dep_strings[self.c.dep]
|
||||
property lemma_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
|
||||
property pos_:
|
||||
def __get__(self):
|
||||
return _pos_id_to_string[self.c.pos]
|
||||
|
||||
property tag_:
|
||||
def __get__(self):
|
||||
return self._tag_strings[self.c.tag]
|
||||
|
||||
property dep_:
|
||||
def __get__(self):
|
||||
return self._dep_strings[self.c.dep]
|
||||
|
||||
|
||||
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||
|
|
24
tests/test_token_references.py
Normal file
24
tests/test_token_references.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import gc
|
||||
|
||||
from spacy.en import English
|
||||
|
||||
|
||||
def get_orphan_token(text, i):
|
||||
nlp = English()
|
||||
tokens = nlp(text)
|
||||
gc.collect()
|
||||
token = tokens[i]
|
||||
del tokens
|
||||
return token
|
||||
|
||||
|
||||
def test_orphan():
|
||||
orphan = get_orphan_token('An orphan token', 1)
|
||||
gc.collect()
|
||||
dummy = get_orphan_token('Load and flush the memory', 0)
|
||||
dummy = get_orphan_token('Load again...', 0)
|
||||
assert orphan.orth_ == 'orphan'
|
||||
assert orphan.pos_ == 'ADJ'
|
||||
assert orphan.head.orth_ == 'token'
|
Loading…
Reference in New Issue
Block a user