* Go back to having Token reference the Doc, instead of complicated ownership gymnastics. Rename the attr to 'doc', to expose it in the API

Matthew Honnibal 2015-07-14 00:10:11 +02:00
parent e1c702e498
commit 81aa4e6dcc
3 changed files with 27 additions and 52 deletions
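
This change drops the earlier ownership gymnastics, where a dying Doc made every live Token copy the TokenC array via take_ownership_of_c_data(). Instead, each Token now holds a reference to its Doc, so ordinary reference counting keeps the underlying C data alive for as long as any Token exists, and the Doc caches the Token objects it hands out in _py_tokens. A minimal pure-Python sketch of the same pattern (the Container/View names are illustrative, not spaCy's API):

class Container:
    """Owns the storage; hands out cached, identity-stable views."""
    def __init__(self, items):
        self._data = list(items)           # the storage this object owns
        self._views = [None] * len(items)  # at most one view per index

    def __len__(self):
        return len(self._data)

    def __getitem__(self, i):
        if i < 0:
            i += len(self._data)
        if not 0 <= i < len(self._data):
            raise IndexError("Attempt to access item at %d, max length %d"
                             % (i, len(self._data)))
        if self._views[i] is None:          # create lazily, then cache,
            self._views[i] = View(self, i)  # as Doc does with _py_tokens
        return self._views[i]


class View:
    """A lightweight handle, like Token: it owns no data of its own."""
    def __init__(self, container, i):
        self.container = container  # back-reference keeps storage alive
        self.i = i

    @property
    def value(self):
        return self.container._data[self.i]

    def nbor(self, i=1):
        # Resolve neighbours through the parent container, as Token.nbor
        # now does with self.doc[self.i + i]
        return self.container[self.i + i]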

View File: doc.pyx

@@ -89,12 +89,6 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
 
-    def __dealloc__(self):
-        cdef Token token
-        if self._py_tokens is not None:
-            for token in self._py_tokens:
-                token.take_ownership_of_c_data()
-
     def __getitem__(self, object i):
         """Get a token.
@@ -110,7 +104,10 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
+        if self._py_tokens[i] is not None:
+            return self._py_tokens[i]
+        else:
+            return Token.cinit(self.vocab, &self.data[i], i, self)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -119,7 +116,7 @@ cdef class Doc:
         token (Token):
         """
         for i in range(self.length):
-            yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
+            yield Token.cinit(self.vocab, &self.data[i], i, self)
 
     def __len__(self):
         return self.length
@@ -172,7 +169,6 @@ cdef class Doc:
         Yield a list of sentence Span objects, calculated from the dependency parse.
         """
         cdef int i
-        cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
         start = 0
         for i in range(1, self.length):
             if self.data[i].sent_start:
@@ -288,9 +284,10 @@ cdef class Doc:
                 break
         else:
             return None
+        cdef unicode string = self.string
         # Get LexemeC for newly merged token
        cdef UniStr new_orth_c
-        slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
+        slice_unicode(&new_orth_c, string, start_idx, end_idx)
         cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
         # House the new merged token where it starts
         cdef TokenC* token = &self.data[start]
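
A short usage run of the sketch above shows why the __getitem__ caching matters: indexing is identity-stable, and views outlive the last external reference to their container.

c = Container("abc")
v = c[0]
assert c[0] is v              # the _views cache makes indexing identity-stable
assert v.nbor().value == "b"  # neighbours come back through the container
del c                         # drop the only external reference...
assert v.value == "a"         # ...the back-reference keeps the data alive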

View File: token.pxd

@@ -1,6 +1,7 @@
 from ..vocab cimport Vocab
 from ..structs cimport TokenC
 from ..typedefs cimport attr_id_t
+from .doc cimport Doc
 
 
 cdef class Token:
@@ -8,25 +9,17 @@ cdef class Token:
     cdef const TokenC* c
     cdef readonly int i
     cdef int array_len
-    cdef bint _owns_c_data
-    cdef list _py_tokens
+    cdef readonly Doc doc
 
     @staticmethod
-    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len,
-                            list _py_tokens):
-        if offset < 0 or offset >= array_len:
+    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
+        if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
-            raise IndexError(msg % (offset, array_len))
-        if _py_tokens[offset] != None:
-            return _py_tokens[offset]
-        cdef Token self = Token.__new__(Token, vocab)
-        self.c = token
-        self.i = offset
-        self.array_len = array_len
-        self._py_tokens = _py_tokens
-        self._py_tokens[offset] = self
+            raise IndexError(msg % (offset, doc.length))
+        if doc._py_tokens[offset] != None:
+            return doc._py_tokens[offset]
+        cdef Token self = Token.__new__(Token, vocab, doc, offset)
+        doc._py_tokens[offset] = self
         return self
 
-    cdef int take_ownership_of_c_data(self) except -1
-
     cpdef bint check_flag(self, attr_id_t flag_id) except -1

View File: token.pyx

@@ -21,14 +21,12 @@ cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
     via Doc.__getitem__ and Doc.__iter__.
     """
-    def __cinit__(self, Vocab vocab):
+    def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab
-        self._py_tokens = []
-
-    def __dealloc__(self):
-        if self._owns_c_data:
-            # Cast through const, if we own the data
-            PyMem_Free(<void*>self.c)
+        self.doc = doc
+        self.c = &self.doc.data[offset]
+        self.i = offset
+        self.array_len = doc.length
 
     def __len__(self):
         return self.c.lex.length
@@ -39,14 +37,8 @@ cdef class Token:
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         return check_flag(self.c.lex, flag_id)
 
-    cdef int take_ownership_of_c_data(self) except -1:
-        owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
-        memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
-        self.c = owned_data
-        self._owns_c_data = True
-
     def nbor(self, int i=1):
-        return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens)
+        return self.doc[self.i+i]
 
     property lex_id:
         def __get__(self):
@@ -152,8 +144,7 @@ cdef class Token:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
-                    yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                      self.array_len, self._py_tokens)
+                    yield self.doc[ptr - (self.c - self.i)]
                     ptr += 1
                 else:
                     ptr += 1
@@ -171,8 +162,7 @@ cdef class Token:
                 if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
-                    tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                              self.array_len, self._py_tokens))
+                    tokens.append(self.doc[ptr - (self.c - self.i)])
                     ptr -= 1
                 else:
                     ptr -= 1
@@ -195,21 +185,16 @@ cdef class Token:
     property left_edge:
         def __get__(self):
-            return Token.cinit(self.vocab,
-                               (self.c - self.i) + self.c.l_edge, self.c.l_edge,
-                               self.array_len, self._py_tokens)
+            return self.doc[self.c.l_edge]
 
     property right_edge:
         def __get__(self):
-            return Token.cinit(self.vocab,
-                               (self.c - self.i) + self.c.r_edge, self.c.r_edge,
-                               self.array_len, self._py_tokens)
+            return self.doc[self.c.r_edge]
 
     property head:
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
-            return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head,
-                               self.array_len, self._py_tokens)
+            return self.doc[self.i + self.c.head]
 
     property conjuncts:
         def __get__(self):
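
With the readonly doc attribute exposed and nbor, left_edge, right_edge and head all routed through self.doc[...], the same identity guarantees hold at the API surface. A sketch of the resulting usage, assuming the spacy.en.English entry point of this era (model setup omitted):

from spacy.en import English

nlp = English()
doc = nlp(u'Give it back')
token = doc[0]
assert token.doc is doc        # the renamed, readonly 'doc' attribute
assert token.nbor() is doc[1]  # nbor() now just indexes back into the Doc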