From 81aa4e6dcc3e521ef95f2f56c4a3b8dca10b4cbf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 14 Jul 2015 00:10:11 +0200 Subject: [PATCH] * Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API --- spacy/tokens/doc.pyx | 17 +++++++---------- spacy/tokens/token.pxd | 25 +++++++++---------------- spacy/tokens/token.pyx | 37 +++++++++++-------------------------- 3 files changed, 27 insertions(+), 52 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d8585e92d..8d6266dea 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -89,12 +89,6 @@ cdef class Doc: self.is_parsed = False self._py_tokens = [] - def __dealloc__(self): - cdef Token token - if self._py_tokens is not None: - for token in self._py_tokens: - token.take_ownership_of_c_data() - def __getitem__(self, object i): """Get a token. @@ -110,7 +104,10 @@ cdef class Doc: if i < 0: i = self.length + i bounds_check(i, self.length, PADDING) - return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens) + if self._py_tokens[i] is not None: + return self._py_tokens[i] + else: + return Token.cinit(self.vocab, &self.data[i], i, self) def __iter__(self): """Iterate over the tokens. @@ -119,7 +116,7 @@ cdef class Doc: token (Token): """ for i in range(self.length): - yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens) + yield Token.cinit(self.vocab, &self.data[i], i, self) def __len__(self): return self.length @@ -172,7 +169,6 @@ cdef class Doc: Yield a list of sentence Span objects, calculated from the dependency parse. """ cdef int i - cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:]) start = 0 for i in range(1, self.length): if self.data[i].sent_start: @@ -288,9 +284,10 @@ cdef class Doc: break else: return None + cdef unicode string = self.string # Get LexemeC for newly merged token cdef UniStr new_orth_c - slice_unicode(&new_orth_c, self._string, start_idx, end_idx) + slice_unicode(&new_orth_c, string, start_idx, end_idx) cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c) # House the new merged token where it starts cdef TokenC* token = &self.data[start] diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index 433804fc2..83664d02e 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -1,6 +1,7 @@ from ..vocab cimport Vocab from ..structs cimport TokenC from ..typedefs cimport attr_id_t +from .doc cimport Doc cdef class Token: @@ -8,25 +9,17 @@ cdef class Token: cdef const TokenC* c cdef readonly int i cdef int array_len - cdef bint _owns_c_data - cdef list _py_tokens + cdef readonly Doc doc @staticmethod - cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len, - list _py_tokens): - if offset < 0 or offset >= array_len: + cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc): + if offset < 0 or offset >= doc.length: msg = "Attempt to access token at %d, max length %d" - raise IndexError(msg % (offset, array_len)) - if _py_tokens[offset] != None: - return _py_tokens[offset] - cdef Token self = Token.__new__(Token, vocab) - self.c = token - self.i = offset - self.array_len = array_len - self._py_tokens = _py_tokens - self._py_tokens[offset] = self + raise IndexError(msg % (offset, doc.length)) + if doc._py_tokens[offset] != None: + return doc._py_tokens[offset] + cdef Token self = Token.__new__(Token, vocab, doc, offset) + doc._py_tokens[offset] = self return self - cdef int take_ownership_of_c_data(self) except -1 - cpdef bint check_flag(self, attr_id_t flag_id) except -1 diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 583a8c11f..17350501c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -21,14 +21,12 @@ cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created via Doc.__getitem__ and Doc.__iter__. """ - def __cinit__(self, Vocab vocab): + def __cinit__(self, Vocab vocab, Doc doc, int offset): self.vocab = vocab - self._py_tokens = [] - - def __dealloc__(self): - if self._owns_c_data: - # Cast through const, if we own the data - PyMem_Free(self.c) + self.doc = doc + self.c = &self.doc.data[offset] + self.i = offset + self.array_len = doc.length def __len__(self): return self.c.lex.length @@ -39,14 +37,8 @@ cdef class Token: cpdef bint check_flag(self, attr_id_t flag_id) except -1: return check_flag(self.c.lex, flag_id) - cdef int take_ownership_of_c_data(self) except -1: - owned_data = PyMem_Malloc(sizeof(TokenC) * self.array_len) - memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len) - self.c = owned_data - self._owns_c_data = True - def nbor(self, int i=1): - return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens) + return self.doc[self.i+i] property lex_id: def __get__(self): @@ -152,8 +144,7 @@ cdef class Token: ptr += ptr.head elif ptr + ptr.head == self.c: - yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), - self.array_len, self._py_tokens) + yield self.doc[ptr - (self.c - self.i)] ptr += 1 else: ptr += 1 @@ -171,8 +162,7 @@ cdef class Token: if (ptr.head < 0) and ((ptr + ptr.head) > self.c): ptr += ptr.head elif ptr + ptr.head == self.c: - tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), - self.array_len, self._py_tokens)) + tokens.append(self.doc[ptr - (self.c - self.i)]) ptr -= 1 else: ptr -= 1 @@ -195,21 +185,16 @@ cdef class Token: property left_edge: def __get__(self): - return Token.cinit(self.vocab, - (self.c - self.i) + self.c.l_edge, self.c.l_edge, - self.array_len, self._py_tokens) + return self.doc[self.c.l_edge] property right_edge: def __get__(self): - return Token.cinit(self.vocab, - (self.c - self.i) + self.c.r_edge, self.c.r_edge, - self.array_len, self._py_tokens) + return self.doc[self.c.r_edge] property head: def __get__(self): """The token predicted by the parser to be the head of the current token.""" - return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head, - self.array_len, self._py_tokens) + return self.doc[self.i + self.c.head] property conjuncts: def __get__(self):