From 99de44d8644729f170301883d7ebc4164cd2db13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Sep 2016 20:00:21 +0200 Subject: [PATCH] Changes to Doc and Token for new string store scheme --- spacy/tokens/doc.pyx | 12 +++++++++--- spacy/tokens/token.pyx | 16 +++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66654482e..09d8a439d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -132,6 +132,12 @@ cdef class Doc: # must be created. self.push_back( self.vocab.get(self.mem, orth), has_space) + + def __dealloc__(self): + if self.mem is not None \ + and self.vocab is not None \ + and self.vocab.strings is not None: + self.vocab.strings.remove_oov_map(self.mem) def __getitem__(self, object i): ''' @@ -600,14 +606,14 @@ cdef class Doc: if tag in self.vocab.morphology.tag_map: self.vocab.morphology.assign_tag(token, tag) else: - token.tag = self.vocab.strings[tag] - token.lemma = self.vocab.strings[lemma] + token.tag = self.vocab.strings.intern(tag) + token.lemma = self.vocab.strings.intern(lemma, mem=self.mem) if ent_type == 'O': token.ent_iob = 2 token.ent_type = 0 else: token.ent_iob = 3 - token.ent_type = self.vocab.strings[ent_type] + token.ent_type = self.vocab.strings.intern(ent_type) # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a dependency diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 9320cb85a..50a62fb0b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. @@ -438,19 +440,19 @@ cdef class Token: property orth_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.doc.mem) property lower_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.doc.mem) property norm_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.doc.mem) property shape_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.doc.mem) property prefix_: def __get__(self): @@ -462,11 +464,11 @@ cdef class Token: property lang_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.doc.mem) property lemma_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem) + return self.vocab.strings.decode_int(self.c.lemma, mem=self.doc.mem) property pos_: def __get__(self): @@ -474,7 +476,7 @@ cdef class Token: property tag_: def __get__(self): - return self.vocab.strings.decode_int(self.c.tag, mem=self.mem) + return self.vocab.strings.decode_int(self.c.tag, mem=self.doc.mem) property dep_: def __get__(self):