Changes to Doc and Token for new string store scheme

This commit is contained in:
Matthew Honnibal 2016-09-30 20:00:21 +02:00
parent 78f19baafa
commit 99de44d864
2 changed files with 18 additions and 10 deletions

View File

@@ -133,6 +133,12 @@ cdef class Doc:
self.push_back(
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
def __dealloc__(self):
if self.mem is not None \
and self.vocab is not None \
and self.vocab.strings is not None:
self.vocab.strings.remove_oov_map(self.mem)
def __getitem__(self, object i):
'''
doc[i]
@@ -600,14 +606,14 @@ cdef class Doc:
if tag in self.vocab.morphology.tag_map:
self.vocab.morphology.assign_tag(token, tag)
else:
token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma]
token.tag = self.vocab.strings.intern(tag)
token.lemma = self.vocab.strings.intern(lemma, mem=self.mem)
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
token.ent_type = self.vocab.strings.intern(ent_type)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug.
@@ -438,19 +440,19 @@ cdef class Token:
property orth_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.doc.mem)
property lower_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.doc.mem)
property norm_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.doc.mem)
property shape_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.doc.mem)
property prefix_:
def __get__(self):
@@ -462,11 +464,11 @@ cdef class Token:
property lang_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.doc.mem)
property lemma_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem)
return self.vocab.strings.decode_int(self.c.lemma, mem=self.doc.mem)
property pos_:
def __get__(self):
@@ -474,7 +476,7 @@ cdef class Token:
property tag_:
def __get__(self):
return self.vocab.strings.decode_int(self.c.tag, mem=self.mem)
return self.vocab.strings.decode_int(self.c.tag, mem=self.doc.mem)
property dep_:
def __get__(self):