* Restore lexemes field to lexicon

This commit is contained in:
Matthew Honnibal 2014-10-31 17:43:25 +11:00
parent 6c807aa45f
commit a8ca078b24
2 changed files with 16 additions and 4 deletions

View File

@ -26,6 +26,7 @@ cdef class Lexicon:
cdef Pool mem cdef Pool mem
cpdef readonly size_t size cpdef readonly size_t size
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
cdef Lexeme* get(self, String* s) except NULL cdef Lexeme* get(self, String* s) except NULL

View File

@ -16,6 +16,7 @@ from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init from .lexeme cimport init as lexeme_init
from . import util from . import util
@ -224,6 +225,7 @@ cdef class Lexicon:
self.mem = Pool() self.mem = Pool()
self._dict = PreshMap(2 ** 20) self._dict = PreshMap(2 ** 20)
self.strings = StringStore() self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
self.size = 1 self.size = 1
cdef Lexeme* get(self, String* string) except NULL: cdef Lexeme* get(self, String* string) except NULL:
@ -232,14 +234,19 @@ cdef class Lexicon:
if lex != NULL: if lex != NULL:
return lex return lex
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1) lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, {}) lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
self._dict.set(string.key, lex) self._dict.set(string.key, lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
self.size += 1 self.size += 1
return lex return lex
def __getitem__(self, unicode uni_string): def __getitem__(self, id_or_string):
if type(id_or_string) == int:
return self.lexemes.at(id_or_string)[0]
cdef String string cdef String string
string_from_unicode(&string, uni_string) string_from_unicode(&string, id_or_string)
cdef Lexeme* lexeme = self.get(&string) cdef Lexeme* lexeme = self.get(&string)
return lexeme[0] return lexeme[0]
@ -247,7 +254,7 @@ cdef class Lexicon:
cdef String s cdef String s
string_from_unicode(&s, uni_string) string_from_unicode(&s, uni_string)
cdef Lexeme* lex = self.get(&s) cdef Lexeme* lex = self.get(&s)
lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):
@ -287,7 +294,11 @@ cdef class Lexicon:
if st != 1: if st != 1:
break break
self._dict.set(key, lexeme) self._dict.set(key, lexeme)
while self.lexemes.size() < (lexeme.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lexeme.id] = lexeme
i += 1 i += 1
self.size += 1
fclose(fp) fclose(fp)