* WordTree in use. Need to reform the way chunks are handled. There should properly be one Lexeme per word, with the split points being the things that are cached.

This commit is contained in:
Matthew Honnibal 2014-08-16 20:10:22 +02:00
parent 34b68a18ab
commit b94c9b72c9
2 changed files with 22 additions and 24 deletions

View File

@@ -29,20 +29,20 @@ from spacy._hashing cimport WordTree
cdef class Language: cdef class Language:
cdef object name cdef object name
cdef WordTree vocab cdef WordTree vocab
cdef Vocab* distri cdef WordTree distri
cdef Vocab* ortho cdef WordTree ortho
cdef dict bacov cdef dict bacov
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cdef Lexeme_addr lookup(self, unicode string) except 0 cdef Lexeme_addr lookup(self, unicode string) except 0
cdef Lexeme_addr lookup_chunk(self, unicode string) except 0 cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
cdef Orthography* lookup_orth(self, StringHash key, unicode lex) except NULL cdef Orthography* lookup_orth(self, unicode lex) except NULL
cdef Distribution* lookup_dist(self, StringHash key) except NULL cdef Distribution* lookup_dist(self, unicode lex) except NULL
cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL cdef Orthography* new_orth(self, unicode lex) except NULL
cdef Distribution* new_dist(self, StringHash key) except NULL cdef Distribution* new_dist(self, unicode lex) except NULL
cdef unicode unhash(self, StringHash hashed) cdef unicode unhash(self, StringHash hashed)

View File

@@ -65,10 +65,8 @@ cdef class Language:
self.name = name self.name = name
self.bacov = {} self.bacov = {}
self.vocab = WordTree(0, 5) self.vocab = WordTree(0, 5)
self.ortho = new Vocab() self.ortho = WordTree(0, 5)
self.distri = new Vocab() self.distri = WordTree(0, 5)
self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0)
self.load_tokenization(util.read_tokenization(name)) self.load_tokenization(util.read_tokenization(name))
cpdef Tokens tokenize(self, unicode characters): cpdef Tokens tokenize(self, unicode characters):
@@ -125,16 +123,16 @@ cdef class Language:
word_ptr = self.new_lexeme(string, string) word_ptr = self.new_lexeme(string, string)
return <Lexeme_addr>word_ptr return <Lexeme_addr>word_ptr
cdef Orthography* lookup_orth(self, StringHash hashed, unicode lex): cdef Orthography* lookup_orth(self, unicode lex):
cdef Orthography* orth = <Orthography*>self.ortho[0][hashed] cdef Orthography* orth = <Orthography*>self.ortho.get(lex)
if orth == NULL: if orth == NULL:
orth = self.new_orth(hashed, lex) orth = self.new_orth(lex)
return orth return orth
cdef Distribution* lookup_dist(self, StringHash hashed): cdef Distribution* lookup_dist(self, unicode lex):
cdef Distribution* dist = <Distribution*>self.distri[0][hashed] cdef Distribution* dist = <Distribution*>self.distri.get(lex)
if dist == NULL: if dist == NULL:
dist = self.new_dist(hashed) dist = self.new_dist(lex)
return dist return dist
cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL: cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
@@ -143,12 +141,12 @@ cdef class Language:
word.lex = hash(string) word.lex = hash(string)
self.bacov[word.lex] = string self.bacov[word.lex] = string
self.bacov[word.sic] = key self.bacov[word.sic] = key
word.orth = self.lookup_orth(word.lex, string) word.orth = self.lookup_orth(string)
word.dist = self.lookup_dist(word.lex) word.dist = self.lookup_dist(string)
self.vocab.set(key, <size_t>word) self.vocab.set(key, <size_t>word)
return word return word
cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL: cdef Orthography* new_orth(self, unicode lex) except NULL:
cdef unicode last3 cdef unicode last3
cdef unicode norm cdef unicode norm
cdef unicode shape cdef unicode shape
@@ -160,7 +158,7 @@ cdef class Language:
orth.length = length orth.length = length
orth.flags = set_orth_flags(lex, orth.length) orth.flags = set_orth_flags(lex, orth.length)
orth.norm = hashed orth.norm = hash(lex)
last3 = substr(lex, length - 3, length, length) last3 = substr(lex, length - 3, length, length)
orth.last3 = hash(last3) orth.last3 = hash(last3)
norm = get_normalized(lex, length) norm = get_normalized(lex, length)
@@ -172,12 +170,12 @@ cdef class Language:
self.bacov[orth.norm] = norm self.bacov[orth.norm] = norm
self.bacov[orth.shape] = shape self.bacov[orth.shape] = shape
self.ortho[0][hashed] = <size_t>orth self.ortho.set(lex, <size_t>orth)
return orth return orth
cdef Distribution* new_dist(self, StringHash hashed) except NULL: cdef Distribution* new_dist(self, unicode lex) except NULL:
dist = <Distribution*>calloc(1, sizeof(Distribution)) dist = <Distribution*>calloc(1, sizeof(Distribution))
self.distri[0][hashed] = <size_t>dist self.distri.set(lex, <size_t>dist)
return dist return dist
cdef unicode unhash(self, StringHash hash_value): cdef unicode unhash(self, StringHash hash_value):