From b94c9b72c91590023241aa6e09a7576abd2dc937 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Aug 2014 20:10:22 +0200 Subject: [PATCH] * WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. --- spacy/spacy.pxd | 12 ++++++------ spacy/spacy.pyx | 34 ++++++++++++++++------------------ 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index 65b31f176..3afc9a467 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -29,20 +29,20 @@ from spacy._hashing cimport WordTree cdef class Language: cdef object name cdef WordTree vocab - cdef Vocab* distri - cdef Vocab* ortho + cdef WordTree distri + cdef WordTree ortho cdef dict bacov cpdef Tokens tokenize(self, unicode text) cdef Lexeme_addr lookup(self, unicode string) except 0 cdef Lexeme_addr lookup_chunk(self, unicode string) except 0 - cdef Orthography* lookup_orth(self, StringHash key, unicode lex) except NULL - cdef Distribution* lookup_dist(self, StringHash key) except NULL + cdef Orthography* lookup_orth(self, unicode lex) except NULL + cdef Distribution* lookup_dist(self, unicode lex) except NULL cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL - cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL - cdef Distribution* new_dist(self, StringHash key) except NULL + cdef Orthography* new_orth(self, unicode lex) except NULL + cdef Distribution* new_dist(self, unicode lex) except NULL cdef unicode unhash(self, StringHash hashed) diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 301b9d412..d49138801 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -65,10 +65,8 @@ cdef class Language: self.name = name self.bacov = {} self.vocab = WordTree(0, 5) - self.ortho = new Vocab() - self.distri = new Vocab() - self.distri[0].set_empty_key(0) - self.ortho[0].set_empty_key(0) + self.ortho = WordTree(0, 5) + self.distri = WordTree(0, 5) self.load_tokenization(util.read_tokenization(name)) cpdef Tokens tokenize(self, unicode characters): @@ -125,16 +123,16 @@ cdef class Language: word_ptr = self.new_lexeme(string, string) return word_ptr - cdef Orthography* lookup_orth(self, StringHash hashed, unicode lex): - cdef Orthography* orth = self.ortho[0][hashed] + cdef Orthography* lookup_orth(self, unicode lex): + cdef Orthography* orth = self.ortho.get(lex) if orth == NULL: - orth = self.new_orth(hashed, lex) + orth = self.new_orth(lex) return orth - cdef Distribution* lookup_dist(self, StringHash hashed): - cdef Distribution* dist = self.distri[0][hashed] + cdef Distribution* lookup_dist(self, unicode lex): + cdef Distribution* dist = self.distri.get(lex) if dist == NULL: - dist = self.new_dist(hashed) + dist = self.new_dist(lex) return dist cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL: @@ -143,12 +141,12 @@ cdef class Language: word.lex = hash(string) self.bacov[word.lex] = string self.bacov[word.sic] = key - word.orth = self.lookup_orth(word.lex, string) - word.dist = self.lookup_dist(word.lex) + word.orth = self.lookup_orth(string) + word.dist = self.lookup_dist(string) self.vocab.set(key, word) - return word + return word - cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL: + cdef Orthography* new_orth(self, unicode lex) except NULL: cdef unicode last3 cdef unicode norm cdef unicode shape @@ -160,7 +158,7 @@ cdef class Language: orth.length = length orth.flags = set_orth_flags(lex, orth.length) - orth.norm = hashed + orth.norm = hash(lex) last3 = substr(lex, length - 3, length, length) orth.last3 = hash(last3) norm = get_normalized(lex, length) @@ -172,12 +170,12 @@ cdef class Language: self.bacov[orth.norm] = norm self.bacov[orth.shape] = shape - self.ortho[0][hashed] = orth + self.ortho.set(lex, orth) return orth - cdef Distribution* new_dist(self, StringHash hashed) except NULL: + cdef Distribution* new_dist(self, unicode lex) except NULL: dist = calloc(1, sizeof(Distribution)) - self.distri[0][hashed] = dist + self.distri.set(lex, dist) return dist cdef unicode unhash(self, StringHash hash_value):