* WordTree in use. Need to rework the way chunks are handled: there should properly be one Lexeme per word, with the split points being the things that are cached.

parent 34b68a18ab
commit b94c9b72c9
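The diff exercises a small string-keyed map API on WordTree: a WordTree(0, 5) constructor, get(unicode) returning a size_t, and set(unicode, size_t). The declaration itself lives in spacy/_hashing.pxd and is not part of this commit, so the following interface sketch is reconstructed from the call sites; the field names and the meaning of the constructor arguments are assumptions.

# Reconstructed sketch of the WordTree interface the call sites below
# assume (spacy/_hashing.pxd is not shown in this commit; field names
# and constructor semantics are guesses, not the real declaration).
cdef class WordTree:
    cdef size_t default_value   # assumed: returned by get() on a miss (the 0)
    cdef int depth              # assumed: split/fan-out depth (the 5)

    cdef size_t get(self, unicode string)
    cdef int set(self, unicode string, size_t value) except -1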
@@ -29,20 +29,20 @@ from spacy._hashing cimport WordTree
 cdef class Language:
     cdef object name
     cdef WordTree vocab
-    cdef Vocab* distri
-    cdef Vocab* ortho
+    cdef WordTree distri
+    cdef WordTree ortho
     cdef dict bacov

     cpdef Tokens tokenize(self, unicode text)

     cdef Lexeme_addr lookup(self, unicode string) except 0
     cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
-    cdef Orthography* lookup_orth(self, StringHash key, unicode lex) except NULL
-    cdef Distribution* lookup_dist(self, StringHash key) except NULL
+    cdef Orthography* lookup_orth(self, unicode lex) except NULL
+    cdef Distribution* lookup_dist(self, unicode lex) except NULL

     cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
-    cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL
-    cdef Distribution* new_dist(self, StringHash key) except NULL
+    cdef Orthography* new_orth(self, unicode lex) except NULL
+    cdef Distribution* new_dist(self, unicode lex) except NULL

     cdef unicode unhash(self, StringHash hashed)

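The practical effect of the header change: ortho and distri stop being raw C++ pointers indexed by a precomputed hash and become WordTree instances keyed by the string itself. A minimal before/after sketch of the access pattern, using the names from the diff:

# Before: Vocab* is a raw pointer to a hash-keyed table, so every read
# dereferences it with [0] and needs the caller to supply the hash.
orth_addr = self.ortho[0][hashed]     # old pattern

# After: WordTree is keyed by the unicode string directly; any hashing
# or splitting happens inside the tree.
orth_addr = self.ortho.get(lex)       # new pattern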
@@ -65,10 +65,8 @@ cdef class Language:
         self.name = name
         self.bacov = {}
         self.vocab = WordTree(0, 5)
-        self.ortho = new Vocab()
-        self.distri = new Vocab()
-        self.distri[0].set_empty_key(0)
-        self.ortho[0].set_empty_key(0)
+        self.ortho = WordTree(0, 5)
+        self.distri = WordTree(0, 5)
         self.load_tokenization(util.read_tokenization(name))

     cpdef Tokens tokenize(self, unicode characters):
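set_empty_key is the mandatory initialisation step of Google's dense_hash_map, which must reserve one key value as an internal sentinel before any insert. Its deletion here strongly suggests Vocab was a typedef over dense_hash_map; WordTree needs no reserved key. A sketch of the old setup being replaced (the typedef and the StringHash width are assumptions, as neither appears in this hunk):

# Assumed old declarations (not shown in this diff): Vocab as a
# dense_hash_map from string hashes to addresses.  dense_hash_map is
# cimported from a sparsehash wrapper elsewhere in the package; it
# requires set_empty_key() before use, and that key (0 here) can then
# never be stored as a real hash -- a constraint WordTree removes.
from libc.stdint cimport uint64_t
ctypedef uint64_t StringHash                       # assumed width
ctypedef dense_hash_map[StringHash, size_t] Vocab  # assumed typedef

cdef Vocab* ortho = new Vocab()
ortho[0].set_empty_key(0)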
@@ -125,16 +123,16 @@ cdef class Language:
             word_ptr = self.new_lexeme(string, string)
         return <Lexeme_addr>word_ptr

-    cdef Orthography* lookup_orth(self, StringHash hashed, unicode lex):
-        cdef Orthography* orth = <Orthography*>self.ortho[0][hashed]
+    cdef Orthography* lookup_orth(self, unicode lex):
+        cdef Orthography* orth = <Orthography*>self.ortho.get(lex)
         if orth == NULL:
-            orth = self.new_orth(hashed, lex)
+            orth = self.new_orth(lex)
         return orth

-    cdef Distribution* lookup_dist(self, StringHash hashed):
-        cdef Distribution* dist = <Distribution*>self.distri[0][hashed]
+    cdef Distribution* lookup_dist(self, unicode lex):
+        cdef Distribution* dist = <Distribution*>self.distri.get(lex)
         if dist == NULL:
-            dist = self.new_dist(hashed)
+            dist = self.new_dist(lex)
         return dist

     cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
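The if orth == NULL miss check only works because of how the trees were constructed: WordTree(0, 5) plausibly sets 0 as the default value get() returns for an absent key, and casting size_t 0 to a pointer yields NULL. Spelled out, with the default-value reading of the constructor as an assumption:

# Why "== NULL" detects a miss: get() returns the tree's default value
# (assumed to be the 0 passed to WordTree(0, 5)) for unseen keys, and
# <Orthography*>0 is the NULL pointer.
cdef size_t addr = self.ortho.get(lex)    # 0 when lex is absent
cdef Orthography* orth = <Orthography*>addr
if orth == NULL:                          # equivalent to addr == 0
    orth = self.new_orth(lex)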
@@ -143,12 +141,12 @@ cdef class Language:
         word.lex = hash(string)
         self.bacov[word.lex] = string
         self.bacov[word.sic] = key
-        word.orth = self.lookup_orth(word.lex, string)
-        word.dist = self.lookup_dist(word.lex)
+        word.orth = self.lookup_orth(string)
+        word.dist = self.lookup_dist(string)
         self.vocab.set(key, <size_t>word)
         return word

-    cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL:
+    cdef Orthography* new_orth(self, unicode lex) except NULL:
         cdef unicode last3
         cdef unicode norm
         cdef unicode shape
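new_lexeme keeps feeding bacov, the reverse vocabulary ("vocab" spelled backwards) that unhash reads: every StringHash stored on the Lexeme gets a hash-to-unicode entry so it can be turned back into text later. The round-trip looks like this, with unhash's body assumed from its declaration (only the signature appears in this commit):

# Forward mapping (from new_lexeme above): hash the string and record
# the reverse entry so the hash can be turned back into text.
word.lex = hash(string)
self.bacov[word.lex] = string

# Backward mapping: unhash presumably just reads the dict back.
# (Assumed body -- not shown in this diff.)
#     cdef unicode unhash(self, StringHash hash_value):
#         return self.bacov[hash_value]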
@@ -160,7 +158,7 @@ cdef class Language:

         orth.length = length
         orth.flags = set_orth_flags(lex, orth.length)
-        orth.norm = hashed
+        orth.norm = hash(lex)
         last3 = substr(lex, length - 3, length, length)
         orth.last3 = hash(last3)
         norm = get_normalized(lex, length)
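last3 depends on a four-argument substr(lex, start, end, length) helper defined elsewhere in the package. From the way it is called, with start going negative for words shorter than three characters, it presumably clamps the slice to the string bounds; a sketch under that assumption:

# Assumed behaviour of the substr helper (its real definition is not in
# this diff): clamp [start, end) to the string, so last3 of a word
# shorter than three characters is just the whole word.
cdef unicode substr(unicode s, int start, int end, int length):
    if start < 0:
        start = 0
    if end > length:
        end = length
    return s[start:end]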
@@ -172,12 +170,12 @@ cdef class Language:
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape

-        self.ortho[0][hashed] = <size_t>orth
+        self.ortho.set(lex, <size_t>orth)
         return orth

-    cdef Distribution* new_dist(self, StringHash hashed) except NULL:
+    cdef Distribution* new_dist(self, unicode lex) except NULL:
         dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.distri[0][hashed] = <size_t>dist
+        self.distri.set(lex, <size_t>dist)
         return dist

     cdef unicode unhash(self, StringHash hash_value):
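One thing the move to WordTree does not change: values are calloc'd C structs stored as raw size_t addresses, so the trees hold addresses rather than owned objects, and nothing in this file frees them. new_dist shows the pattern all three new_* helpers share:

# Shared pattern of the new_* helpers: heap-allocate the struct, stash
# its address in the tree as a size_t, return the typed pointer.  The
# tree never frees these entries; as of this commit their lifetime is
# tied to the Language object only by convention.
dist = <Distribution*>calloc(1, sizeof(Distribution))
self.distri.set(lex, <size_t>dist)
return dist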