Remove lexemes vector from Lexicon, and the id and hash attributes from Lexeme

parent 889b7b48b4
commit e6b87766fe
@@ -27,8 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
     cpdef readonly StringStore strings
 
-    cdef vector[Lexeme*] lexemes
-
     cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL
 
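For orientation, here is a minimal plain-Python sketch of the layout this hunk leaves behind: lexemes are reachable only through a single hash-keyed table, with no parallel vector and no per-lexeme id/hash fields. The class and names below are hypothetical stand-ins, not the actual Cython implementation.

# Hypothetical plain-Python stand-in for the post-change Lexicon layout.
class LexiconSketch:
    def __init__(self):
        self._dict = {}   # string hash -> lexeme record (stands in for the PreshMap)
        self.size = 0

    def get(self, key, props):
        # Get-or-create, keyed solely by the string's hash.
        lex = self._dict.get(key)
        if lex is None:
            lex = dict(props)          # stands in for lexeme_init(...)
            self._dict[key] = lex
            self.size += 1
        return lex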
@@ -45,8 +45,9 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
-        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
-        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
+        if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
+            self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+            self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
@@ -240,18 +241,16 @@ cdef class Lexicon:
         for py_string, lexeme_dict in lexemes.iteritems():
             string_from_unicode(&string, py_string)
             lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
-                                    self.strings, lexeme_dict)
-            self._dict.set(lexeme.hash, lexeme)
-            self.lexemes.push_back(lexeme)
+            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
+                                    lexeme_dict)
+            self._dict.set(string.key, lexeme)
             self.size += 1
 
     def set(self, unicode py_string, dict lexeme_dict):
         cdef String string
         string_from_unicode(&string, py_string)
         cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
-                             self.strings, lexeme_dict)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)
 
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
@@ -259,10 +258,8 @@ cdef class Lexicon:
         if lex != NULL:
             return lex
         lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
-                             self.strings, {})
-        self._dict.set(lex.hash, lex)
-        self.lexemes.push_back(lex)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, {})
+        self._dict.set(string.key, lex)
         self.size += 1
         return lex
 
@@ -287,8 +284,15 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size-1):
-            st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
+        cdef hash_t key
+        for i in range(self._dict.length):
+            key = self._dict.c_map.cells[i].key
+            if key == 0:
+                continue
+            lexeme = <Lexeme*>self._dict.c_map.cells[i].value
+            st = fwrite(&key, sizeof(key), 1, fp)
+            assert st == 1
+            st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
         assert st == 0
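The dump hunk above changes the on-disk format: instead of writing the lexemes vector in insertion order, every occupied hash-table cell is written as the key followed immediately by the fixed-size Lexeme record. The Python sketch below is illustrative only; the function name, the mapping argument, and the 8-byte native-order packing are assumptions, not the Cython code (which writes the key with fwrite).

# Hypothetical sketch of the new dump format: (key, record) pairs, one per
# occupied cell; empty cells (key == 0) are skipped.
import struct

def dump_sketch(loc, entries, record_size):
    # entries: mapping of integer hash key -> raw record bytes of length record_size
    with open(loc, 'wb') as fp:
        for key, record in entries.items():
            if key == 0:
                continue                       # skip empty cells
            assert len(record) == record_size
            fp.write(struct.pack('=Q', key))   # 8-byte key, native order assumed
            fp.write(record)                   # fixed-size Lexeme record bytes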
@@ -300,14 +304,17 @@ cdef class Lexicon:
         assert fp != NULL
         cdef size_t st
         cdef Lexeme* lexeme
+        cdef hash_t key
         i = 0
         while True:
+            st = fread(&key, sizeof(key), 1, fp)
+            if st != 1:
+                break
             lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
             st = fread(lexeme, sizeof(Lexeme), 1, fp)
             if st != 1:
                 break
-            self.lexemes.push_back(lexeme)
-            self._dict.set(lexeme.hash, lexeme)
+            self._dict.set(key, lexeme)
             i += 1
         print "Load %d lexemes" % i
         fclose(fp)
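The load loop mirrors the new dump format: read a key, then the fixed-size record, and stop as soon as a read comes up short (the `if st != 1: break` checks). A matching hypothetical Python sketch, with the same assumed packing as the dump sketch above:

# Hypothetical sketch of the matching load loop for the format written above.
import struct

def load_sketch(loc, record_size):
    entries = {}
    with open(loc, 'rb') as fp:
        while True:
            raw_key = fp.read(8)
            if len(raw_key) != 8:
                break                          # mirrors `if st != 1: break`
            record = fp.read(record_size)
            if len(record) != record_size:
                break
            key = struct.unpack('=Q', raw_key)[0]
            entries[key] = record              # mirrors self._dict.set(key, lexeme)
    return entries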
@@ -23,8 +23,6 @@ cpdef enum:
 
 
 cdef struct Lexeme:
-    hash_t hash
-    atom_t i
     atom_t length
 
     atom_t sic
@@ -46,7 +44,7 @@ cdef struct Lexeme:
 
 cdef Lexeme EMPTY_LEXEME
 
-cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
+cpdef Lexeme init(unicode string, hash_t hashed,
                   StringStore store, dict props) except *
 
 
@@ -26,11 +26,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
     return flags
 
 
-cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
+cpdef Lexeme init(unicode string, hash_t hashed,
                   StringStore store, dict props) except *:
     cdef Lexeme lex
-    lex.hash = hashed
-    lex.i = i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)
 
@@ -128,7 +128,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1
 
 
 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
-    atoms[0] = lex.i
+    atoms[0] = lex.sic
     atoms[1] = lex.cluster
     atoms[2] = lex.norm
     atoms[3] = lex.shape
@@ -31,7 +31,6 @@ cdef class Token:
     cdef public int idx
     cdef public int pos
 
-    cdef public atom_t id
     cdef public atom_t cluster
     cdef public atom_t length
    cdef public atom_t lex_pos
@@ -107,7 +107,6 @@ cdef class Token:
         self.idx = idx
         self.pos = pos
 
-        self.id = lex['i']
         self.cluster = lex['cluster']
         self.length = lex['length']
         self.lex_pos = lex['pos']