mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
* Allow the lexicon to create lexemes using an external memory pool, so that it can decide to make some lexemes temporary, rather than cached
This commit is contained in:
parent
75b8dfb348
commit
187372c7f3
|
@ -18,7 +18,7 @@ cdef class Lexicon:
|
||||||
cpdef readonly StringStore strings
|
cpdef readonly StringStore strings
|
||||||
cdef vector[Lexeme*] lexemes
|
cdef vector[Lexeme*] lexemes
|
||||||
|
|
||||||
cdef const Lexeme* get(self, UniStr* s) except NULL
|
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
|
||||||
|
|
||||||
cdef PreshMap _map
|
cdef PreshMap _map
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ from preshed.maps cimport PreshMap
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport init as lexeme_init
|
from .lexeme cimport init as lexeme_init
|
||||||
|
from .lexeme cimport check_flag, IS_ALPHA
|
||||||
|
|
||||||
from .utf8string cimport slice_unicode
|
from .utf8string cimport slice_unicode
|
||||||
|
|
||||||
|
@ -53,7 +54,7 @@ cdef class Language:
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, py_string in enumerate(strings):
|
for i, py_string in enumerate(strings):
|
||||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||||
tokens.push_back(idx, self.lexicon.get(&string_struct))
|
tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct))
|
||||||
idx += len(py_string) + 1
|
idx += len(py_string) + 1
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -132,7 +133,7 @@ cdef class Language:
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
break
|
break
|
||||||
suf_len = self._find_suffix(string.chars, string.n)
|
suf_len = self._find_suffix(string.chars, string.n)
|
||||||
if suf_len != 0:
|
if suf_len != 0:
|
||||||
|
@ -141,18 +142,18 @@ cdef class Language:
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
break
|
break
|
||||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||||
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
elif pre_len:
|
elif pre_len:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
elif suf_len:
|
elif suf_len:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
if self._specials.get(string.key):
|
if self._specials.get(string.key):
|
||||||
break
|
break
|
||||||
return string
|
return string
|
||||||
|
@ -175,22 +176,25 @@ cdef class Language:
|
||||||
else:
|
else:
|
||||||
split = self._find_infix(string.chars, string.n)
|
split = self._find_infix(string.chars, string.n)
|
||||||
if split == 0 or split == -1:
|
if split == 0 or split == -1:
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(string))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string))
|
||||||
else:
|
else:
|
||||||
slice_unicode(&span, string.chars, 0, split)
|
slice_unicode(&span, string.chars, 0, split)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
slice_unicode(&span, string.chars, split, split+1)
|
slice_unicode(&span, string.chars, split, split+1)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
idx = tokens.push_back(idx, deref(it))
|
idx = tokens.push_back(idx, deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
|
||||||
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1:
|
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1:
|
||||||
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
|
||||||
cdef int i
|
cdef int i
|
||||||
|
for i in range(n):
|
||||||
|
if tokens[i].id == 1:
|
||||||
|
return 0
|
||||||
|
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
lexemes[i] = tokens[i]
|
lexemes[i] = tokens[i]
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
|
@ -230,7 +234,7 @@ cdef class Language:
|
||||||
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
slice_unicode(&string, substring, 0, len(substring))
|
slice_unicode(&string, substring, 0, len(substring))
|
||||||
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
|
lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
slice_unicode(&string, uni_string, 0, len(uni_string))
|
slice_unicode(&string, uni_string, 0, len(uni_string))
|
||||||
self._specials.set(string.key, lexemes)
|
self._specials.set(string.key, lexemes)
|
||||||
|
@ -247,23 +251,28 @@ cdef class Lexicon:
|
||||||
self._map = PreshMap(2 ** 20)
|
self._map = PreshMap(2 ** 20)
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.size = 1
|
self.size = 2
|
||||||
self.set_flags = set_flags
|
self.set_flags = set_flags
|
||||||
|
|
||||||
cdef const Lexeme* get(self, UniStr* string) except NULL:
|
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||||
'''Retrieve a pointer to a Lexeme from the lexicon.'''
|
'''Retrieve a pointer to a Lexeme from the lexicon.'''
|
||||||
cdef Lexeme* lex
|
cdef Lexeme* lex
|
||||||
lex = <Lexeme*>self._map.get(string.key)
|
lex = <Lexeme*>self._map.get(string.key)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
return lex
|
return lex
|
||||||
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
if string.n < 3:
|
||||||
|
mem = self.mem
|
||||||
|
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
||||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
|
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
|
||||||
self.strings, {'flags': self.set_flags(string.chars[:string.n])})
|
self.strings, {'flags': self.set_flags(string.chars[:string.n])})
|
||||||
|
if mem is self.mem:
|
||||||
self._map.set(string.key, lex)
|
self._map.set(string.key, lex)
|
||||||
while self.lexemes.size() < (lex.id + 1):
|
while self.lexemes.size() < (lex.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.lexemes[lex.id] = lex
|
self.lexemes[lex.id] = lex
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
else:
|
||||||
|
lex[0].id = 1
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
|
@ -290,7 +299,7 @@ cdef class Lexicon:
|
||||||
return self.lexemes.at(id_or_string)[0]
|
return self.lexemes.at(id_or_string)[0]
|
||||||
cdef UniStr string
|
cdef UniStr string
|
||||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||||
cdef const Lexeme* lexeme = self.get(&string)
|
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
||||||
return lexeme[0]
|
return lexeme[0]
|
||||||
|
|
||||||
def __setitem__(self, unicode uni_string, dict props):
|
def __setitem__(self, unicode uni_string, dict props):
|
||||||
|
@ -298,7 +307,7 @@ cdef class Lexicon:
|
||||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||||
# Cast through the const here, since we're allowed to change our own
|
# Cast through the const here, since we're allowed to change our own
|
||||||
# Lexemes.
|
# Lexemes.
|
||||||
lex = <Lexeme*><void*>self.get(&s)
|
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
||||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user