Refactor lexeme mem passing (#12125)
* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

  Two methods calling _new_lexeme, get and get_by_orth, took mem arguments just
  to call the internal method. That's no longer necessary, so this cleans it up.

* prettier formatting

* Remove more unused mem args
parent 6348a7a4b4
commit de360bc981
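For orientation before the diff: the removed lines in the `Vocab._new_lexeme` hunk further down show that `mem = self.mem` was already being forced, so the caller-supplied pool arguments were dead weight and the user-visible ownership model does not change — lexemes created while a text is processed live in the shared `Vocab`, not in the `Doc`. A small runnable sketch of that invariant (assumes a local spaCy install; the token string is arbitrary):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("floobleblarg")              # an arbitrary novel token
assert "floobleblarg" in nlp.vocab     # its lexeme was stored in the shared Vocab
del doc                                # the lexeme does not die with the Doc
lex = nlp.vocab["floobleblarg"]        # still retrievable afterwards
print(lex.orth, lex.text)
```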
@@ -41,7 +41,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

@@ -37,7 +37,7 @@ cdef class Tokenizer:
                           bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, str span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(self, str string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes, int* has_special,
                             bint with_special_cases)
@@ -389,14 +389,14 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                             with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)

-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(self, str string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
@@ -419,7 +419,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -427,18 +427,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string

     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -465,11 +465,11 @@ cdef class Tokenizer:
             # We're always saying 'no' to spaces here -- the caller will
             # fix up the outermost one, with reference to the original.
             # See Issue #859
-            tokens.push_back(self.vocab.get(tokens.mem, string), False)
+            tokens.push_back(self.vocab.get(string), False)
         else:
             matches = self.find_infix(string)
             if not matches:
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 # Let's say we have dyn-o-mite-dave - the regex finds the
                 # start and end positions of the hyphens
@@ -484,7 +484,7 @@ cdef class Tokenizer:

                     if infix_start != start:
                         span = string[start:infix_start]
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)

                     if infix_start != infix_end:
                         # If infix_start != infix_end, it means the infix
@@ -492,11 +492,11 @@ cdef class Tokenizer:
                         # for tokenization in some languages (see
                         # https://github.com/explosion/spaCy/issues/768)
                         infix_span = string[infix_start:infix_end]
-                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                        tokens.push_back(self.vocab.get(infix_span), False)
                     start = infix_end
             span = string[start:]
             if span:
-                tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
@@ -266,12 +266,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1430,7 +1430,7 @@ cdef class Doc:
             end = start + attrs[i, 0]
             has_space = attrs[i, 1]
             orth_ = text[start:end]
-            lex = self.vocab.get(self.mem, orth_)
+            lex = self.vocab.get(orth_)
             self.push_back(lex, has_space)
             start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1536,7 +1536,7 @@ cdef class Doc:
             assert words == reconstructed_words

         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)

         # Set remaining token-level attributes via Doc.from_array().
@@ -223,7 +223,7 @@ def _merge(Doc doc, merges):
         if doc.vocab.vectors_length > 0:
             doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -359,7 +359,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg

-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL

-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

     cdef PreshMap _by_orth
@@ -139,7 +139,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id

-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -157,9 +157,9 @@ cdef class Vocab:
                                                   orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)

-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -171,21 +171,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
+            return self._new_lexeme(self.strings[orth])

-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        #if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None:
@@ -199,7 +188,6 @@ cdef class Vocab:
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
@@ -271,7 +259,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+            lex = <LexemeC*>self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
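Taken together, the `Vocab` hunks above collapse lexeme creation to a single path: allocate from the vocab's own pool, populate the struct, and register the entry unconditionally (the `is_oov` branch is gone). A minimal pure-Python model of that get-or-create behaviour, offered only to illustrate the control flow, not as spaCy's API:

```python
from dataclasses import dataclass


@dataclass
class Lex:
    orth: int
    length: int


class MiniVocab:
    """Toy model: the vocab owns all lexeme storage itself."""

    def __init__(self):
        self._strings = {}   # string -> orth id (stand-in for the StringStore)
        self._by_orth = {}   # orth id -> Lex (stand-in for the PreshMap)

    def _intern(self, string):
        return self._strings.setdefault(string, len(self._strings) + 1)

    def get(self, string):
        # No caller-supplied memory pool: look up, or create in our own storage.
        orth = self._intern(string)
        lex = self._by_orth.get(orth)
        if lex is None:
            lex = Lex(orth=orth, length=len(string))
            self._by_orth[orth] = lex   # always registered; no is_oov special case
        return lex


vocab = MiniVocab()
assert vocab.get("hello") is vocab.get("hello")   # repeat lookups return the same entry
```

Repeat lookups return the same stored entry, which is exactly the property the removed `Pool mem` arguments never influenced in practice.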
@@ -163,12 +163,11 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```

 | Name | Description |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
+| ----------- | ------------------------------------------------- |
 | `string` | The string of the word to look up. ~~str~~ |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |

@@ -184,8 +183,7 @@ vocabulary.
 > ```

 | Name | Description |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
+| ----------- | ------------------------------------------------------ |
 | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
