Refactor lexeme mem passing (#12125)

* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods that call _new_lexeme, namely get and get_by_orth, took mem
arguments only to forward them to that internal method. That's no longer
necessary, so this cleans them up.

* Prettier formatting

* Remove more unused mem args
Paul O'Leary McCann, 2023-01-25 12:50:21 +09:00, committed by GitHub
parent 6348a7a4b4
commit de360bc981
8 changed files with 41 additions and 56 deletions
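The change is confined to the Cython-level API; from Python, lexeme lookup behaves exactly as before. A minimal sanity-check sketch (hedged, assuming a blank English pipeline):

```python
# Sketch: lexeme lookups work the same from Python; only the internal
# C signatures lost their memory-pool argument.
import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["hello"]  # resolves via Vocab.get_by_orth internally
assert nlp.vocab[lex.orth].text == "hello"
```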


@@ -41,7 +41,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
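`Lexeme.__init__` is the public wrapper over this lookup; a hedged sketch of the constructor it serves:

```python
# Sketch: constructing a Lexeme now routes through vocab.get_by_orth(orth),
# with no memory-pool argument.
from spacy.vocab import Vocab
from spacy.lexeme import Lexeme

vocab = Vocab()
orth = vocab.strings.add("hello")  # intern the string, get its hash ID
lex = Lexeme(vocab, orth)
assert lex.text == "hello"
```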


@@ -37,7 +37,7 @@ cdef class Tokenizer:
                                      bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, str span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(self, str string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes, int* has_special,
                             bint with_special_cases)


@@ -389,14 +389,14 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                             with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)

-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(self, str string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
@@ -419,7 +419,7 @@ cdef class Tokenizer:
                     minus_pre = string[pre_len:]
                     if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                         string = minus_pre
-                        prefixes.push_back(self.vocab.get(mem, prefix))
+                        prefixes.push_back(self.vocab.get(prefix))
                         break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -427,18 +427,18 @@ cdef class Tokenizer:
                     minus_suf = string[:-suf_len]
                     if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                         string = minus_suf
-                        suffixes.push_back(self.vocab.get(mem, suffix))
+                        suffixes.push_back(self.vocab.get(suffix))
                         break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string

     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -465,11 +465,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -484,7 +484,7 @@ cdef class Tokenizer:
                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            tokens.push_back(self.vocab.get(span), False)

                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -492,11 +492,11 @@ cdef class Tokenizer:
                             # for tokenization in some languages (see
                             # https://github.com/explosion/spaCy/issues/768)
                             infix_span = string[infix_start:infix_end]
-                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                            tokens.push_back(self.vocab.get(infix_span), False)
                             start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
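The affix loop above is internal, but it is easy to exercise through the public tokenizer; a hedged sketch:

```python
# Sketch: punctuation around a word triggers _split_affixes, which now
# fetches prefix/suffix lexemes directly from the shared vocab.
import spacy

nlp = spacy.blank("en")
doc = nlp.tokenizer('("Hello!")')
print([t.text for t in doc])  # expected: ['(', '"', 'Hello', '!', '"', ')']
```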


@@ -266,12 +266,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1430,7 +1430,7 @@ cdef class Doc:
                 end = start + attrs[i, 0]
                 has_space = attrs[i, 1]
                 orth_ = text[start:end]
-                lex = self.vocab.get(self.mem, orth_)
+                lex = self.vocab.get(orth_)
                 self.push_back(lex, has_space)
                 start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1536,7 +1536,7 @@ cdef class Doc:
             assert words == reconstructed_words

         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)

         # Set remaining token-level attributes via Doc.from_array().
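Both hunks sit on the `Doc` construction and deserialization paths; a hedged sketch of the words-based constructor they serve:

```python
# Sketch: Doc.__init__ resolves each word via vocab.get(word) (strings)
# or vocab.get_by_orth(word) (hash IDs), with no pool argument.
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"], spaces=[True, False])
assert doc.text == "hello world"
```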


@@ -223,7 +223,7 @@ def _merge(Doc doc, merges):
         if doc.vocab.vectors_length > 0:
             doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -359,7 +359,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
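`_merge` and `_split` back the retokenizer context manager, so the change is only observable through it; a hedged sketch of the merge path:

```python
# Sketch: merging a span triggers _merge, which now looks up the combined
# lexeme with doc.vocab.get(new_orth).
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is a city")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])  # "New" + "York" -> one token
assert doc[0].text == "New York"
```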


@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg

-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL

-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

     cdef PreshMap _by_orth


@@ -139,7 +139,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id

-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -157,9 +157,9 @@ cdef class Vocab:
                                                   orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)

-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -171,21 +171,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
+            return self._new_lexeme(self.strings[orth])

-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        #if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None:
@@ -199,7 +188,6 @@ cdef class Vocab:
                 value = self.strings.add(value)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
@@ -271,7 +259,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+            lex = <LexemeC*>self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
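Since `_new_lexeme` now always allocates from the vocab's own pool, every lexeme it creates is registered in the lexicon; the old doc-owned OOV path is gone. A hedged sketch of the observable effect:

```python
# Sketch: looking up an unseen string now always grows the shared lexicon.
import spacy

nlp = spacy.blank("en")
before = len(nlp.vocab)
_ = nlp.vocab["frobnicate"]  # reaches _new_lexeme via get_by_orth
assert len(nlp.vocab) == before + 1
```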


@@ -163,12 +163,11 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```

 | Name        | Description                                       |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~  |
+| ----------- | ------------------------------------------------- |
 | `string`    | The string of the word to look up. ~~str~~        |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
@@ -184,8 +183,7 @@ vocabulary.
 > ```

 | Name        | Description                                             |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~  |
+| ----------- | ------------------------------------------------------- |
 | `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~  |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~       |
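For symmetry with the `get` example, a hedged sketch of the updated `get_by_orth` call. Like the docs example above, this is Cython-level code (`get_by_orth` is a `cdef` method, callable only from Cython), and the string-to-ID lookup shown is illustrative:

```python
# Cython-level sketch: resolve a lexeme by orth ID, no pool argument.
orth = vocab.strings["hello"]     # 64-bit hash ID of the interned string
lexeme = vocab.get_by_orth(orth)  # was: vocab.get_by_orth(vocab.mem, orth)
```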