diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 73bf28dc2..e57098f17 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -41,7 +41,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = vocab.get_by_orth(vocab.mem, orth)
+        self.c = vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ff8d85ac7..8840b181c 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -419,7 +419,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -427,18 +427,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string
 
     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -465,11 +465,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -484,7 +484,7 @@ cdef class Tokenizer:
 
                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            tokens.push_back(self.vocab.get(span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -492,11 +492,11 @@ cdef class Tokenizer:
                             # for tokenization in some languages (see
                             # https://github.com/explosion/spaCy/issues/768)
                             infix_span = string[infix_start:infix_end]
-                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                            tokens.push_back(self.vocab.get(infix_span), False)
                             start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 25af6ca6a..2b3b83e6a 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -266,12 +266,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1430,7 +1430,7 @@ cdef class Doc:
                 end = start + attrs[i, 0]
                 has_space = attrs[i, 1]
                 orth_ = text[start:end]
-                lex = self.vocab.get(self.mem, orth_)
+                lex = self.vocab.get(orth_)
                 self.push_back(lex, has_space)
                 start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1536,7 +1536,7 @@ cdef class Doc:
            assert words == reconstructed_words
 
        for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
            self.push_back(lex, has_space)
 
        # Set remaining token-level attributes via Doc.from_array().
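The `tokenizer.pyx` and `doc.pyx` hunks above are purely mechanical: every call site drops the `Pool` argument, so lexemes are always allocated from the vocabulary's own memory. The public Python API is untouched; a quick sanity check (not part of the patch, assuming a standard spaCy install):

```python
# Doc construction routes through Vocab.get() internally; Python callers
# never passed a Pool, so behavior at this level is unchanged.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["hello", "world"], spaces=[True, False])
assert doc.text == "hello world"
assert doc[0].text == "hello"
```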
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index 29143bed3..8aef1d74f 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -223,7 +223,7 @@ def _merge(Doc doc, merges):
         if doc.vocab.vectors_length > 0:
             doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -359,7 +359,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 35cdc6503..2db709b71 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -35,8 +35,8 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, str string) except NULL
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 9fc49a5b8..a87f50ad4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -139,7 +139,6 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool.
-        If the pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary. The lexeme is stored in the lexicon's own memory.
@@ -159,7 +158,6 @@ cdef class Vocab:
         else:
             return self._new_lexeme(string)
 
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool.
-        If the pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary. The lexeme is stored in the lexicon's own memory.
@@ -259,7 +257,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = self.get_by_orth(self.mem, props[ORTH])
+            lex = self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx
index ce7c03940..a621d97be 100644
--- a/website/docs/api/cython-classes.mdx
+++ b/website/docs/api/cython-classes.mdx
@@ -163,12 +163,11 @@ vocabulary.
 
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```
 
 | Name        | Description                                                                                                |
 | ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
 | `string`    | The string of the word to look up. ~~str~~                                                                 |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
@@ -185,7 +184,6 @@ vocabulary.
 
 | Name        | Description                                                                                                |
 | ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
 | `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~                                                     |
 | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
 
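For downstream Cython code, migration amounts to dropping the pool argument from `Vocab.get`/`Vocab.get_by_orth` calls. A minimal sketch of the updated call pattern (not from the patch; the module and its `lookup_lexeme` helper are hypothetical, and build setup is omitted):

```python
# cython: language_level=3
# Hypothetical downstream .pyx module using the new signatures.
from spacy.structs cimport LexemeC
from spacy.vocab cimport Vocab

cdef const LexemeC* lookup_lexeme(Vocab vocab, str word) except NULL:
    # Before this change: vocab.get(vocab.mem, word). The lexeme is now
    # always created in, and owned by, the vocab's own memory.
    return vocab.get(word)
```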