mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-05 09:44:12 +03:00
Integrate memory zones into v3.x
This commit is contained in:
parent
319e02545c
commit
3c86cc669a
|
@ -25,5 +25,10 @@ cdef class StringStore:
|
|||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
|
||||
|
||||
cdef vector[hash_t] _transient_keys
|
||||
cdef PreshMap _transient_map
|
||||
cdef Pool _non_temp_mem
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
# cython: infer_types=True
|
||||
# cython: profile=False
|
||||
cimport cython
|
||||
from typing import Iterable, Iterator, List, Optional, Tuple, Union
|
||||
from contextlib import contextmanager
|
||||
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from murmurhash.mrmr cimport hash32, hash64
|
||||
|
@ -119,7 +122,9 @@ cdef class StringStore:
|
|||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._non_temp_mem = self.mem
|
||||
self._map = PreshMap()
|
||||
self._transient_map = None
|
||||
if strings is not None:
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
|
@ -152,10 +157,13 @@ cdef class StringStore:
|
|||
return SYMBOLS_BY_INT[str_hash]
|
||||
else:
|
||||
utf8str = <Utf8Str*>self._map.get(str_hash)
|
||||
if utf8str is NULL and self._transient_map is not None:
|
||||
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
|
||||
else:
|
||||
# TODO: Raise an error instead
|
||||
utf8str = <Utf8Str*>self._map.get(string_or_id)
|
||||
|
||||
if utf8str is NULL and self._transient_map is not None:
|
||||
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(Errors.E018.format(hash_value=string_or_id))
|
||||
else:
|
||||
|
@ -175,10 +183,46 @@ cdef class StringStore:
|
|||
else:
|
||||
return self[key]
|
||||
|
||||
def add(self, string):
|
||||
def __reduce__(self):
    """Pickle support: rebuild the store from its permanent strings.

    Only the strings yielded by `non_transient_keys()` are passed to the
    constructor.
    """
    permanent_strings = [string for string in self.non_transient_keys()]
    return (StringStore, (permanent_strings,), None, None, None)
|
||||
|
||||
def __len__(self) -> int:
    """The number of strings in the store.

    Counts the permanent strings plus any transient strings currently held
    in the transient key vector.

    RETURNS (int): The number of strings in the store.
    """
    # The permanent key vector is declared as `keys` on this class (there
    # is no `_keys` attribute), matching every other method that reads it.
    return self.keys.size() + self._transient_keys.size()
|
||||
|
||||
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
    """Begin a block where all resources allocated during the block will
    be freed at the end of it. If a resource was created within the
    memory zone block, accessing it outside the block is invalid.
    Behaviour of this invalid access is undefined. Memory zones should
    not be nested.

    The memory zone is helpful for services that need to process large
    volumes of text with a defined memory budget.

    mem (Optional[Pool]): The pool to use while the zone is active. A new
        pool is created if none is given.
    YIELDS (Pool): The pool in use for the duration of the zone.
    """
    if mem is None:
        mem = Pool()
    self.mem = mem
    self._transient_map = PreshMap()
    try:
        yield mem
    finally:
        # Always leave the zone, even if the body of the `with` block
        # raised. Otherwise the store would keep allocating from the
        # temporary pool and keep serving transient entries.
        self.mem = self._non_temp_mem
        self._transient_map = None
        self._transient_keys.clear()
|
||||
|
||||
def add(self, string: str, allow_transient: bool = False) -> int:
|
||||
"""Add a string to the StringStore.
|
||||
|
||||
string (str): The string to add.
|
||||
allow_transient (bool): Allow the string to be stored in the 'transient'
|
||||
map, which will be flushed at the end of the memory zone. Strings
|
||||
encountered during arbitrary text processing should be added
|
||||
with allow_transient=True, while labels and other strings used
|
||||
internally should not.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
cdef hash_t str_hash
|
||||
|
@ -188,22 +232,26 @@ cdef class StringStore:
|
|||
|
||||
string = string.encode("utf8")
|
||||
str_hash = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string), &str_hash)
|
||||
self._intern_utf8(string, len(string), &str_hash, allow_transient)
|
||||
elif isinstance(string, bytes):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
str_hash = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string), &str_hash)
|
||||
self._intern_utf8(string, len(string), &str_hash, allow_transient)
|
||||
else:
|
||||
raise TypeError(Errors.E017.format(value_type=type(string)))
|
||||
return str_hash
|
||||
|
||||
def __len__(self):
|
||||
"""The number of strings in the store.
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
else:
|
||||
return self._intern_str(string, allow_transient)
|
||||
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.keys.size()
|
||||
return self.keys.size() + self._transient_keys.size()
|
||||
|
||||
def __contains__(self, string_or_id not None):
|
||||
"""Check whether a string or ID is in the store.
|
||||
|
@ -222,30 +270,70 @@ cdef class StringStore:
|
|||
pass
|
||||
else:
|
||||
# TODO: Raise an error instead
|
||||
return self._map.get(string_or_id) is not NULL
|
||||
|
||||
if self._map.get(string_or_id) is not NULL:
|
||||
return True
|
||||
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
if str_hash < len(SYMBOLS_BY_INT):
|
||||
return True
|
||||
else:
|
||||
return self._map.get(str_hash) is not NULL
|
||||
if self._map.get(str_hash) is not NULL:
|
||||
return True
|
||||
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def __iter__(self):
    """Iterate over every string in the store.

    Permanent strings are produced first, followed by transient strings.

    YIELDS (str): A string in the store.
    """
    for key_source in (self.non_transient_keys, self.transient_keys):
        yield from key_source()
|
||||
|
||||
def non_transient_keys(self) -> Iterator[str]:
    """Iterate over the permanently stored strings in insertion order.

    Only keys recorded in the permanent key vector are visited; strings
    held in the transient map are not included.

    YIELDS (str): A string from the permanent map.
    """
    cdef int i
    cdef hash_t key
    for i in range(self.keys.size()):
        key = self.keys[i]
        # Look the hash up in the permanent map and decode the stored
        # UTF-8 struct back into a Python string.
        utf8str = <Utf8Str*>self._map.get(key)
        yield decode_Utf8Str(utf8str)
    # TODO: Iterate OOV here?
|
||||
|
||||
def __reduce__(self):
    """Pickle support: rebuild the store from its permanent strings.

    Transient strings are deliberately excluded. Passing them to the
    constructor would re-add them through `add()` with its default
    `allow_transient=False`, turning them into permanent entries.

    RETURNS (tuple): The `(callable, args, state, listitems, dictitems)`
        tuple used by the pickle protocol.
    """
    # NOTE(review): this class appears to define __reduce__ twice in this
    # revision; the later definition wins, so keep both in sync or remove
    # one of them.
    strings = list(self.non_transient_keys())
    return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def transient_keys(self) -> Iterator[str]:
    """Iterate over the transient strings in insertion order.

    Yields nothing when there is no transient map.

    YIELDS (str): A string from the transient map.
    """
    if self._transient_map is None:
        # Bare return: this is a generator, so a returned value would be
        # discarded by anything iterating over it.
        return
    for i in range(self._transient_keys.size()):
        utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
        yield decode_Utf8Str(utf8str)
|
||||
|
||||
def values(self) -> List[int]:
    """Return the hashes of the stored strings in insertion order.

    Hashes of permanent strings come first, followed by the hashes of
    transient strings when a transient map is present.

    RETURNS (List[int]): A list of string hashes.
    """
    # The permanent key vector is declared as `keys` on this class (there
    # is no `_keys` attribute), matching every other method that reads it.
    hashes = [self.keys[i] for i in range(self.keys.size())]
    if self._transient_map is not None:
        hashes += [
            self._transient_keys[i]
            for i in range(self._transient_keys.size())
        ]
    return hashes
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
@ -269,7 +357,7 @@ cdef class StringStore:
|
|||
prev = list(self)
|
||||
self._reset_and_load(strings)
|
||||
for word in prev:
|
||||
self.add(word)
|
||||
self.add(word, allow_transient=False)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
|
@ -289,7 +377,7 @@ cdef class StringStore:
|
|||
prev = list(self)
|
||||
self._reset_and_load(strings)
|
||||
for word in prev:
|
||||
self.add(word)
|
||||
self.add(word, allow_transient=False)
|
||||
return self
|
||||
|
||||
def _reset_and_load(self, strings):
|
||||
|
@ -297,22 +385,34 @@ cdef class StringStore:
|
|||
self._map = PreshMap()
|
||||
self.keys.clear()
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
self.add(string, allow_transient=False)
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string):
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef bytes byte_string = py_string.encode("utf8")
|
||||
return self._intern_utf8(byte_string, len(byte_string), NULL)
|
||||
return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
|
||||
|
||||
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
    # Intern a UTF-8 byte string and return a pointer to its stored form.
    #
    # utf8_string, length: the bytes to intern.
    # precalculated_hash: the string's hash, or NULL to compute it here.
    # allow_transient: if true and a transient map exists (i.e. we are in
    #   a memory zone), the string is stored in the transient map;
    #   otherwise it is stored permanently.
    #
    # TODO: This function's API/behaviour is an unholy mess...
    # 0 means missing, but we don't bother offsetting the index.
    cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
    cdef bint store_transient = allow_transient and self._transient_map is not None
    cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
    if value is not NULL:
        return value
    if store_transient:
        value = <Utf8Str*>self._transient_map.get(key)
        if value is not NULL:
            return value
        # Transient strings live in whatever pool is currently active.
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._transient_map.set(key, value)
        self._transient_keys.push_back(key)
    else:
        # If we've already allocated a transient string, and now we
        # want to intern it permanently, we'll end up with the string
        # in both places. That seems fine -- I don't see why we need
        # to remove it from the transient map.
        #
        # Permanent strings must not be allocated from a memory zone's
        # pool, so always use the non-temporary pool here. (self.mem is
        # swapped for the zone's pool while a zone is active.)
        value = _allocate(self._non_temp_mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
        self.keys.push_back(key)
    return value
|
||||
|
|
36
spacy/tests/vocab_vectors/test_memory_zone.py
Normal file
36
spacy/tests/vocab_vectors/test_memory_zone.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_memory_zone_no_insertion():
    """An empty memory zone leaves the vocab usable afterwards."""
    vocab = Vocab()
    with vocab.memory_zone():
        pass
    # A lexeme created after the zone has closed is still served normally.
    horse = vocab["horse"]
    assert horse.text == "horse"
|
||||
|
||||
|
||||
def test_memory_zone_insertion():
    """A word first seen inside a memory zone is gone once the zone ends."""
    vocab = Vocab()
    # "dog" is created before the zone, so it is a permanent entry.
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        horse = vocab["horse"]
        assert horse.text == "horse"
    # Leaving the zone keeps the permanent word and drops the transient one.
    assert "dog" in vocab
    assert "horse" not in vocab
|
||||
|
||||
|
||||
def test_memory_zone_redundant_insertion():
    """Test that if we insert an already-existing word while
    in the memory zone, it stays persistent"""
    vocab = Vocab()
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        horse = vocab["horse"]
        assert horse.text == "horse"
        # Looking up a word that already exists must not make it transient.
        _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
|
|
@ -25,9 +25,7 @@ cdef class Tokenizer:
|
|||
cdef PhraseMatcher _special_matcher
|
||||
# TODO convert to bool in v4
|
||||
cdef int _faster_heuristics
|
||||
# TODO next one is unused and should be removed in v4
|
||||
# https://github.com/explosion/spaCy/pull/9150
|
||||
cdef int _unused_int2
|
||||
cdef public int max_cache_size
|
||||
|
||||
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
||||
cdef int _apply_special_cases(self, Doc doc) except -1
|
||||
|
|
|
@ -30,7 +30,7 @@ cdef class Tokenizer:
|
|||
"""
|
||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||
suffix_search=None, infix_finditer=None, token_match=None,
|
||||
url_match=None, faster_heuristics=True):
|
||||
url_match=None, faster_heuristics=True, max_cache_size=10000):
|
||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
|
@ -50,6 +50,7 @@ cdef class Tokenizer:
|
|||
faster_heuristics (bool): Whether to restrict the final
|
||||
Matcher-based pass for rules to those containing affixes or space.
|
||||
Defaults to True.
|
||||
max_cache_size (int): Maximum number of tokenization chunks to cache.
|
||||
|
||||
EXAMPLE:
|
||||
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||
|
@ -69,6 +70,7 @@ cdef class Tokenizer:
|
|||
self._rules = {}
|
||||
self._special_matcher = PhraseMatcher(self.vocab)
|
||||
self._load_special_cases(rules)
|
||||
self.max_cache_size = max_cache_size
|
||||
|
||||
@property
|
||||
def token_match(self):
|
||||
|
@ -397,8 +399,9 @@ cdef class Tokenizer:
|
|||
has_special, with_special_cases)
|
||||
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
|
||||
with_special_cases)
|
||||
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
||||
tokens.length - orig_size)
|
||||
if len(self._cache) < self.max_cache_size:
|
||||
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
||||
tokens.length - orig_size)
|
||||
|
||||
cdef str _split_affixes(
|
||||
self,
|
||||
|
@ -514,6 +517,9 @@ cdef class Tokenizer:
|
|||
if n <= 0:
|
||||
# avoid mem alloc of zero length
|
||||
return 0
|
||||
# Historically this check was mostly used to avoid caching
|
||||
# chunks that had tokens owned by the Doc. Now that that's
|
||||
# not a thing, I don't think we need this?
|
||||
for i in range(n):
|
||||
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
|
||||
return 0
|
||||
|
|
|
@ -41,7 +41,9 @@ cdef class Vocab:
|
|||
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||
|
||||
cdef PreshMap _by_orth
|
||||
cdef Pool _non_temp_mem
|
||||
cdef vector[attr_t] _transient_orths
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
||||
from contextlib import contextmanager
|
||||
|
||||
from thinc.types import Floats1d, FloatsXd
|
||||
from cymem.cymem import Pool
|
||||
|
||||
from . import Language
|
||||
from .lexeme import Lexeme
|
||||
|
@ -67,6 +69,8 @@ class Vocab:
|
|||
def from_bytes(
|
||||
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
|
||||
) -> Vocab: ...
|
||||
@contextmanager
|
||||
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
|
||||
|
||||
def pickle_vocab(vocab: Vocab) -> Any: ...
|
||||
def unpickle_vocab(
|
||||
|
|
|
@ -3,6 +3,8 @@ import functools
|
|||
import numpy
|
||||
import srsly
|
||||
from thinc.api import get_array_module, get_current_ops
|
||||
from contextlib import contextmanager, ExitStack
|
||||
from typing import Iterator, Optional
|
||||
|
||||
from .attrs cimport LANG, ORTH
|
||||
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
|
||||
|
@ -87,6 +89,12 @@ cdef class Vocab:
|
|||
self.lookups = lookups
|
||||
self.writing_system = writing_system
|
||||
self.get_noun_chunks = get_noun_chunks
|
||||
# During a memory_zone we replace our mem object with one
|
||||
# that's passed to us. We keep a reference to our non-temporary
|
||||
# memory here, in case we need to make an allocation we want to
|
||||
# guarantee is not temporary. This is also how we check whether
|
||||
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
|
||||
self._non_temp_mem = self.mem
|
||||
|
||||
@property
|
||||
def vectors(self):
|
||||
|
@ -114,6 +122,33 @@ cdef class Vocab:
|
|||
"""
|
||||
return self.length
|
||||
|
||||
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
    """Begin a block where resources allocated during the block will
    be freed at the end of it. If a resource was created within the
    memory zone block, accessing it outside the block is invalid.
    Behaviour of this invalid access is undefined. Memory zones should
    not be nested.

    The memory zone is helpful for services that need to process large
    volumes of text with a defined memory budget.

    mem (Optional[Pool]): The pool to use while the zone is active. A new
        pool is created if none is given.
    YIELDS (Pool): The pool in use for the duration of the zone.
    """
    if mem is None:
        mem = Pool()
    # The ExitStack allows programmatic nested context managers.
    # We don't know how many we need, so it would be awkward to have
    # them as nested blocks.
    with ExitStack() as stack:
        stack.enter_context(self.strings.memory_zone(mem))
        if hasattr(self.morphology, "memory_zone"):
            stack.enter_context(self.morphology.memory_zone(mem))
        if hasattr(self._vectors, "memory_zone"):
            stack.enter_context(self._vectors.memory_zone(mem))
        self.mem = mem
        try:
            yield mem
        finally:
            # Run the cleanup even if the body of the `with` block raised,
            # so the index never keeps entries for transient lexemes and
            # self.mem never stays pointed at the temporary pool.
            self._clear_transient_orths()
            self.mem = self._non_temp_mem
|
||||
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
"""Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
|
@ -148,8 +183,7 @@ cdef class Vocab:
|
|||
|
||||
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||
`Lexeme` if necessary using memory acquired from the given pool. If the
|
||||
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
`Lexeme` if necessary.
|
||||
"""
|
||||
if string == "":
|
||||
return &EMPTY_LEXEME
|
||||
|
@ -180,17 +214,9 @@ cdef class Vocab:
|
|||
return self._new_lexeme(mem, self.strings[orth])
|
||||
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
||||
# I think this heuristic is bad, and the Vocab should always
|
||||
# own the lexemes. It avoids weird bugs this way, as it's how the thing
|
||||
# was originally supposed to work. The best solution to the growing
|
||||
# memory use is to periodically reset the vocab, which is an action
|
||||
# that should be up to the user to do (so we don't need to keep track
|
||||
# of the doc ownership).
|
||||
# TODO: Change the C API so that the mem isn't passed in here.
|
||||
# The mem argument is deprecated, replaced by memory zones. Same with
|
||||
# this size heuristic.
|
||||
mem = self.mem
|
||||
# if len(string) < 3 or self.length < 10000:
|
||||
# mem = self.mem
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
||||
lex.orth = self.strings.add(string)
|
||||
lex.length = len(string)
|
||||
|
@ -202,18 +228,25 @@ cdef class Vocab:
|
|||
for attr, func in self.lex_attr_getters.items():
|
||||
value = func(string)
|
||||
if isinstance(value, str):
|
||||
value = self.strings.add(value)
|
||||
value = self.strings.add(value, allow_transient=True)
|
||||
if value is not None:
|
||||
Lexeme.set_struct_attr(lex, attr, value)
|
||||
if not is_oov:
|
||||
self._add_lex_to_vocab(lex.orth, lex)
|
||||
self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
|
||||
if lex == NULL:
|
||||
raise ValueError(Errors.E085.format(string=string))
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
|
||||
self._by_orth.set(lex.orth, <void*>lex)
|
||||
self.length += 1
|
||||
if is_transient:
|
||||
self._transient_orths.push_back(lex.orth)
|
||||
|
||||
def _clear_transient_orths(self):
    """Remove transient lexemes from the index (generally at the end of the memory zone)"""
    # NOTE(review): entries are removed from the index here but
    # self.length is left unchanged -- confirm whether len(vocab) is
    # meant to keep counting lexemes that were dropped.
    for i in range(self._transient_orths.size()):
        self._by_orth.pop(self._transient_orths[i])
    self._transient_orths.clear()
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Check whether the string or int key has an entry in the vocabulary.
|
||||
|
@ -265,7 +298,7 @@ cdef class Vocab:
|
|||
"""
|
||||
cdef attr_t orth
|
||||
if isinstance(id_or_string, str):
|
||||
orth = self.strings.add(id_or_string)
|
||||
orth = self.strings.add(id_or_string, allow_transient=True)
|
||||
else:
|
||||
orth = id_or_string
|
||||
return Lexeme(self, orth)
|
||||
|
@ -417,7 +450,7 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab#get_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
orth = self.strings.add(orth, allow_transient=True)
|
||||
cdef Lexeme lex = self[orth]
|
||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||
if self.has_vector(key):
|
||||
|
@ -436,7 +469,7 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab#set_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
orth = self.strings.add(orth, allow_transient=False)
|
||||
cdef Lexeme lex = self[orth]
|
||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||
if self.vectors.is_full and key not in self.vectors:
|
||||
|
@ -460,7 +493,7 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab#has_vector
|
||||
"""
|
||||
if isinstance(orth, str):
|
||||
orth = self.strings.add(orth)
|
||||
orth = self.strings.add(orth, allow_transient=True)
|
||||
cdef Lexeme lex = self[orth]
|
||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||
return key in self.vectors
|
||||
|
|
Loading…
Reference in New Issue
Block a user