Mirror of https://github.com/explosion/spaCy.git

Integrate memory zones into v3.x

commit 3c86cc669a
parent 319e02545c
spacy/strings.pxd
@@ -25,5 +25,10 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+
+    cdef vector[hash_t] _transient_keys
+    cdef PreshMap _transient_map
+    cdef Pool _non_temp_mem
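The declarations above split the store in two: `_map`/`keys` hold permanent entries, while `_transient_map`/`_transient_keys` hold entries that only live as long as a memory zone, with `_non_temp_mem` keeping a handle on the permanent pool. A minimal pure-Python sketch of that two-tier design (illustrative only, not spaCy code; all names are hypothetical):

from contextlib import contextmanager

class TwoTierStore:
    """Toy model: permanent entries in one map, transient entries in another."""

    def __init__(self):
        self._map = {}              # permanent: lives as long as the store
        self._transient_map = None  # only exists inside a memory zone

    def add(self, key, value, allow_transient=False):
        if allow_transient and self._transient_map is not None:
            self._transient_map[key] = value
        else:
            self._map[key] = value

    def get(self, key):
        if key in self._map:        # permanent entries are checked first
            return self._map[key]
        if self._transient_map is not None:
            return self._transient_map.get(key)
        return None

    @contextmanager
    def memory_zone(self):
        self._transient_map = {}
        try:
            yield
        finally:
            # Dropping the map frees every transient entry at once.
            self._transient_map = None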
spacy/strings.pyx
@@ -1,6 +1,9 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
+from contextlib import contextmanager
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
@@ -119,7 +122,9 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
+        self._transient_map = None
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -152,10 +157,13 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL and self._transient_map is not None:
+                    utf8str = <Utf8Str*>self._transient_map.get(str_hash)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
+            if utf8str is NULL and self._transient_map is not None:
+                utf8str = <Utf8Str*>self._transient_map.get(str_hash)
         if utf8str is NULL:
             raise KeyError(Errors.E018.format(hash_value=string_or_id))
         else:
@@ -175,10 +183,46 @@ cdef class StringStore:
         else:
             return self[key]
 
-    def add(self, string):
+    def __reduce__(self):
+        strings = list(self.non_transient_keys())
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool]=None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        self._transient_map = PreshMap()
+        yield mem
+        self.mem = self._non_temp_mem
+        self._transient_map = None
+        self._transient_keys.clear()
+
+    def add(self, string: str, allow_transient: bool = False) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
         cdef hash_t str_hash
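A usage sketch for the StringStore zone API added above (hypothetical snippet, not part of the commit; the strings are arbitrary):

from spacy.strings import StringStore

store = StringStore()
store.add("permanent-label", allow_transient=False)   # survives forever
with store.memory_zone():
    h = store.add("rare-token", allow_transient=True)
    assert store[h] == "rare-token"   # resolvable while the zone is open
assert "permanent-label" in store     # unaffected by the zone
assert "rare-token" not in store      # flushed when the zone closed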
@@ -188,22 +232,26 @@ cdef class StringStore:
 
             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash
 
     def __len__(self):
         """The number of strings in the store.
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string, allow_transient)
 
         RETURNS (int): The number of strings in the store.
         """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()
 
     def __contains__(self, string_or_id not None):
         """Check whether a string or ID is in the store.
@@ -222,30 +270,70 @@ cdef class StringStore:
             pass
         else:
             # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
 
     def __iter__(self):
         """Iterate over the strings in the store, in order.
 
         YIELDS (str): A string in the store.
         """
+        yield from self.non_transient_keys()
+        yield from self.transient_keys()
+
+    def non_transient_keys(self) -> Iterator[str]:
+        """Iterate over the stored strings in insertion order.
+
+        RETURNS: A list of strings.
+        """
         cdef int i
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
 
     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)
 
+    def transient_keys(self) -> Iterator[str]:
+        if self._transient_map is None:
+            return []
+        for i in range(self._transient_keys.size()):
+            utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
+            yield decode_Utf8Str(utf8str)
+
+    def values(self) -> List[int]:
+        """Iterate over the stored string hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        if self._transient_map is not None:
+            transient_hashes = [None] * self._transient_keys.size()
+            for i in range(self._transient_keys.size()):
+                transient_hashes[i] = self._transient_keys[i]
+        else:
+            transient_hashes = []
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.
 
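An illustrative run of the iteration helpers added above (hypothetical snippet, not from the commit):

from spacy.strings import StringStore

store = StringStore()
store.add("permanent-string", allow_transient=False)
with store.memory_zone():
    store.add("temporary-string", allow_transient=True)
    assert set(store) == {"permanent-string", "temporary-string"}
    assert list(store.transient_keys()) == ["temporary-string"]
assert list(store.transient_keys()) == []   # flushed with the zone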
@@ -269,7 +357,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def to_bytes(self, **kwargs):
@@ -289,7 +377,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def _reset_and_load(self, strings):
@@ -297,22 +385,34 @@ cdef class StringStore:
         self._map = PreshMap()
         self.keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
+        if allow_transient and self._transient_map is not None:
+            # If we've already allocated a transient string, and now we
+            # want to intern it permanently, we'll end up with the string
+            # in both places. That seems fine -- I don't see why we need
+            # to remove it from the transient map.
+            value = <Utf8Str*>self._transient_map.get(key)
+            if value is not NULL:
+                return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-        self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self._transient_map is not None:
+            self._transient_map.set(key, value)
+            self._transient_keys.push_back(key)
+        else:
+            self._map.set(key, value)
+            self.keys.push_back(key)
         return value
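A hypothetical snippet illustrating the double-interning case the comment above allows for (not from the commit):

from spacy.strings import StringStore

store = StringStore()
with store.memory_zone():
    h1 = store.add("shared-string", allow_transient=True)   # transient copy
    h2 = store.add("shared-string", allow_transient=False)  # permanent copy
    assert h1 == h2                  # same hash regardless of which map holds it
assert "shared-string" in store      # still registered in the permanent map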
spacy/tests/vocab_vectors/test_memory_zone.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
         faster_heuristics (bool): Whether to restrict the final
             Matcher-based pass for rules to those containing affixes or space.
             Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size
 
     @property
     def token_match(self):
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)
 
     cdef str _split_affixes(
         self,
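A hypothetical use of the new cap added above (the 2**14 value is arbitrary; the default stays at 10000, and the affix rules are omitted for brevity):

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# Replace the default tokenizer with one whose chunk cache stops growing
# once it holds 2**14 entries.
nlp.tokenizer = Tokenizer(nlp.vocab, max_cache_size=2**14)
doc = nlp("This text is tokenized with a bounded cache.")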
@@ -514,6 +517,9 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
+        # Historically this check was mostly used to avoid caching
+        # chunks that had tokens owned by the Doc. Now that that's
+        # not a thing, I don't think we need this?
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
spacy/vocab.pyi
@@ -1,7 +1,9 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
+from contextlib import contextmanager
 
 from thinc.types import Floats1d, FloatsXd
+from cymem.cymem import Pool
 
 from . import Language
 from .lexeme import Lexeme
@@ -67,6 +69,8 @@ class Vocab:
     def from_bytes(
         self, bytes_data: bytes, *, exclude: Iterable[str] = ...
     ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
spacy/vocab.pyx
@@ -3,6 +3,8 @@ import functools
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from contextlib import contextmanager, ExitStack
+from typing import Iterator, Optional
 
 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@@ -87,6 +89,12 @@ cdef class Vocab:
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem
 
     @property
     def vectors(self):
@@ -114,6 +122,33 @@ cdef class Vocab:
         """
         return self.length
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
     def add_flag(self, flag_getter, int flag_id=-1):
         """Set a new boolean flag to words in the vocabulary.
 
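A usage sketch of the service-style loop the docstring describes (hypothetical snippet, not part of the commit; the texts are stand-ins):

import spacy

nlp = spacy.blank("en")
texts = ["First document.", "Second document."]  # stand-in stream
with nlp.vocab.memory_zone():
    for doc in nlp.pipe(texts):
        print([token.text for token in doc])  # consume docs inside the zone
# Strings and lexemes interned during the zone have been released here.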
@@ -148,8 +183,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
         """
         if string == "":
             return &EMPTY_LEXEME
@@ -180,17 +214,9 @@ cdef class Vocab:
             return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones. Same with
+        # this size heuristic.
         mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #     mem = self.mem
-        cdef bint is_oov = mem is not self.mem
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
@@ -202,18 +228,25 @@ cdef class Vocab:
         for attr, func in self.lex_attr_getters.items():
             value = func(string)
             if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
+        if is_transient:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            self._by_orth.pop(orth)
+        self._transient_orths.clear()
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
@@ -265,7 +298,7 @@ cdef class Vocab:
         """
         cdef attr_t orth
         if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
         else:
             orth = id_or_string
         return Lexeme(self, orth)
@@ -417,7 +450,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#get_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.has_vector(key):
@@ -436,7 +469,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#set_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.vectors.is_full and key not in self.vectors:
@@ -460,7 +493,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#has_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
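The asymmetry in the three vector hunks above (reads intern transiently, writes intern permanently) matters because the vectors table keeps the key after the zone closes. A hypothetical illustration, not from the commit:

import numpy
import spacy

nlp = spacy.blank("en")
# Writing pins "word" permanently: the vectors table holds its key past
# any zone, so the string must not be flushed with a zone.
nlp.vocab.set_vector("word", numpy.ones((3,), dtype="float32"))
with nlp.vocab.memory_zone():
    # Reading may intern transiently: "word" is only looked up here.
    assert nlp.vocab.has_vector("word")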