mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
1b8d560d0e
Add a context manager nlp.memory_zone(), which will begin memory_zone() blocks on the vocab, string store, and potentially other components. Example usage:
```
with nlp.memory_zone():
    for doc in nlp.pipe(texts):
        do_something(doc)
# do_something(doc) <-- Invalid
```
Once the memory_zone() block expires, spaCy will free any shared resources that were allocated for the text-processing that occurred within the memory_zone. If you create Doc objects within a memory zone, it's invalid to access them once the memory zone has expired. The purpose of this is that spaCy creates and stores Lexeme objects in the Vocab that can be shared between multiple Doc objects. It also interns strings. Normally, spaCy can't know when all Doc objects using a Lexeme are out-of-scope, so new Lexemes accumulate in the vocab, causing memory pressure. Memory zones solve this problem by telling spaCy "okay, none of the documents allocated within this block will be accessed again". This lets spaCy free all new Lexeme objects and other data that were created during the block. The mechanism is general, so memory_zone() context managers can be added to other components that could benefit from them, e.g. pipeline components. I experimented with adding memory zone support to the tokenizer as well, for its cache. However, this seems unnecessarily complicated. It makes more sense to just stick a limit on the cache size. This lets spaCy benefit from the efficiency advantage of the cache better, because we can maintain a (bounded) cache even if only small batches of documents are being processed.
85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
|
|
|
from cymem.cymem import Pool
|
|
from thinc.types import Floats1d, FloatsXd
|
|
|
|
from . import Language
|
|
from .lexeme import Lexeme
|
|
from .lookups import Lookups
|
|
from .morphology import Morphology
|
|
from .strings import StringStore
|
|
from .tokens import Doc, Span
|
|
from .vectors import Vectors
|
|
|
|
# Stub signature for the module-level Vocab factory.
#   lang:         optional language code string.
#   defaults:     untyped here (Any); presumably the language defaults
#                 object -- confirm against the implementation.
#   vectors_name: optional; `...` only marks that a default exists.
# Declared to return a Vocab.
def create_vocab(
    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
) -> Vocab: ...
|
|
|
|
class Vocab:
    # Type stub for the vocabulary class: it only declares attribute types
    # and call signatures. Every body is the `...` placeholder, and `= ...`
    # on a parameter means "a default exists", not what the default is.

    # Instance attributes and their declared types.
    cfg: Dict[str, Any]
    get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
    lookups: Lookups
    morphology: Morphology
    strings: StringStore
    vectors: Vectors
    writing_system: Dict[str, Any]

    # All constructor arguments are optional. `strings` accepts either a
    # plain list of strings or an existing StringStore.
    def __init__(
        self,
        lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
        strings: Optional[Union[List[str], StringStore]] = ...,
        lookups: Optional[Lookups] = ...,
        oov_prob: float = ...,
        vectors_name: Optional[str] = ...,
        writing_system: Dict[str, Any] = ...,
        get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
    ) -> None: ...

    # Read-only property (no setter is declared in this stub).
    @property
    def lang(self) -> str: ...

    # Container protocol: sized, membership by string key, iteration over
    # Lexeme objects, and lookup by either a string or an integer ID.
    def __len__(self) -> int: ...
    def add_flag(
        self, flag_getter: Callable[[str], bool], flag_id: int = ...
    ) -> int: ...
    def __contains__(self, key: str) -> bool: ...
    def __iter__(self) -> Iterator[Lexeme]: ...
    def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...

    # Vector-table API.
    @property
    def vectors_length(self) -> int: ...

    # `width` and `shape` are keyword-only.
    def reset_vectors(
        self, *, width: Optional[int] = ..., shape: Optional[int] = ...
    ) -> None: ...
    def deduplicate_vectors(self) -> None: ...
    def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...

    # `orth` may be given as an integer ID or as the string itself.
    def get_vector(
        self,
        orth: Union[int, str],
        minn: Optional[int] = ...,
        maxn: Optional[int] = ...,
    ) -> FloatsXd: ...
    def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
    def has_vector(self, orth: Union[int, str]) -> bool: ...

    # Serialization. `exclude` is keyword-only in all four methods. The
    # from_* loaders are declared to return a Vocab (presumably `self`, to
    # allow chaining -- confirm against the implementation).
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...

    # Context manager for a memory zone: data newly allocated on the vocab
    # inside the `with` block (e.g. new Lexeme objects) is freed when the
    # block exits, so Doc objects created inside the block must not be used
    # afterwards. `Iterator[Pool]` is the annotation of the undecorated
    # generator; after @contextmanager the `with ... as` target is a Pool.
    # `mem` is presumably the pool to allocate into, with a fresh one created
    # when it is None -- confirm against the implementation.
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
|
|
|
|
# Stub signature: takes a Vocab and returns an untyped value (Any).
# NOTE(review): presumably the pickle reducer paired with unpickle_vocab --
# confirm against the implementation.
def pickle_vocab(
    vocab: Vocab,
) -> Any: ...
|
|
# Stub signature: builds a Vocab from seven positional components. Only
# `sstore` is typed (StringStore); the rest are declared as Any.
# `_unused_object` is a placeholder slot, going by its name.
# NOTE(review): presumably the argument order must match the tuple produced
# by the pickling side -- confirm against the implementation before
# reordering anything here.
def unpickle_vocab(
    sstore: StringStore,
    vectors: Any,
    morphology: Any,
    _unused_object: Any,
    lex_attr_getters: Any,
    lookups: Any,
    get_noun_chunks: Any,
) -> Vocab: ...
|