spaCy/spacy/vocab.pyi
Adriane Boyd f98b41c390
Add vector deduplication (#10551)
* Add vector deduplication

* Add `Vocab.deduplicate_vectors()`
* Always run deduplication in `spacy init vectors`
* Clean up a few vector-related error messages and docs examples

* Always unique with numpy

* Fix types
2022-03-30 08:54:23 +02:00

80 lines
2.7 KiB
Python

from typing import Callable, Iterator, Optional, Union, List, Dict
from typing import Any, Iterable
from thinc.types import Floats1d, FloatsXd
from . import Language
from .strings import StringStore
from .lexeme import Lexeme
from .lookups import Lookups
from .morphology import Morphology
from .tokens import Doc, Span
from .vectors import Vectors
from pathlib import Path
def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
) -> Vocab: ...
class Vocab:
cfg: Dict[str, Any]
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
lookups: Lookups
morphology: Morphology
strings: StringStore
vectors: Vectors
writing_system: Dict[str, Any]
def __init__(
self,
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ...,
oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ...
@property
def lang(self) -> str: ...
def __len__(self) -> int: ...
def add_flag(
self, flag_getter: Callable[[str], bool], flag_id: int = ...
) -> int: ...
def __contains__(self, key: str) -> bool: ...
def __iter__(self) -> Iterator[Lexeme]: ...
def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...
@property
def vectors_length(self) -> int: ...
def reset_vectors(
self, *, width: Optional[int] = ..., shape: Optional[int] = ...
) -> None: ...
def deduplicate_vectors(self) -> None: ...
def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
def get_vector(
self,
orth: Union[int, str],
minn: Optional[int] = ...,
maxn: Optional[int] = ...,
) -> FloatsXd: ...
def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
def has_vector(self, orth: Union[int, str]) -> bool: ...
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> None: ...
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> Vocab: ...
def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ...
def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(
sstore: StringStore,
vectors: Any,
morphology: Any,
_unused_object: Any,
lex_attr_getters: Any,
lookups: Any,
get_noun_chunks: Any,
) -> Vocab: ...