spaCy/spacy/tokens/doc.pyi
Adriane Boyd c053f158c5
Add support for floret vectors (#8909)
* Add support for fasttext-bloom hash-only vectors

Overview:

* Extend `Vectors` to have two modes: `default` and `ngram`
  * `default` is the default mode and equivalent to the current
    `Vectors`
  * `ngram` supports the hash-only ngram tables from `fasttext-bloom`
* Extend `spacy.StaticVectors.v2` to handle both modes with no changes
  for `default` vectors
* Extend `spacy init vectors` to support ngram tables

The `ngram` mode **only** supports vector tables produced by the fork
of fastText linked below, which adds an option to represent all vectors
using only the ngram bucket table and which uses the exact same ngram
generation algorithm and hash function (`MurmurHash3_x64_128`).
`fasttext-bloom` produces an additional `.hashvec` table, which can be
loaded by `spacy init vectors --fasttext-bloom-vectors`.

https://github.com/adrianeboyd/fastText/tree/feature/bloom
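
As a hedged sketch of how the two modes are constructed (names follow
the later rename from `ngram` to `floret`; every shape and
hyperparameter below is illustrative, not taken from a real table):

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

# default mode: keyed rows, equivalent to the previous Vectors behavior
default_vectors = Vectors(strings=StringStore(), shape=(1000, 300))

# floret mode: only the hashed ngram bucket table, no per-word rows
floret_vectors = Vectors(
    strings=StringStore(),
    data=numpy.zeros((20000, 300), dtype="float32"),  # buckets x width
    mode="floret",
    minn=4,            # shortest subword ngram
    maxn=5,            # longest subword ngram
    hash_count=2,      # number of hashes per ngram
    hash_seed=2166136261,
    bow="<",           # beginning-of-word marker
    eow=">",           # end-of-word marker
)
```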

Implementation details:

* `Vectors` now includes the `StringStore` as `Vectors.strings` so that
  the API can stay consistent for both `default` (which can look up from
  `str` or `int`) and `ngram` (which requires `str` to calculate the
  ngrams); see the sketch after this list.

* In `ngram` mode, `Vectors` uses a default-mode `Vectors` object as a
  cache, since ngram vector lookups are relatively expensive.

  * The default cache size is the same size as the provided ngram vector
    table.

  * Once the cache is full, no more entries are added. The user is
    responsible for managing the cache in cases where the initial
    documents are not representative of the texts.

  * The cache can be resized by setting `Vectors.ngram_cache_size` or
    cleared with `vectors._ngram_cache.clear()`.

* The API ends up somewhat split between methods for `default` and for
  `ngram`, so methods that only make sense for one mode include warnings
  with custom messages suggesting alternatives where possible.

* `Vocab.vectors` becomes a property so that the string stores can be
  synced when assigning vectors to a vocab.

* `Vectors` serializes its own config settings as `vectors.cfg`.

* The `Vectors` serialization methods now support `exclude` so that the
  `Vocab` can exclude the `Vectors` strings while serializing.

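A hedged, self-contained sketch of the shared `StringStore`, the
`Vocab.vectors` property, and `exclude` during serialization (again
using the final `floret` naming; the tiny tables are illustrative):

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors
from spacy.vocab import Vocab

# default mode: rows can be looked up by str or int key
default_vectors = Vectors(strings=StringStore(), shape=(10, 8))
default_vectors.add("apple", vector=numpy.ones((8,), dtype="float32"))
key = default_vectors.strings["apple"]
assert (default_vectors["apple"] == default_vectors[key]).all()

# ngram/floret mode: the surface string is required to generate ngrams
floret_vectors = Vectors(
    strings=StringStore(),
    data=numpy.zeros((100, 8), dtype="float32"),
    mode="floret",
    minn=2,
    maxn=3,
)
row = floret_vectors["apple"]

# Vocab.vectors is now a property, so assignment can sync string stores.
vocab = Vocab()
vocab.vectors = floret_vectors

# Vectors serializes its own config as vectors.cfg; strings can be
# excluded so the Vocab can serialize them itself.
data = floret_vectors.to_bytes(exclude=["strings"])
```
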
Removed:

* The `minn` and `maxn` options and related code from
  `Vocab.get_vector`, which do not work in a meaningful way for default
  vector tables.

* The unused `GlobalRegistry` in `Vectors`.

* Refactor to use reduce_mean

Refactor to use reduce_mean and remove the ngram vectors cache.
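
For context, a minimal sketch of Thinc's `reduce_mean`, the pooling
layer presumably used here to average per-ngram rows into one vector
per word (the ragged input below is made up):

```python
import numpy
from thinc.api import reduce_mean
from thinc.types import Ragged

pooling = reduce_mean()  # Model[Ragged, Floats2d], no trainable params

# Five ngram rows split across two words: three rows, then two rows.
rows = numpy.arange(10, dtype="float32").reshape(5, 2)
lengths = numpy.asarray([3, 2], dtype="int32")
means, backprop = pooling(Ragged(rows, lengths), is_train=False)
assert means.shape == (2, 2)  # one mean vector per word
```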

* Rename to floret

* Rename to floret in error messages

* Use --vectors-mode in CLI, vector init

* Fix vectors mode in init

* Remove unused var

* Minor API and docstrings adjustments

* Rename `--vectors-mode` to `--mode` in `init vectors` CLI
* Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support
  both modes.
* Minor updates to Vectors docstrings.
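
A hedged sketch of the renamed interfaces; the language code, paths,
shapes, and words below are placeholders:

```python
# CLI: import a floret table with the renamed option (illustrative paths):
#   python -m spacy init vectors en vectors.floret ./output_dir --mode floret

import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

vectors = Vectors(
    strings=StringStore(),
    data=numpy.zeros((100, 8), dtype="float32"),
    mode="floret",
    minn=2,
    maxn=3,
)

# get_batch supports both modes; in floret mode each vector is computed
# on the fly from the hashed ngram buckets.
batch = vectors.get_batch(["apple", "apples"])
assert batch.shape == (2, 8)
```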

* Update API docs for Vectors and init vectors CLI

* Update types for StaticVectors
2021-10-27 14:08:31 +02:00


from typing import Callable, Protocol, Iterable, Iterator, Optional
from typing import Union, Tuple, List, Dict, Any, overload
from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints2d
from .span import Span
from .token import Token
from ._dict_proxies import SpanGroups
from ._retokenize import Retokenizer
from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
import numpy
class DocMethod(Protocol):
    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]

class Doc:
    vocab: Vocab
    mem: Pool
    spans: SpanGroups
    max_length: int
    length: int
    sentiment: float
    cats: Dict[str, float]
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
    tensor: numpy.ndarray
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    @classmethod
    def set_extension(
        cls,
        name: str,
        default: Optional[Any] = ...,
        getter: Optional[Callable[[Doc], Any]] = ...,
        setter: Optional[Callable[[Doc, Any], None]] = ...,
        method: Optional[DocMethod] = ...,
        force: bool = ...,
    ) -> None: ...
    @classmethod
    def get_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    @classmethod
    def has_extension(cls, name: str) -> bool: ...
    @classmethod
    def remove_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    def __init__(
        self,
        vocab: Vocab,
        words: Optional[List[str]] = ...,
        spaces: Optional[List[bool]] = ...,
        user_data: Optional[Dict[Any, Any]] = ...,
        tags: Optional[List[str]] = ...,
        pos: Optional[List[str]] = ...,
        morphs: Optional[List[str]] = ...,
        lemmas: Optional[List[str]] = ...,
        heads: Optional[List[int]] = ...,
        deps: Optional[List[str]] = ...,
        sent_starts: Optional[List[Union[bool, None]]] = ...,
        ents: Optional[List[str]] = ...,
    ) -> None: ...
    @property
    def _(self) -> Underscore: ...
    @property
    def is_tagged(self) -> bool: ...
    @property
    def is_parsed(self) -> bool: ...
    @property
    def is_nered(self) -> bool: ...
    @property
    def is_sentenced(self) -> bool: ...
    def has_annotation(
        self, attr: Union[int, str], *, require_complete: bool = ...
    ) -> bool: ...
    @overload
    def __getitem__(self, i: int) -> Token: ...
    @overload
    def __getitem__(self, i: slice) -> Span: ...
    def __iter__(self) -> Iterator[Token]: ...
    def __len__(self) -> int: ...
    def __unicode__(self) -> str: ...
    def __bytes__(self) -> bytes: ...
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    @property
    def doc(self) -> Doc: ...
    def char_span(
        self,
        start_idx: int,
        end_idx: int,
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
    ) -> Span: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
    @property
    def has_vector(self) -> bool: ...
    vector: Floats1d
    vector_norm: float
    @property
    def text(self) -> str: ...
    @property
    def text_with_ws(self) -> str: ...
    ents: Tuple[Span, ...]
    def set_ents(
        self,
        entities: List[Span],
        *,
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...
    ) -> None: ...
    @property
    def noun_chunks(self) -> Iterator[Span]: ...
    @property
    def sents(self) -> Iterator[Span]: ...
    @property
    def lang(self) -> int: ...
    @property
    def lang_(self) -> str: ...
    def count_by(
        self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
    ) -> Dict[Any, int]: ...
    def from_array(
        self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
    ) -> numpy.ndarray: ...
    @staticmethod
    def from_docs(
        docs: List[Doc],
        ensure_whitespace: bool = ...,
        attrs: Optional[Union[Tuple[Union[str, int], ...], List[Union[int, str]]]] = ...,
    ) -> Doc: ...
    def get_lca_matrix(self) -> Ints2d: ...
    def copy(self) -> Doc: ...
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str, ...]] = ...
    ) -> Doc: ...
    def to_bytes(self, *, exclude: Union[List[str], Tuple[str, ...]] = ...) -> bytes: ...
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str, ...]] = ...
    ) -> Doc: ...
    def to_dict(self, *, exclude: Union[List[str], Tuple[str, ...]] = ...) -> bytes: ...
    def from_dict(
        self, msg: bytes, *, exclude: Union[List[str], Tuple[str, ...]] = ...
    ) -> Doc: ...
    def extend_tensor(self, tensor: Floats2d) -> None: ...
    def retokenize(self) -> Retokenizer: ...
    def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ...
    def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
    @staticmethod
    def _get_array_attrs() -> Tuple[Any, ...]: ...
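
# A quick usage sketch (hedged, not part of the stub): standard Doc API
# consistent with the annotations above; the words and the extension
# name are arbitrary.
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["Hello", "world", "!"], spaces=[True, False, False])
assert len(doc) == 3                    # __len__
assert doc[0].text == "Hello"           # __getitem__ with an int -> Token
assert doc[0:2].text == "Hello world"   # __getitem__ with a slice -> Span

# Extensions are registered on the class and accessed through `doc._`.
Doc.set_extension("title", default=None)
doc._.title = "greeting"
assert doc._.title == "greeting"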