# Static type declarations for the `Doc` container and its extension-method
# protocol. Every callable body is `...`; only signatures and attribute types
# are declared here.
from pathlib import Path
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Protocol,
    Sequence,
    Tuple,
    Union,
    overload,
)

import numpy as np
from cymem.cymem import Pool
from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged

from ..lexeme import Lexeme
from ..vocab import Vocab
from .retokenizer import Retokenizer
from .span import Span
from .span_groups import SpanGroups
from .token import Token
from .underscore import Underscore

DOCBIN_ALL_ATTRS: Tuple[str, ...]


class DocMethod(Protocol):
    """Callable protocol accepted as the `method` argument of `Doc.set_extension`.

    The first positional argument (`self`) is annotated as a `Doc`; any further
    positional and keyword arguments are accepted and the result is untyped.
    """

    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]


class Doc:
    """Type declarations for the `Doc` class: attributes, properties, methods."""

    # ------------------------------------------------------------------
    # Instance attributes
    # ------------------------------------------------------------------
    vocab: Vocab
    mem: Pool
    spans: SpanGroups
    max_length: int
    length: int
    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
    cats: Dict[str, float]
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
    tensor: np.ndarray[Any, np.dtype[np.float64]]
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    _context: Any

    # ------------------------------------------------------------------
    # Extension registry (class-level)
    # ------------------------------------------------------------------
    @classmethod
    def set_extension(
        cls,
        name: str,
        default: Optional[Any] = ...,
        getter: Optional[Callable[[Doc], Any]] = ...,
        setter: Optional[Callable[[Doc, Any], None]] = ...,
        method: Optional[DocMethod] = ...,
        force: bool = ...,
    ) -> None: ...

    # The 4-tuple slots below have the same types as set_extension's
    # `default`, `method`, `getter` and `setter` parameters, in that order.
    @classmethod
    def get_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    @classmethod
    def has_extension(cls, name: str) -> bool: ...
    # Returns the same 4-tuple shape as get_extension.
    @classmethod
    def remove_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------
    def __init__(
        self,
        vocab: Vocab,
        words: Optional[List[str]] = ...,
        spaces: Optional[List[bool]] = ...,
        user_data: Optional[Dict[Any, Any]] = ...,
        tags: Optional[List[str]] = ...,
        pos: Optional[List[str]] = ...,
        morphs: Optional[List[str]] = ...,
        lemmas: Optional[List[str]] = ...,
        heads: Optional[List[int]] = ...,
        deps: Optional[List[str]] = ...,
        sent_starts: Optional[List[Union[bool, int, None]]] = ...,
        ents: Optional[List[str]] = ...,
    ) -> None: ...

    # ------------------------------------------------------------------
    # Annotation status
    # ------------------------------------------------------------------
    @property
    def _(self) -> Underscore: ...
    @property
    def is_tagged(self) -> bool: ...
    @property
    def is_parsed(self) -> bool: ...
    @property
    def is_nered(self) -> bool: ...
    @property
    def is_sentenced(self) -> bool: ...
    def has_annotation(
        self, attr: Union[int, str], *, require_complete: bool = ...
    ) -> bool: ...

    # ------------------------------------------------------------------
    # Container protocol: an int index yields a Token, a slice yields a Span
    # ------------------------------------------------------------------
    @overload
    def __getitem__(self, i: int) -> Token: ...
    @overload
    def __getitem__(self, i: slice) -> Span: ...
    def __iter__(self) -> Iterator[Token]: ...
    def __len__(self) -> int: ...
    def __unicode__(self) -> str: ...
    def __bytes__(self) -> bytes: ...
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    @property
    def doc(self) -> Doc: ...

    # ------------------------------------------------------------------
    # Spans, similarity and vectors
    # ------------------------------------------------------------------
    def char_span(
        self,
        start_idx: int,
        end_idx: int,
        label: Union[int, str] = ...,
        *,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
        span_id: Union[int, str] = ...,
    ) -> Optional[Span]: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
    @property
    def has_vector(self) -> bool: ...
    vector: Floats1d
    vector_norm: float

    # ------------------------------------------------------------------
    # Text and entities
    # ------------------------------------------------------------------
    @property
    def text(self) -> str: ...
    @property
    def text_with_ws(self) -> str: ...
    # Ideally the getter would output Tuple[Span, ...]
    # see https://github.com/python/mypy/issues/3004
    @property
    def ents(self) -> Sequence[Span]: ...
    @ents.setter
    def ents(self, value: Sequence[Span]) -> None: ...
    def set_ents(
        self,
        entities: List[Span],
        *,
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...,
    ) -> None: ...

    # `Tuple[Span, ...]` is a tuple of any length; a bare `Tuple[Span]` would
    # declare a tuple of exactly one Span.
    @property
    def noun_chunks(self) -> Tuple[Span, ...]: ...
    @property
    def sents(self) -> Tuple[Span, ...]: ...
    @property
    def lang(self) -> int: ...
    @property
    def lang_(self) -> str: ...

    # ------------------------------------------------------------------
    # Array conversion and merging
    # ------------------------------------------------------------------
    def count_by(
        self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
    ) -> Dict[Any, int]: ...
    def from_array(
        self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
    ) -> np.ndarray[Any, np.dtype[np.float64]]: ...
    # `attrs` accepts a tuple of any length (or a list) of attribute
    # names/IDs, hence the `...` in the tuple annotation.
    @staticmethod
    def from_docs(
        docs: List[Doc],
        ensure_whitespace: bool = ...,
        attrs: Optional[
            Union[Tuple[Union[str, int], ...], List[Union[int, str]]]
        ] = ...,
    ) -> Doc: ...
    def get_lca_matrix(self) -> Ints2d: ...
    def copy(self) -> Doc: ...

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> Doc: ...
    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
    def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ...
    def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ...
    def from_dict(
        self, msg: Dict[str, Any], *, exclude: Iterable[str] = ...
    ) -> Doc: ...
    def extend_tensor(self, tensor: Floats2d) -> None: ...
    def retokenize(self) -> Retokenizer: ...
    def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ...
    def from_json(
        self, doc_json: Dict[str, Any] = ..., validate: bool = False
    ) -> Doc: ...
    def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
    # Variable-length tuple, matching the `Tuple[str, ...]` spelling used for
    # DOCBIN_ALL_ATTRS.
    @staticmethod
    def _get_array_attrs() -> Tuple[Any, ...]: ...