from pathlib import Path
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Protocol,
    Tuple,
    Union,
    overload,
)

import numpy as np
from cymem.cymem import Pool
from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged

from ..lexeme import Lexeme
from ..vocab import Vocab
from .retokenizer import Retokenizer
from .span import Span
from .span_groups import SpanGroups
from .token import Token
from .underscore import Underscore

DOCBIN_ALL_ATTRS: Tuple[str, ...]

class DocMethod(Protocol):
    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]

class Doc:
    vocab: Vocab
    mem: Pool
    spans: SpanGroups
    max_length: int
    length: int
    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
    cats: Dict[str, float]
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
    tensor: np.ndarray[Any, np.dtype[np.float_]]
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    _context: Any
    @classmethod
    def set_extension(
        cls,
        name: str,
        default: Optional[Any] = ...,
        getter: Optional[Callable[[Doc], Any]] = ...,
        setter: Optional[Callable[[Doc, Any], None]] = ...,
        method: Optional[DocMethod] = ...,
        force: bool = ...,
    ) -> None: ...
    @classmethod
    def get_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    @classmethod
    def has_extension(cls, name: str) -> bool: ...
    @classmethod
    def remove_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[DocMethod],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    def __init__(
        self,
        vocab: Vocab,
        words: Optional[List[str]] = ...,
        spaces: Optional[List[bool]] = ...,
        user_data: Optional[Dict[Any, Any]] = ...,
        tags: Optional[List[str]] = ...,
        pos: Optional[List[str]] = ...,
        morphs: Optional[List[str]] = ...,
        lemmas: Optional[List[str]] = ...,
        heads: Optional[List[int]] = ...,
        deps: Optional[List[str]] = ...,
        sent_starts: Optional[List[Union[bool, int, None]]] = ...,
        ents: Optional[List[str]] = ...,
    ) -> None: ...
    @property
    def _(self) -> Underscore: ...
    @property
    def is_tagged(self) -> bool: ...
    @property
    def is_parsed(self) -> bool: ...
    @property
    def is_nered(self) -> bool: ...
    @property
    def is_sentenced(self) -> bool: ...
    def has_annotation(
        self, attr: Union[int, str], *, require_complete: bool = ...
    ) -> bool: ...
    @overload
    def __getitem__(self, i: int) -> Token: ...
    @overload
    def __getitem__(self, i: slice) -> Span: ...
    def __iter__(self) -> Iterator[Token]: ...
    def __len__(self) -> int: ...
    def __unicode__(self) -> str: ...
    def __bytes__(self) -> bytes: ...
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    @property
    def doc(self) -> Doc: ...
    def char_span(
        self,
        start_idx: int,
        end_idx: int,
        label: Union[int, str] = ...,
        *,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
        span_id: Union[int, str] = ...,
    ) -> Span: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
    @property
    def has_vector(self) -> bool: ...
    vector: Floats1d
    vector_norm: float
    @property
    def text(self) -> str: ...
    @property
    def text_with_ws(self) -> str: ...
    ents: Tuple[Span]
    def set_ents(
        self,
        entities: List[Span],
        *,
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...,
    ) -> None: ...
    @property
    def noun_chunks(self) -> Tuple[Span]: ...
    @property
    def sents(self) -> Tuple[Span]: ...
    @property
    def lang(self) -> int: ...
    @property
    def lang_(self) -> str: ...
    def count_by(
        self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
    ) -> Dict[Any, int]: ...
    def from_array(
        self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
    @staticmethod
    def from_docs(
        docs: List[Doc],
        ensure_whitespace: bool = ...,
        attrs: Optional[Union[Tuple[Union[str, int]], List[Union[int, str]]]] = ...,
    ) -> Doc: ...
    def get_lca_matrix(self) -> Ints2d: ...
    def copy(self) -> Doc: ...
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ...
    ) -> Doc: ...
    def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
    ) -> Doc: ...
    def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
    def from_dict(
        self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
    ) -> Doc: ...
    def extend_tensor(self, tensor: Floats2d) -> None: ...
    def retokenize(self) -> Retokenizer: ...
    def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ...
    def from_json(
        self, doc_json: Dict[str, Any] = ..., validate: bool = False
    ) -> Doc: ...
    def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
    @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...