From 2d1e61449b20563abc165daa810b947267c1a5de Mon Sep 17 00:00:00 2001
From: Basile Dura <basile@bdura.me>
Date: Fri, 26 May 2023 17:06:21 +0200
Subject: [PATCH 1/6] feat: add example stubs

---
 spacy/training/example.pyi | 61 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 spacy/training/example.pyi

diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
new file mode 100644
index 000000000..4c2092d18
--- /dev/null
+++ b/spacy/training/example.pyi
@@ -0,0 +1,61 @@
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+from ..tokens import Doc, Span
+from .alignment import Alignment
+
+def validate_examples(examples: Iterable[Example], method: str) -> None: ...
+def validate_get_examples(
+    get_examples: Callable[[], Iterable[Example]], method: str
+): ...
+
+class Example:
+    def __init__(
+        self,
+        predicted: Doc,
+        reference: Doc,
+        *,
+        alignment: Optional[Alignment] = None,
+    ): ...
+    def __len__(self) -> int: ...
+    @property
+    def predicted(self) -> Doc: ...
+    @predicted.setter
+    def predicted(self, doc: Doc) -> None: ...
+    @property
+    def reference(self) -> Doc: ...
+    @reference.setter
+    def reference(self, doc: Doc) -> None: ...
+    def copy(self) -> Example: ...
+    @classmethod
+    def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ...
+    @property
+    def alignment(self) -> Optional[Alignment]: ...
+    def _get_aligned_vectorized(self, align, gold_values): ...
+    def _get_aligned_non_vectorized(self, align, gold_values): ...
+    def get_aligned(self, field, as_string=False): ...
+    def get_aligned_parse(self, projectivize=True): ...
+    def get_aligned_sent_starts(self): ...
+    def get_aligned_spans_x2y(self, x_spans, allow_overlap=False): ...
+    def get_aligned_spans_y2x(self, y_spans, allow_overlap=False): ...
+    def _get_aligned_spans(self, doc, spans, align, allow_overlap): ...
+    def get_aligned_ents_and_ner(self): ...
+    def get_aligned_ner(self): ...
+    def get_matching_ents(self, check_label: bool = True) -> List[Span]: ...
+    def to_dict(self) -> Dict[str, Any]: ...
+    def _spans_to_dict(self) -> Dict[str, List[Tuple[int, int, str, str]]]: ...
+    def _links_to_dict(self) -> Dict[Tuple[int, int], Dict[str, float]]: ...
+    def split_sents(self) -> List[Example]: ...
+    @property
+    def text(self) -> str: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+def _annot2array(vocab, tok_annot, doc_annot): ...
+def _add_spans_to_doc(doc, spans_data): ...
+def _add_entities_to_doc(doc, ner_data): ...
+def _parse_example_dict_data(example_dict): ...
+def _fix_legacy_dict_data(example_dict): ...
+def _has_field(annot, field) -> bool: ...
+def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ...
+def _parse_links(vocab, words, spaces, links): ...
+def _guess_spaces(text, words): ...

From 967ce504fda0d11d11d9520e68906a123818530a Mon Sep 17 00:00:00 2001
From: Basile Dura <basile@bdura.me>
Date: Wed, 31 May 2023 10:57:00 +0200
Subject: [PATCH 2/6] fix: add required annotations

---
 spacy/training/example.pyi | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
index 4c2092d18..1413d857d 100644
--- a/spacy/training/example.pyi
+++ b/spacy/training/example.pyi
@@ -3,12 +3,24 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
 from ..tokens import Doc, Span
 from .alignment import Alignment
 
-def validate_examples(examples: Iterable[Example], method: str) -> None: ...
+def annotations_to_doc(
+    vocab,
+    tok_annot,
+    doc_annot,
+) -> Doc: ...
+def validate_examples(
+    examples: Iterable[Example],
+    method: str,
+) -> None: ...
 def validate_get_examples(
-    get_examples: Callable[[], Iterable[Example]], method: str
+    get_examples: Callable[[], Iterable[Example]],
+    method: str,
 ): ...
 
 class Example:
+    x: Doc
+    y: Doc
+
     def __init__(
         self,
         predicted: Doc,
@@ -29,7 +41,7 @@ class Example:
     @classmethod
     def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ...
     @property
-    def alignment(self) -> Optional[Alignment]: ...
+    def alignment(self) -> Alignment: ...
     def _get_aligned_vectorized(self, align, gold_values): ...
     def _get_aligned_non_vectorized(self, align, gold_values): ...
     def get_aligned(self, field, as_string=False): ...

From 9cd17d7962b9600d130b96f686f3c733cfa5f744 Mon Sep 17 00:00:00 2001
From: Basile Dura <basile@bdura.me>
Date: Wed, 31 May 2023 10:57:11 +0200
Subject: [PATCH 3/6] fix: mypy issues

---
 spacy/tokens/doc.pyi     | 37 ++++++++++++++++++++++++++++---------
 spacy/training/corpus.py | 37 +++++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 9d45960ab..353983fd0 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -1,16 +1,30 @@
-from typing import Callable, Protocol, Iterable, Iterator, Optional
-from typing import Union, Tuple, List, Dict, Any, overload
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    Tuple,
+    Union,
+    overload,
+)
+
+import numpy as np
 from cymem.cymem import Pool
 from thinc.types import Floats1d, Floats2d, Ints2d
-from .span import Span
-from .token import Token
-from ._dict_proxies import SpanGroups
-from ._retokenize import Retokenizer
+
 from ..lexeme import Lexeme
 from ..vocab import Vocab
+from ._dict_proxies import SpanGroups
+from ._retokenize import Retokenizer
+from .span import Span
+from .token import Token
 from .underscore import Underscore
-from pathlib import Path
-import numpy as np
 
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
@@ -119,7 +133,12 @@ class Doc:
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...
-    ents: Tuple[Span]
+    # Ideally the getter would return Tuple[Span, ...], but mypy requires the
+    # getter and setter types to match: https://github.com/python/mypy/issues/3004
+    @property
+    def ents(self) -> Sequence[Span]: ...
+    @ents.setter
+    def ents(self, value: Sequence[Span]) -> None: ...
     def set_ents(
         self,
         entities: List[Span],
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 086ad831c..f05d09bcb 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,16 +1,16 @@
-import warnings
-from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
-from typing import Optional
-from pathlib import Path
 import random
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Protocol, Union
+
 import srsly
 
 from .. import util
+from ..errors import Errors, Warnings
+from ..tokens import Doc, DocBin
+from ..vocab import Vocab
 from .augment import dont_augment
 from .example import Example
-from ..errors import Warnings, Errors
-from ..tokens import DocBin, Doc
-from ..vocab import Vocab
 
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -19,6 +19,11 @@ if TYPE_CHECKING:
 FILE_TYPE = ".spacy"
 
 
+class ReaderProtocol(Protocol):
+    def __call__(self, nlp: "Language") -> Iterable[Example]:
+        pass
+
+
 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
     path: Optional[Path],
@@ -26,7 +31,7 @@ def create_docbin_reader(
     max_length: int = 0,
     limit: int = 0,
     augmenter: Optional[Callable] = None,
-) -> Callable[["Language"], Iterable[Example]]:
+) -> ReaderProtocol:
     if path is None:
         raise ValueError(Errors.E913)
     util.logger.debug("Loading corpus from path: %s", path)
@@ -45,7 +50,7 @@ def create_jsonl_reader(
     min_length: int = 0,
     max_length: int = 0,
     limit: int = 0,
-) -> Callable[["Language"], Iterable[Example]]:
+) -> ReaderProtocol:
     return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
@@ -63,7 +68,7 @@ def create_plain_text_reader(
     path: Optional[Path],
     min_length: int = 0,
     max_length: int = 0,
-) -> Callable[["Language"], Iterable[Doc]]:
+) -> ReaderProtocol:
     """Iterate Example objects from a file or directory of plain text
     UTF-8 files with one line per doc.
 
@@ -144,7 +149,7 @@ class Corpus:
         self.augmenter = augmenter if augmenter is not None else dont_augment
         self.shuffle = shuffle
 
-    def __call__(self, nlp: "Language") -> Iterator[Example]:
+    def __call__(self, nlp: "Language") -> Iterable[Example]:
         """Yield examples from the data.
 
         nlp (Language): The current nlp object.
@@ -182,7 +187,7 @@ class Corpus:
 
     def make_examples(
         self, nlp: "Language", reference_docs: Iterable[Doc]
-    ) -> Iterator[Example]:
+    ) -> Iterable[Example]:
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
@@ -197,7 +202,7 @@ class Corpus:
 
     def make_examples_gold_preproc(
         self, nlp: "Language", reference_docs: Iterable[Doc]
-    ) -> Iterator[Example]:
+    ) -> Iterable[Example]:
         for reference in reference_docs:
             if reference.has_annotation("SENT_START"):
                 ref_sents = [sent.as_doc() for sent in reference.sents]
@@ -210,7 +215,7 @@ class Corpus:
 
     def read_docbin(
         self, vocab: Vocab, locs: Iterable[Union[str, Path]]
-    ) -> Iterator[Doc]:
+    ) -> Iterable[Doc]:
         """Yield training examples as example dicts"""
         i = 0
         for loc in locs:
@@ -257,7 +262,7 @@ class JsonlCorpus:
         self.max_length = max_length
         self.limit = limit
 
-    def __call__(self, nlp: "Language") -> Iterator[Example]:
+    def __call__(self, nlp: "Language") -> Iterable[Example]:
         """Yield examples from the data.
 
         nlp (Language): The current nlp object.
@@ -307,7 +312,7 @@ class PlainTextCorpus:
         self.min_length = min_length
         self.max_length = max_length
 
-    def __call__(self, nlp: "Language") -> Iterator[Example]:
+    def __call__(self, nlp: "Language") -> Iterable[Example]:
         """Yield examples from the data.
 
         nlp (Language): The current nlp object.

From 792b86d333741a66519fbadbde419830cb447323 Mon Sep 17 00:00:00 2001
From: Basile Dura <basile@bdura.me>
Date: Wed, 31 May 2023 11:12:13 +0200
Subject: [PATCH 4/6] fix: use Py36-compatible Protocol

---
 spacy/training/corpus.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index f05d09bcb..d6695c8e2 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,11 +1,12 @@
 import random
 import warnings
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Protocol, Union
+from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union
 
 import srsly
 
 from .. import util
+from ..compat import Protocol
 from ..errors import Errors, Warnings
 from ..tokens import Doc, DocBin
 from ..vocab import Vocab

From 086601410f098897c4efef70c1223989b4a31d4c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Jun 2023 13:11:01 +0200
Subject: [PATCH 5/6] Minor reformatting

---
 spacy/tokens/doc.pyi     | 22 +++++-----------------
 spacy/training/corpus.py |  2 +-
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 353983fd0..c21a8aa17 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -1,20 +1,8 @@
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
+from typing import Protocol, Sequence, Tuple, Union, overload
 from pathlib import Path
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Protocol,
-    Sequence,
-    Tuple,
-    Union,
-    overload,
-)
+import numpy
 
-import numpy as np
 from cymem.cymem import Pool
 from thinc.types import Floats1d, Floats2d, Ints2d
 
@@ -40,7 +28,7 @@ class Doc:
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
     user_span_hooks: Dict[str, Callable[..., Any]]
-    tensor: np.ndarray[Any, np.dtype[np.float_]]
+    tensor: numpy.ndarray[Any, numpy.dtype[numpy.float_]]
     user_data: Dict[str, Any]
     has_unknown_spaces: bool
     _context: Any
@@ -164,7 +152,7 @@ class Doc:
     ) -> Doc: ...
     def to_array(
         self, py_attr_ids: Union[int, str, List[Union[int, str]]]
-    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
+    ) -> numpy.ndarray[Any, numpy.dtype[numpy.float_]]: ...
     @staticmethod
     def from_docs(
         docs: List[Doc],
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index d6695c8e2..63b5a361d 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,7 +1,7 @@
+from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union
 import random
 import warnings
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union
 
 import srsly
 

From a93f6202df63d86510c7086ca1b0db25a41fd35f Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Thu, 6 Jul 2023 15:57:42 +0200
Subject: [PATCH 6/6] adding further type specifications and removing internal
 methods

---
 spacy/training/example.pyi | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
index 1413d857d..9cd563465 100644
--- a/spacy/training/example.pyi
+++ b/spacy/training/example.pyi
@@ -1,12 +1,13 @@
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 
 from ..tokens import Doc, Span
+from ..vocab import Vocab
 from .alignment import Alignment
 
 def annotations_to_doc(
-    vocab,
-    tok_annot,
-    doc_annot,
+    vocab: Vocab,
+    tok_annot: Dict[str, Any],
+    doc_annot: Dict[str, Any],
 ) -> Doc: ...
 def validate_examples(
     examples: Iterable[Example],
@@ -39,35 +40,20 @@ class Example:
     def reference(self, doc: Doc) -> None: ...
     def copy(self) -> Example: ...
     @classmethod
-    def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ...
+    def from_dict(cls, predicted: Doc, example_dict: Dict[str, Any]) -> Example: ...
     @property
     def alignment(self) -> Alignment: ...
-    def _get_aligned_vectorized(self, align, gold_values): ...
-    def _get_aligned_non_vectorized(self, align, gold_values): ...
-    def get_aligned(self, field, as_string=False): ...
+    def get_aligned(self, field: str, as_string=False): ...
     def get_aligned_parse(self, projectivize=True): ...
     def get_aligned_sent_starts(self): ...
-    def get_aligned_spans_x2y(self, x_spans, allow_overlap=False): ...
-    def get_aligned_spans_y2x(self, y_spans, allow_overlap=False): ...
-    def _get_aligned_spans(self, doc, spans, align, allow_overlap): ...
-    def get_aligned_ents_and_ner(self): ...
-    def get_aligned_ner(self): ...
+    def get_aligned_spans_x2y(self, x_spans: Sequence[Span], allow_overlap=False) -> List[Span]: ...
+    def get_aligned_spans_y2x(self, y_spans: Sequence[Span], allow_overlap=False) -> List[Span]: ...
+    def get_aligned_ents_and_ner(self) -> Tuple[List[Span], List[str]]: ...
+    def get_aligned_ner(self) -> List[str]: ...
     def get_matching_ents(self, check_label: bool = True) -> List[Span]: ...
     def to_dict(self) -> Dict[str, Any]: ...
-    def _spans_to_dict(self) -> Dict[str, List[Tuple[int, int, str, str]]]: ...
-    def _links_to_dict(self) -> Dict[Tuple[int, int], Dict[str, float]]: ...
     def split_sents(self) -> List[Example]: ...
     @property
     def text(self) -> str: ...
     def __str__(self) -> str: ...
     def __repr__(self) -> str: ...
-
-def _annot2array(vocab, tok_annot, doc_annot): ...
-def _add_spans_to_doc(doc, spans_data): ...
-def _add_entities_to_doc(doc, ner_data): ...
-def _parse_example_dict_data(example_dict): ...
-def _fix_legacy_dict_data(example_dict): ...
-def _has_field(annot, field) -> bool: ...
-def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ...
-def _parse_links(vocab, words, spaces, links): ...
-def _guess_spaces(text, words): ...