From 2d1e61449b20563abc165daa810b947267c1a5de Mon Sep 17 00:00:00 2001 From: Basile Dura Date: Fri, 26 May 2023 17:06:21 +0200 Subject: [PATCH 1/6] feat: add example stubs --- spacy/training/example.pyi | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 spacy/training/example.pyi diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi new file mode 100644 index 000000000..4c2092d18 --- /dev/null +++ b/spacy/training/example.pyi @@ -0,0 +1,61 @@ +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple + +from ..tokens import Doc, Span +from .alignment import Alignment + +def validate_examples(examples: Iterable[Example], method: str) -> None: ... +def validate_get_examples( + get_examples: Callable[[], Iterable[Example]], method: str +): ... + +class Example: + def __init__( + self, + predicted: Doc, + reference: Doc, + *, + alignment: Optional[Alignment] = None, + ): ... + def __len__(self) -> int: ... + @property + def predicted(self) -> Doc: ... + @predicted.setter + def predicted(self, doc: Doc) -> None: ... + @property + def reference(self) -> Doc: ... + @reference.setter + def reference(self, doc: Doc) -> None: ... + def copy(self) -> Example: ... + @classmethod + def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ... + @property + def alignment(self) -> Optional[Alignment]: ... + def _get_aligned_vectorized(self, align, gold_values): ... + def _get_aligned_non_vectorized(self, align, gold_values): ... + def get_aligned(self, field, as_string=False): ... + def get_aligned_parse(self, projectivize=True): ... + def get_aligned_sent_starts(self): ... + def get_aligned_spans_x2y(self, x_spans, allow_overlap=False): ... + def get_aligned_spans_y2x(self, y_spans, allow_overlap=False): ... + def _get_aligned_spans(self, doc, spans, align, allow_overlap): ... + def get_aligned_ents_and_ner(self): ... + def get_aligned_ner(self): ... + def get_matching_ents(self, check_label: bool = True) -> List[Span]: ... + def to_dict(self) -> Dict[str, Any]: ... + def _spans_to_dict(self) -> Dict[str, List[Tuple[int, int, str, str]]]: ... + def _links_to_dict(self) -> Dict[Tuple[int, int], Dict[str, float]]: ... + def split_sents(self) -> List[Example]: ... + @property + def text(self) -> str: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +def _annot2array(vocab, tok_annot, doc_annot): ... +def _add_spans_to_doc(doc, spans_data): ... +def _add_entities_to_doc(doc, ner_data): ... +def _parse_example_dict_data(example_dict): ... +def _fix_legacy_dict_data(example_dict): ... +def _has_field(annot, field) -> bool: ... +def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ... +def _parse_links(vocab, words, spaces, links): ... +def _guess_spaces(text, words): ... From 967ce504fda0d11d11d9520e68906a123818530a Mon Sep 17 00:00:00 2001 From: Basile Dura Date: Wed, 31 May 2023 10:57:00 +0200 Subject: [PATCH 2/6] fix: add required annotations --- spacy/training/example.pyi | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi index 4c2092d18..1413d857d 100644 --- a/spacy/training/example.pyi +++ b/spacy/training/example.pyi @@ -3,12 +3,24 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from ..tokens import Doc, Span from .alignment import Alignment -def validate_examples(examples: Iterable[Example], method: str) -> None: ... +def annotations_to_doc( + vocab, + tok_annot, + doc_annot, +) -> Doc: ... +def validate_examples( + examples: Iterable[Example], + method: str, +) -> None: ... def validate_get_examples( - get_examples: Callable[[], Iterable[Example]], method: str + get_examples: Callable[[], Iterable[Example]], + method: str, ): ... class Example: + x: Doc + y: Doc + def __init__( self, predicted: Doc, @@ -29,7 +41,7 @@ class Example: @classmethod def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ... @property - def alignment(self) -> Optional[Alignment]: ... + def alignment(self) -> Alignment: ... def _get_aligned_vectorized(self, align, gold_values): ... def _get_aligned_non_vectorized(self, align, gold_values): ... def get_aligned(self, field, as_string=False): ... From 9cd17d7962b9600d130b96f686f3c733cfa5f744 Mon Sep 17 00:00:00 2001 From: Basile Dura Date: Wed, 31 May 2023 10:57:11 +0200 Subject: [PATCH 3/6] fix: mypy issues --- spacy/tokens/doc.pyi | 37 ++++++++++++++++++++++++++++--------- spacy/training/corpus.py | 37 +++++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 9d45960ab..353983fd0 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -1,16 +1,30 @@ -from typing import Callable, Protocol, Iterable, Iterator, Optional -from typing import Union, Tuple, List, Dict, Any, overload +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Protocol, + Sequence, + Tuple, + Union, + overload, +) + +import numpy as np from cymem.cymem import Pool from thinc.types import Floats1d, Floats2d, Ints2d -from .span import Span -from .token import Token -from ._dict_proxies import SpanGroups -from ._retokenize import Retokenizer + from ..lexeme import Lexeme from ..vocab import Vocab +from ._dict_proxies import SpanGroups +from ._retokenize import Retokenizer +from .span import Span +from .token import Token from .underscore import Underscore -from pathlib import Path -import numpy as np class DocMethod(Protocol): def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] @@ -119,7 +133,12 @@ class Doc: def text(self) -> str: ... @property def text_with_ws(self) -> str: ... - ents: Tuple[Span] + # Ideally the getter would output Tuple[Span] + # see https://github.com/python/mypy/issues/3004 + @property + def ents(self) -> Sequence[Span]: ... + @ents.setter + def ents(self, value: Sequence[Span]) -> None: ... def set_ents( self, entities: List[Span], diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 086ad831c..f05d09bcb 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -1,16 +1,16 @@ -import warnings -from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable -from typing import Optional -from pathlib import Path import random +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Protocol, Union + import srsly from .. import util +from ..errors import Errors, Warnings +from ..tokens import Doc, DocBin +from ..vocab import Vocab from .augment import dont_augment from .example import Example -from ..errors import Warnings, Errors -from ..tokens import DocBin, Doc -from ..vocab import Vocab if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports @@ -19,6 +19,11 @@ if TYPE_CHECKING: FILE_TYPE = ".spacy" +class ReaderProtocol(Protocol): + def __call__(self, nlp: "Language") -> Iterable[Example]: + pass + + @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( path: Optional[Path], @@ -26,7 +31,7 @@ def create_docbin_reader( max_length: int = 0, limit: int = 0, augmenter: Optional[Callable] = None, -) -> Callable[["Language"], Iterable[Example]]: +) -> ReaderProtocol: if path is None: raise ValueError(Errors.E913) util.logger.debug("Loading corpus from path: %s", path) @@ -45,7 +50,7 @@ def create_jsonl_reader( min_length: int = 0, max_length: int = 0, limit: int = 0, -) -> Callable[["Language"], Iterable[Example]]: +) -> ReaderProtocol: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) @@ -63,7 +68,7 @@ def create_plain_text_reader( path: Optional[Path], min_length: int = 0, max_length: int = 0, -) -> Callable[["Language"], Iterable[Doc]]: +) -> ReaderProtocol: """Iterate Example objects from a file or directory of plain text UTF-8 files with one line per doc. @@ -144,7 +149,7 @@ class Corpus: self.augmenter = augmenter if augmenter is not None else dont_augment self.shuffle = shuffle - def __call__(self, nlp: "Language") -> Iterator[Example]: + def __call__(self, nlp: "Language") -> Iterable[Example]: """Yield examples from the data. nlp (Language): The current nlp object. @@ -182,7 +187,7 @@ class Corpus: def make_examples( self, nlp: "Language", reference_docs: Iterable[Doc] - ) -> Iterator[Example]: + ) -> Iterable[Example]: for reference in reference_docs: if len(reference) == 0: continue @@ -197,7 +202,7 @@ class Corpus: def make_examples_gold_preproc( self, nlp: "Language", reference_docs: Iterable[Doc] - ) -> Iterator[Example]: + ) -> Iterable[Example]: for reference in reference_docs: if reference.has_annotation("SENT_START"): ref_sents = [sent.as_doc() for sent in reference.sents] @@ -210,7 +215,7 @@ class Corpus: def read_docbin( self, vocab: Vocab, locs: Iterable[Union[str, Path]] - ) -> Iterator[Doc]: + ) -> Iterable[Doc]: """Yield training examples as example dicts""" i = 0 for loc in locs: @@ -257,7 +262,7 @@ class JsonlCorpus: self.max_length = max_length self.limit = limit - def __call__(self, nlp: "Language") -> Iterator[Example]: + def __call__(self, nlp: "Language") -> Iterable[Example]: """Yield examples from the data. nlp (Language): The current nlp object. @@ -307,7 +312,7 @@ class PlainTextCorpus: self.min_length = min_length self.max_length = max_length - def __call__(self, nlp: "Language") -> Iterator[Example]: + def __call__(self, nlp: "Language") -> Iterable[Example]: """Yield examples from the data. nlp (Language): The current nlp object. From 792b86d333741a66519fbadbde419830cb447323 Mon Sep 17 00:00:00 2001 From: Basile Dura Date: Wed, 31 May 2023 11:12:13 +0200 Subject: [PATCH 4/6] fix: use Py36-compatible Portocol --- spacy/training/corpus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index f05d09bcb..d6695c8e2 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -1,11 +1,12 @@ import random import warnings from pathlib import Path -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Protocol, Union +from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union import srsly from .. import util +from ..compat import Protocol from ..errors import Errors, Warnings from ..tokens import Doc, DocBin from ..vocab import Vocab From 086601410f098897c4efef70c1223989b4a31d4c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Jun 2023 13:11:01 +0200 Subject: [PATCH 5/6] Minor reformatting --- spacy/tokens/doc.pyi | 22 +++++----------------- spacy/training/corpus.py | 2 +- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 353983fd0..c21a8aa17 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -1,20 +1,8 @@ +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional +from typing import Protocol, Sequence, Tuple, Union, overload from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - List, - Optional, - Protocol, - Sequence, - Tuple, - Union, - overload, -) +import numpy -import numpy as np from cymem.cymem import Pool from thinc.types import Floats1d, Floats2d, Ints2d @@ -40,7 +28,7 @@ class Doc: user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]] - tensor: np.ndarray[Any, np.dtype[np.float_]] + tensor: numpy.ndarray[Any, numpy.dtype[numpy.float_]] user_data: Dict[str, Any] has_unknown_spaces: bool _context: Any @@ -164,7 +152,7 @@ class Doc: ) -> Doc: ... def to_array( self, py_attr_ids: Union[int, str, List[Union[int, str]]] - ) -> np.ndarray[Any, np.dtype[np.float_]]: ... + ) -> numpy.ndarray[Any, numpy.dtype[numpy.float_]]: ... @staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index d6695c8e2..63b5a361d 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -1,7 +1,7 @@ +from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union import random import warnings from pathlib import Path -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union import srsly From a93f6202df63d86510c7086ca1b0db25a41fd35f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Jul 2023 15:57:42 +0200 Subject: [PATCH 6/6] adding further type specifications and removing internal methods --- spacy/training/example.pyi | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi index 1413d857d..9cd563465 100644 --- a/spacy/training/example.pyi +++ b/spacy/training/example.pyi @@ -1,12 +1,13 @@ -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple from ..tokens import Doc, Span +from ..vocab import Vocab from .alignment import Alignment def annotations_to_doc( - vocab, - tok_annot, - doc_annot, + vocab: Vocab, + tok_annot: Dict[str, Any], + doc_annot: Dict[str, Any], ) -> Doc: ... def validate_examples( examples: Iterable[Example], @@ -39,35 +40,20 @@ class Example: def reference(self, doc: Doc) -> None: ... def copy(self) -> Example: ... @classmethod - def from_dict(cls, predicted: Doc, example_dict: Dict) -> Example: ... + def from_dict(cls, predicted: Doc, example_dict: Dict[str, Any]) -> Example: ... @property def alignment(self) -> Alignment: ... - def _get_aligned_vectorized(self, align, gold_values): ... - def _get_aligned_non_vectorized(self, align, gold_values): ... - def get_aligned(self, field, as_string=False): ... + def get_aligned(self, field: str, as_string=False): ... def get_aligned_parse(self, projectivize=True): ... def get_aligned_sent_starts(self): ... - def get_aligned_spans_x2y(self, x_spans, allow_overlap=False): ... - def get_aligned_spans_y2x(self, y_spans, allow_overlap=False): ... - def _get_aligned_spans(self, doc, spans, align, allow_overlap): ... - def get_aligned_ents_and_ner(self): ... - def get_aligned_ner(self): ... + def get_aligned_spans_x2y(self, x_spans: Sequence[Span], allow_overlap=False) -> List[Span]: ... + def get_aligned_spans_y2x(self, y_spans: Sequence[Span], allow_overlap=False) -> List[Span]: ... + def get_aligned_ents_and_ner(self) -> Tuple[List[Span], List[str]]: ... + def get_aligned_ner(self) -> List[str]: ... def get_matching_ents(self, check_label: bool = True) -> List[Span]: ... def to_dict(self) -> Dict[str, Any]: ... - def _spans_to_dict(self) -> Dict[str, List[Tuple[int, int, str, str]]]: ... - def _links_to_dict(self) -> Dict[Tuple[int, int], Dict[str, float]]: ... def split_sents(self) -> List[Example]: ... @property def text(self) -> str: ... def __str__(self) -> str: ... def __repr__(self) -> str: ... - -def _annot2array(vocab, tok_annot, doc_annot): ... -def _add_spans_to_doc(doc, spans_data): ... -def _add_entities_to_doc(doc, ner_data): ... -def _parse_example_dict_data(example_dict): ... -def _fix_legacy_dict_data(example_dict): ... -def _has_field(annot, field) -> bool: ... -def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ... -def _parse_links(vocab, words, spaces, links): ... -def _guess_spaces(text, words): ...