other.mem.alloc(buff_size, sizeof(TokenC))
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
other.c = &tokens[PADDING]
+ # copy spans after setting tokens so that SpanGroup.copy can verify
+ # that the start/end offsets are valid
+ other.spans = self.spans.copy(doc=other)
return other
def to_disk(self, path, *, exclude=tuple()):
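Illustration, not part of the patch: a minimal sketch of what the doc.pyx change above enables, assuming a blank English pipeline. Because the span groups are copied after the token buffer is set, their offsets can be validated and they survive `Doc.copy()`.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Berlin is a city in Germany")
# Store a custom span group on the doc
doc.spans["places"] = [doc[0:1], doc[5:6]]

copied = doc.copy()
# The copy carries the span group over, rebuilt against the new token buffer
assert [span.text for span in copied.spans["places"]] == ["Berlin", "Germany"]
```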
diff --git a/spacy/tokens/graph.pxd b/spacy/tokens/graph.pxd
index 6f2f80656..083ef6522 100644
--- a/spacy/tokens/graph.pxd
+++ b/spacy/tokens/graph.pxd
@@ -1,7 +1,8 @@
-from libcpp.vector cimport vector
from cymem.cymem cimport Pool
+from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
-from ..structs cimport GraphC, EdgeC
+
+from ..structs cimport EdgeC, GraphC
cdef class Graph:
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index adc4d23c8..47f0a20d4 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -1,19 +1,26 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
-from typing import List, Tuple, Generator
+from typing import Generator, List, Tuple
+
+cimport cython
+from cython.operator cimport dereference
from libc.stdint cimport int32_t, int64_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
-from cython.operator cimport dereference
-cimport cython
+
import weakref
-from preshed.maps cimport map_get_unless_missing
+
from murmurhash.mrmr cimport hash64
+from preshed.maps cimport map_get_unless_missing
from .. import Errors
+
from ..typedefs cimport hash_t
+
from ..strings import get_string_id
+
from ..structs cimport EdgeC, GraphC
+
from .token import Token
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 9510875c9..728f0aaf7 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,6 +1,6 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
from ..structs cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
cdef class MorphAnalysis:
diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi
index a5376e80d..b35ff36aa 100644
--- a/spacy/tokens/morphanalysis.pyi
+++ b/spacy/tokens/morphanalysis.pyi
@@ -1,4 +1,5 @@
from typing import Any, Dict, Iterator, List, Optional, Union
+
from ..vocab import Vocab
class MorphAnalysis:
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index baa3800a1..0992a0b66 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,11 +1,12 @@
-from libc.string cimport memset
cimport numpy as np
+from libc.string cimport memset
from ..errors import Errors
from ..morphology import Morphology
+
+from ..morphology cimport check_feature, get_by_field, list_features
+from ..typedefs cimport attr_t, hash_t
from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field
cdef class MorphAnalysis:
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 78bee0a8c..d77bbea70 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,8 +1,8 @@
cimport numpy as np
-from .doc cimport Doc
-from ..typedefs cimport attr_t
from ..structs cimport SpanC
+from ..typedefs cimport attr_t
+from .doc cimport Doc
cdef class Span:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index a92f19e20..b982eb810 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -1,10 +1,12 @@
-from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload
-from thinc.types import Floats1d, Ints2d, FloatsXd
+from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union, overload
+
+from thinc.types import Floats1d, FloatsXd, Ints2d
+
+from ..lexeme import Lexeme
+from ..vocab import Vocab
from .doc import Doc
from .token import Token
from .underscore import Underscore
-from ..lexeme import Lexeme
-from ..vocab import Vocab
class SpanMethod(Protocol):
def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@@ -51,7 +53,12 @@ class Span:
kb_id: Union[str, int] = ...,
span_id: Union[str, int] = ...,
) -> None: ...
- def __richcmp__(self, other: Span, op: int) -> bool: ...
+ def __lt__(self, other: Any) -> bool: ...
+ def __le__(self, other: Any) -> bool: ...
+ def __eq__(self, other: Any) -> bool: ...
+ def __ne__(self, other: Any) -> bool: ...
+ def __gt__(self, other: Any) -> bool: ...
+ def __ge__(self, other: Any) -> bool: ...
def __hash__(self) -> int: ...
def __len__(self) -> int: ...
def __repr__(self) -> str: ...
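Illustration, not part of the patch: why the stub change above matters. Cython's `__richcmp__` is not a name static type checkers understand, so declaring each comparison dunder individually lets tools like mypy check ordinary comparison expressions. A small hedged example, assuming two spans with identical offsets over the same doc compare equal:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is a city")
span_a = doc[0:2]
span_b = doc[0:2]

# With __eq__/__ne__/... declared in the stub, these operators type-check
assert span_a == span_b
assert not (span_a != span_b)
```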
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 29b8ce703..73192b760 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,22 +1,23 @@
cimport numpy as np
from libc.math cimport sqrt
+import copy
+import warnings
+
import numpy
from thinc.api import get_array_module
-import warnings
-import copy
-from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
-from ..structs cimport TokenC, LexemeC
-from ..typedefs cimport flags_t, attr_t, hash_t
-from ..attrs cimport attr_id_t
-from ..parts_of_speech cimport univ_pos_t
from ..attrs cimport *
+from ..attrs cimport attr_id_t
from ..lexeme cimport Lexeme
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport LexemeC, TokenC
from ..symbols cimport dep
+from ..typedefs cimport attr_t, flags_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
-from ..util import normalize_slice
from ..errors import Errors, Warnings
+from ..util import normalize_slice
from .underscore import Underscore, get_ext_args
diff --git a/spacy/tokens/span_group.pxd b/spacy/tokens/span_group.pxd
index 5074aa275..7f4145682 100644
--- a/spacy/tokens/span_group.pxd
+++ b/spacy/tokens/span_group.pxd
@@ -1,6 +1,8 @@
from libcpp.vector cimport vector
+
from ..structs cimport SpanC
+
cdef class SpanGroup:
cdef public object _doc_ref
cdef public str name
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
index 0b4aa83aa..d063bb595 100644
--- a/spacy/tokens/span_group.pyi
+++ b/spacy/tokens/span_group.pyi
@@ -1,4 +1,5 @@
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Iterator, Optional
+
from .doc import Doc
from .span import Span
@@ -18,7 +19,7 @@ class SpanGroup:
def doc(self) -> Doc: ...
@property
def has_overlap(self) -> bool: ...
- def __iter__(self): ...
+ def __iter__(self) -> Iterator[Span]: ...
def __len__(self) -> int: ...
def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 608dda283..48ad4a516 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,10 +1,12 @@
-from typing import Iterable, Tuple, Union, Optional, TYPE_CHECKING
-import weakref
import struct
+import weakref
from copy import deepcopy
+from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
+
import srsly
from spacy.errors import Errors
+
from .span cimport Span
@@ -52,6 +54,8 @@ cdef class SpanGroup:
if len(spans) :
self.c.reserve(len(spans))
for span in spans:
+ if doc is not span.doc:
+ raise ValueError(Errors.E855.format(obj="span"))
self.push_back(span.c)
def __repr__(self):
@@ -261,11 +265,22 @@ cdef class SpanGroup:
"""
if doc is None:
doc = self.doc
+ if doc is self.doc:
+ spans = list(self)
+ else:
+ spans = [doc.char_span(span.start_char, span.end_char, label=span.label_, kb_id=span.kb_id, span_id=span.id) for span in self]
+ for i, span in enumerate(spans):
+ if span is None:
+ raise ValueError(Errors.E1052.format(i=i))
+ if span.kb_id in self.doc.vocab.strings:
+ doc.vocab.strings.add(span.kb_id_)
+ if span.id in span.doc.vocab.strings:
+ doc.vocab.strings.add(span.id_)
return SpanGroup(
doc,
name=self.name,
attrs=deepcopy(self.attrs),
- spans=list(self),
+ spans=spans,
)
def _concat(
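Illustration, not part of the patch: a sketch of the behavior added in the span_group.pyx hunks above, assuming two docs with identical text. `SpanGroup.copy(doc=...)` rebuilds each span on the target doc from character offsets, and constructing a group from spans that belong to a different doc now raises a `ValueError`.

```python
import spacy
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
doc1 = nlp("Paris is nice")
doc2 = nlp("Paris is nice")

group = SpanGroup(doc1, name="cities", spans=[doc1[0:1]])
# Copy onto a different doc: spans are re-created from char offsets on doc2
copied = group.copy(doc=doc2)
assert copied[0].text == "Paris" and copied[0].doc is doc2

# Mixing docs at construction time is now rejected
try:
    SpanGroup(doc1, spans=[doc2[0:1]])
except ValueError:
    print("spans must belong to the doc passed to SpanGroup")
```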
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 58b727764..fc02ff624 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -1,14 +1,16 @@
from numpy cimport ndarray
-from ..vocab cimport Vocab
-from ..structs cimport TokenC
+
from ..attrs cimport *
-from ..typedefs cimport attr_t, flags_t
-from ..parts_of_speech cimport univ_pos_t
-from .doc cimport Doc
from ..lexeme cimport Lexeme
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport TokenC
+from ..typedefs cimport attr_t, flags_t
+from ..vocab cimport Vocab
+from .doc cimport Doc
from ..errors import Errors
+
cdef int MISSING_DEP = 0
cdef class Token:
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
index bd585d034..e7863fd16 100644
--- a/spacy/tokens/token.pyi
+++ b/spacy/tokens/token.pyi
@@ -1,18 +1,12 @@
-from typing import (
- Callable,
- Protocol,
- Iterator,
- Optional,
- Union,
- Tuple,
- Any,
-)
+from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union
+
from thinc.types import Floats1d, FloatsXd
-from .doc import Doc
-from .span import Span
-from .morphanalysis import MorphAnalysis
+
from ..lexeme import Lexeme
from ..vocab import Vocab
+from .doc import Doc
+from .morphanalysis import MorphAnalysis
+from .span import Span
from .underscore import Underscore
class TokenMethod(Protocol):
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 7fff6b162..8c384f417 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -1,26 +1,43 @@
# cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug.
-from cython.view cimport array as cvarray
cimport numpy as np
+from cython.view cimport array as cvarray
+
np.import_array()
+import warnings
+
import numpy
from thinc.api import get_array_module
-import warnings
-from ..typedefs cimport hash_t
+from ..attrs cimport (
+ IS_ALPHA,
+ IS_ASCII,
+ IS_BRACKET,
+ IS_CURRENCY,
+ IS_DIGIT,
+ IS_LEFT_PUNCT,
+ IS_LOWER,
+ IS_PUNCT,
+ IS_QUOTE,
+ IS_RIGHT_PUNCT,
+ IS_SPACE,
+ IS_STOP,
+ IS_TITLE,
+ IS_UPPER,
+ LIKE_EMAIL,
+ LIKE_NUM,
+ LIKE_URL,
+)
from ..lexeme cimport Lexeme
-from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
-from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..symbols cimport conj
-from .morphanalysis cimport MorphAnalysis
+from ..typedefs cimport hash_t
from .doc cimport set_children_from_heads
+from .morphanalysis cimport MorphAnalysis
from .. import parts_of_speech
-from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
+from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index e9a4e1862..0aa0c1e6d 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -1,6 +1,7 @@
-from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
-import functools
import copy
+import functools
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
from ..errors import Errors
if TYPE_CHECKING:
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index a6f873f05..b8c0792f0 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,12 +1,18 @@
-from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
-from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
-from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
-from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
-from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
-from .iob_utils import split_bilu_label, remove_bilu_prefix # noqa: F401
-from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
-from .loggers import console_logger # noqa: F401
from .callbacks import create_copy_from_base_model # noqa: F401
+from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
+from .example import Example, validate_examples, validate_get_examples # noqa: F401
+from .gold_io import docs_to_json, read_json_file # noqa: F401
+from .iob_utils import ( # noqa: F401
+ biluo_tags_to_offsets,
+ biluo_tags_to_spans,
+ biluo_to_iob,
+ iob_to_biluo,
+ offsets_to_biluo_tags,
+ remove_bilu_prefix,
+ split_bilu_label,
+ tags_to_entities,
+)
+from .loggers import console_logger # noqa: F401
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
index 0ef1fd35d..8bd43b048 100644
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@@ -1,6 +1,6 @@
-from typing import List, Tuple
-from itertools import chain
import re
+from itertools import chain
+from typing import List, Tuple
from ..errors import Errors
diff --git a/spacy/training/alignment.py b/spacy/training/alignment.py
index 6d24714bf..3f615d10b 100644
--- a/spacy/training/alignment.py
+++ b/spacy/training/alignment.py
@@ -1,5 +1,5 @@
-from typing import List
from dataclasses import dataclass
+from typing import List
from .align import get_alignments
from .alignment_array import AlignmentArray
diff --git a/spacy/training/alignment_array.pxd b/spacy/training/alignment_array.pxd
index 056f5bef3..bb28f3ac6 100644
--- a/spacy/training/alignment_array.pxd
+++ b/spacy/training/alignment_array.pxd
@@ -1,5 +1,6 @@
-from libcpp.vector cimport vector
cimport numpy as np
+from libcpp.vector cimport vector
+
cdef class AlignmentArray:
cdef np.ndarray _data
diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx
index 01e9d9bf8..b0be1512b 100644
--- a/spacy/training/alignment_array.pyx
+++ b/spacy/training/alignment_array.pyx
@@ -1,6 +1,9 @@
from typing import List
-from ..errors import Errors
+
import numpy
+
+from ..errors import Errors
+
from libc.stdint cimport int32_t
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 2fe8c24fb..1ebd3313c 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,12 +1,11 @@
-from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
-from typing import Optional
-import random
import itertools
+import random
from functools import partial
+from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, Optional, Tuple
from ..util import registry
from .example import Example
-from .iob_utils import split_bilu_label, _doc_to_biluo_tags_with_partial
+from .iob_utils import _doc_to_biluo_tags_with_partial, split_bilu_label
if TYPE_CHECKING:
from ..language import Language # noqa: F401
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index f0b6c3123..050c3351b 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,10 +1,18 @@
-from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator
-from typing import Optional, Any
-from functools import partial
import itertools
+from functools import partial
+from typing import (
+ Any,
+ Callable,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Sequence,
+ TypeVar,
+ Union,
+)
-from ..util import registry, minibatch
-
+from ..util import minibatch, registry
Sizing = Union[Sequence[int], int]
ItemT = TypeVar("ItemT")
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 7e2494f5b..21c3d56a1 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,14 +1,17 @@
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional
+
from ..errors import Errors
-from ..language import Language
-from ..util import load_model, registry, logger
+from ..util import load_model, logger, registry
+
+if TYPE_CHECKING:
+ from ..language import Language
@registry.callbacks("spacy.copy_from_base_model.v1")
def create_copy_from_base_model(
tokenizer: Optional[str] = None,
vocab: Optional[str] = None,
-) -> Callable[[Language], Language]:
+) -> Callable[["Language"], "Language"]:
def copy_from_base_model(nlp):
if tokenizer:
logger.info("Copying tokenizer from: %s", tokenizer)
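Illustration, not part of the patch: the general pattern the callbacks.py hunk applies. Importing `Language` only under `TYPE_CHECKING` avoids a circular import at runtime, and the annotations become strings; a minimal sketch with a hypothetical factory name:

```python
from typing import TYPE_CHECKING, Callable

if TYPE_CHECKING:
    # Only evaluated by static type checkers, never at runtime
    from spacy.language import Language


def make_noop_callback() -> Callable[["Language"], "Language"]:
    def noop(nlp: "Language") -> "Language":
        return nlp

    return noop
```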
diff --git a/spacy/training/converters/__init__.py b/spacy/training/converters/__init__.py
index e91b6aaa6..8173da64c 100644
--- a/spacy/training/converters/__init__.py
+++ b/spacy/training/converters/__init__.py
@@ -1,4 +1,4 @@
-from .iob_to_docs import iob_to_docs # noqa: F401
from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401
-from .json_to_docs import json_to_docs # noqa: F401
from .conllu_to_docs import conllu_to_docs # noqa: F401
+from .iob_to_docs import iob_to_docs # noqa: F401
+from .json_to_docs import json_to_docs # noqa: F401
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 28b21c5f0..b19d1791b 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -1,10 +1,10 @@
from wasabi import Printer
-from .. import tags_to_entities
-from ...training import iob_to_biluo
-from ...tokens import Doc, Span
from ...errors import Errors
-from ...util import load_model, get_lang_class
+from ...tokens import Doc, Span
+from ...training import iob_to_biluo
+from ...util import get_lang_class, load_model
+from .. import tags_to_entities
def conll_ner_to_docs(
diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py
index 7052504cc..bda5c88c3 100644
--- a/spacy/training/converters/conllu_to_docs.py
+++ b/spacy/training/converters/conllu_to_docs.py
@@ -1,11 +1,12 @@
import re
-from .conll_ner_to_docs import n_sents_info
-from ...training import iob_to_biluo, biluo_tags_to_spans
-from ...tokens import Doc, Token, Span
-from ...vocab import Vocab
from wasabi import Printer
+from ...tokens import Doc, Span, Token
+from ...training import biluo_tags_to_spans, iob_to_biluo
+from ...vocab import Vocab
+from .conll_ner_to_docs import n_sents_info
+
def conllu_to_docs(
input_data,
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index 60fb7df61..45bb65692 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -1,11 +1,11 @@
from wasabi import Printer
-from .conll_ner_to_docs import n_sents_info
-from ...vocab import Vocab
-from ...training import iob_to_biluo, tags_to_entities
-from ...tokens import Doc, Span
from ...errors import Errors
+from ...tokens import Doc, Span
+from ...training import iob_to_biluo, tags_to_entities
from ...util import minibatch
+from ...vocab import Vocab
+from .conll_ner_to_docs import n_sents_info
def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index 4123839f2..b4beedd2f 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,9 +1,13 @@
import srsly
-from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations_to_doc
-from ..example import _fix_legacy_dict_data, _parse_example_dict_data
-from ...util import load_model
+
from ...lang.xx import MultiLanguage
+from ...util import load_model
+from ..example import (
+ _fix_legacy_dict_data,
+ _parse_example_dict_data,
+ annotations_to_doc,
+)
+from ..gold_io import json_iterate, json_to_annotations
def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 086ad831c..6037c15e3 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,16 +1,16 @@
-import warnings
-from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
-from typing import Optional
-from pathlib import Path
import random
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable, Iterable, Iterator, List, Optional, Union
+
import srsly
from .. import util
+from ..errors import Errors, Warnings
+from ..tokens import Doc, DocBin
+from ..vocab import Vocab
from .augment import dont_augment
from .example import Example
-from ..errors import Warnings, Errors
-from ..tokens import DocBin, Doc
-from ..vocab import Vocab
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd
index 49e239757..a7c71fa88 100644
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,6 +1,7 @@
-from ..tokens.doc cimport Doc
from libc.stdint cimport uint64_t
+from ..tokens.doc cimport Doc
+
cdef class Example:
cdef readonly Doc x
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 95b0f0de9..abdac23ea 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,19 +1,29 @@
-from collections.abc import Iterable as IterableInstance
import warnings
+from collections.abc import Iterable as IterableInstance
+
import numpy
+
from murmurhash.mrmr cimport hash64
from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
-from ..tokens.span import Span
+
from ..attrs import IDS
-from .alignment import Alignment
-from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
-from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj
+from ..tokens.span import Span
+from .alignment import Alignment
+from .iob_utils import (
+ biluo_tags_to_spans,
+ biluo_to_iob,
+ doc_to_biluo_tags,
+ offsets_to_biluo_tags,
+ remove_bilu_prefix,
+)
+
from ..tokens.token cimport MISSING_DEP
-from ..util import logger, to_ternary_int, all_equal
+
+from ..util import all_equal, logger, to_ternary_int
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 69654e2c7..1e7b3681d 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -1,10 +1,12 @@
+import json
import warnings
+
import srsly
+
from .. import util
from ..errors import Warnings
from ..tokens import Doc
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
-import json
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 9cf759c55..3a46b6632 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,24 +1,33 @@
-from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-from thinc.api import ConfigValidationError
-from pathlib import Path
-import srsly
-import numpy
-import tarfile
import gzip
-import zipfile
-import tqdm
-from itertools import islice
+import tarfile
import warnings
+import zipfile
+from itertools import islice
+from pathlib import Path
+from typing import IO, TYPE_CHECKING, Any, Dict, Optional, Union
+
+import numpy
+import srsly
+import tqdm
+from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
-from .pretrain import get_tok2vec_ref
-from ..lookups import Lookups
-from ..vectors import Vectors, Mode as VectorsMode
from ..errors import Errors, Warnings
+from ..lookups import Lookups
from ..schemas import ConfigSchemaTraining
-from ..util import registry, load_model_from_config, resolve_dot_names, logger
-from ..util import load_model, ensure_path, get_sourced_components
-from ..util import OOV_RANK, DEFAULT_OOV_PROB
+from ..util import (
+ DEFAULT_OOV_PROB,
+ OOV_RANK,
+ ensure_path,
+ get_sourced_components,
+ load_model,
+ load_model_from_config,
+ logger,
+ registry,
+ resolve_dot_names,
+)
+from ..vectors import Mode as VectorsMode
+from ..vectors import Vectors
+from .pretrain import get_tok2vec_ref
if TYPE_CHECKING:
from ..language import Language # noqa: F401
@@ -67,7 +76,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
with nlp.select_pipes(enable=resume_components):
logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
- # Make sure that listeners are defined before initializing further
+ # Make sure that internal component names are synced and listeners are
+ # defined before initializing further
nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1:
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 0d4d246b0..64d02a1e2 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -1,8 +1,8 @@
-from typing import List, Dict, Tuple, Iterable, Union, Iterator, cast
import warnings
+from typing import Dict, Iterable, Iterator, List, Tuple, Union, cast
from ..errors import Errors, Warnings
-from ..tokens import Span, Doc
+from ..tokens import Doc, Span
def iob_to_biluo(tags: Iterable[str]) -> List[str]:
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 7de31822e..1ec0b7b25 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -1,13 +1,14 @@
-from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union
-from wasabi import Printer
-from pathlib import Path
-import tqdm
import sys
-import srsly
+from pathlib import Path
+from typing import IO, TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+import srsly
+import tqdm
+from wasabi import Printer
-from ..util import registry
-from ..errors import Errors
from .. import util
+from ..errors import Errors
+from ..util import registry
if TYPE_CHECKING:
from ..language import Language # noqa: F401
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index eca40e3d9..56df53957 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,17 +1,28 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Union, Any, IO
-from typing import Optional, TYPE_CHECKING
+import random
+import shutil
+import sys
from pathlib import Path
from timeit import default_timer as timer
-from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
-from wasabi import Printer
-import random
-import sys
-import shutil
+from typing import (
+ IO,
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+)
+
+from thinc.api import Config, Optimizer, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
-from .example import Example
-from ..schemas import ConfigSchemaTraining
from ..errors import Errors
-from ..util import resolve_dot_names, registry, logger
+from ..schemas import ConfigSchemaTraining
+from ..util import logger, registry, resolve_dot_names
+from .example import Example
if TYPE_CHECKING:
from ..language import Language # noqa: F401
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index ebbc5d837..14a813a09 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -1,20 +1,26 @@
-from typing import Optional, Callable, Iterable, Union, List
-from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate
-from pathlib import Path
-from collections import Counter
-import srsly
-import time
import re
+import time
+from collections import Counter
+from pathlib import Path
+from typing import Callable, Iterable, List, Optional, Union
+import srsly
+from thinc.api import (
+ Config,
+ Model,
+ Optimizer,
+ fix_random_seed,
+ set_dropout_rate,
+ set_gpu_allocator,
+)
from thinc.config import ConfigValidationError
from wasabi import Printer
-from .example import Example
from ..errors import Errors
-from ..tokens import Doc
from ..schemas import ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, dot_to_object
+from ..tokens import Doc
+from ..util import dot_to_object, load_model_from_config, registry
+from .example import Example
def pretrain(
diff --git a/spacy/ty.py b/spacy/ty.py
index 8f2903d78..f389456c0 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,10 +1,20 @@
-from typing import TYPE_CHECKING
-from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+)
+
+from thinc.api import Model, Optimizer
+
from .compat import Protocol, runtime_checkable
-from thinc.api import Optimizer, Model
-
if TYPE_CHECKING:
+ from .language import Language
from .training import Example
@@ -32,7 +42,7 @@ class InitializableComponent(Protocol):
def initialize(
self,
get_examples: Callable[[], Iterable["Example"]],
- nlp: Iterable["Example"],
+ nlp: "Language",
**kwargs: Any
):
...
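Illustration, not part of the patch: a hypothetical component satisfying the corrected `InitializableComponent` protocol; the `nlp` argument is the pipeline (`Language`), not another iterable of examples.

```python
from typing import Any, Callable, Iterable

from spacy.language import Language
from spacy.training import Example


class MyComponent:  # hypothetical, for illustration only
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        nlp: Language,  # the pipeline object, per the corrected protocol
        **kwargs: Any,
    ) -> None:
        self.examples = list(get_examples())
        self.nlp = nlp
```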
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 8cdc70e42..72d4d99ac 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,6 +1,4 @@
-from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
-from libc.stdint cimport uint8_t
-
+from libc.stdint cimport int32_t, uint8_t, uint16_t, uint32_t, uint64_t, uintptr_t
ctypedef float weight_t
ctypedef uint64_t hash_t
diff --git a/spacy/util.py b/spacy/util.py
index 9f1b886bf..762699a97 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,38 +1,62 @@
-from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast
-from typing import Optional, Iterable, Callable, Tuple, Type
-from typing import Iterator, Pattern, Generator, TYPE_CHECKING
-from types import ModuleType
-import os
+import functools
import importlib
import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model
-import functools
-import itertools
-import numpy
-import srsly
-import catalogue
-from catalogue import RegistryError, Registry
-import langcodes
-import sys
-import warnings
-from packaging.specifiers import SpecifierSet, InvalidSpecifier
-from packaging.version import Version, InvalidVersion
-from packaging.requirements import Requirement
-import subprocess
-from contextlib import contextmanager
-from collections import defaultdict
-import tempfile
-import shutil
-import shlex
import inspect
-import pkgutil
+import itertools
import logging
+import os
+import pkgutil
+import re
+import shlex
+import shutil
import socket
import stat
+import subprocess
+import sys
+import tempfile
+import warnings
+from collections import defaultdict
+from contextlib import contextmanager
+from pathlib import Path
+from types import ModuleType
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ Iterator,
+ List,
+ Mapping,
+ NoReturn,
+ Optional,
+ Pattern,
+ Set,
+ Tuple,
+ Type,
+ Union,
+ cast,
+)
+
+import catalogue
+import langcodes
+import numpy
+import srsly
+import thinc
+from catalogue import Registry, RegistryError
+from packaging.requirements import Requirement
+from packaging.specifiers import InvalidSpecifier, SpecifierSet
+from packaging.version import InvalidVersion, Version
+from thinc.api import (
+ Adam,
+ Config,
+ ConfigValidationError,
+ Model,
+ NumpyOps,
+ Optimizer,
+ get_current_ops,
+)
try:
import cupy.random
@@ -43,13 +67,12 @@ except ImportError:
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
-from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
+from thinc.api import compounding, decaying, fix_random_seed # noqa: F401
-
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about
+from .compat import CudaStream, cupy, importlib_metadata, is_windows
+from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
+from .symbols import ORTH
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index be0f6db09..bc654252a 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,14 +1,15 @@
cimport numpy as np
-from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
+from libc.stdint cimport uint32_t, uint64_t
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
import functools
-import numpy
-from typing import cast
import warnings
from enum import Enum
+from typing import cast
+
+import numpy
import srsly
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
@@ -16,9 +17,9 @@ from thinc.types import Floats2d
from .strings cimport StringStore
-from .strings import get_string_id
-from .errors import Errors, Warnings
from . import util
+from .errors import Errors, Warnings
+from .strings import get_string_id
def unpickle_vectors(bytes_data):
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 9c951b2b7..3b0173e3e 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -1,12 +1,12 @@
-from libcpp.vector cimport vector
-from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
+from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
+from preshed.maps cimport PreshMap
+from .morphology cimport Morphology
+from .strings cimport StringStore
from .structs cimport LexemeC, TokenC
from .typedefs cimport attr_t, hash_t
-from .strings cimport StringStore
-from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 4cc359c47..b7ff20348 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -1,14 +1,15 @@
-from typing import Callable, Iterator, Optional, Union, List, Dict
-from typing import Any, Iterable
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
+
from thinc.types import Floats1d, FloatsXd
+
from . import Language
-from .strings import StringStore
from .lexeme import Lexeme
from .lookups import Lookups
from .morphology import Morphology
+from .strings import StringStore
from .tokens import Doc, Span
from .vectors import Vectors
-from pathlib import Path
def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 27f8e5f98..d47122d08 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,26 +1,27 @@
# cython: profile=True
from libc.string cimport memcpy
+import functools
+
import numpy
import srsly
from thinc.api import get_array_module, get_current_ops
-import functools
-from .lexeme cimport EMPTY_LEXEME, OOV_RANK
-from .lexeme cimport Lexeme
-from .typedefs cimport attr_t
-from .tokens.token cimport Token
from .attrs cimport LANG, ORTH
+from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
+from .tokens.token cimport Token
+from .typedefs cimport attr_t
+from . import util
+from .attrs import IS_STOP, NORM, intify_attrs
from .compat import copy_reg
from .errors import Errors
-from .attrs import intify_attrs, NORM, IS_STOP
-from .vectors import Vectors, Mode as VectorsMode
-from .util import registry
-from .lookups import Lookups
-from . import util
+from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
from .lang.norm_exceptions import BASE_NORMS
-from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
+from .lookups import Lookups
+from .util import registry
+from .vectors import Mode as VectorsMode
+from .vectors import Vectors
def create_vocab(lang, defaults, vectors_name=None):
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 323ea2a92..5b4bca1ce 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
```
-| Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
-| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
-| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
-| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
-| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
-| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
-| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | Training results and optional metrics and visualizations. |
+| Name | Description |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
+| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
+| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
+| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | Training results and optional metrics and visualizations. |
### speed {id="benchmark-speed", version="3.5", tag="command"}
@@ -1220,7 +1221,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
-| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `output-file` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
@@ -1640,7 +1641,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
see the spaCy project [integration](/usage/projects#huggingface_hub).
```bash
-$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
+$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
```
> #### Example
@@ -1654,6 +1655,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
-| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
| **UPLOADS** | The pipeline to the hub. |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index d84dd3ca9..21d2e9015 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -64,7 +64,7 @@ architectures and their arguments and hyperparameters.
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..de23156b9 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
> print(scores)
> ```
-| Name | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
-| _keyword-only_ | |
-| `batch_size` | The batch size to use. ~~Optional[int]~~ |
-| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
-| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `batch_size` | The batch size to use. ~~Optional[int]~~ |
+| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
+| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Language.use_params {id="use_params",tag="contextmanager, method"}
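Illustration, not part of the patch: a short hedged sketch of the new `per_component` option, assuming the `en_core_web_sm` pipeline is installed; with `per_component=True` the returned scores are keyed by component name.

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
example = Example.from_dict(
    nlp.make_doc("Apple is a company"), {"entities": [(0, 5, "ORG")]}
)

scores = nlp.evaluate([example], per_component=True)
# Keys are component names, e.g. "ner", each mapping to that component's scores
print(sorted(scores))
```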
diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx
index 5d4affafe..018ce2524 100644
--- a/website/docs/api/morphology.mdx
+++ b/website/docs/api/morphology.mdx
@@ -213,11 +213,11 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
-| Name | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
-| `field` | The field to retrieve. ~~str~~ |
-| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
-| **RETURNS** | A list of the individual features. ~~List[str]~~ |
+| Name | Description |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `field` | The field to retrieve. ~~str~~ |
+| `default` 3.5.3 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
+| **RETURNS** | A list of the individual features. ~~List[str]~~ |
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
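Illustration, not part of the patch: a sketch of the `default` argument documented in the table above, assuming a blank pipeline where no morphological features have been set on the token.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
morph = doc[0].morph

# No morphologizer has run, so the field is absent
assert morph.get("Number") == []
assert morph.get("Number", default=["Sing"]) == ["Sing"]
```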
diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx
index 6f0c95f6f..9bdd0a8f4 100644
--- a/website/docs/api/scorer.mdx
+++ b/website/docs/api/scorer.mdx
@@ -33,7 +33,7 @@ Create a new `Scorer`.
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | |
-| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
+| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
## Scorer.score {id="score",tag="method"}
@@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or
> scores = scorer.score(examples)
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------------------- |
-| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name | Description |
+| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `per_component` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index f54a8687b..81a473ac2 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -105,7 +105,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_spancat"}}
-> parser = nlp.add_pipe("spancat", config=config)
+> spancat = nlp.add_pipe("spancat", config=config)
>
> # Construction from class
> from spacy.pipeline import SpanCategorizer
@@ -524,3 +524,22 @@ has two columns, indicating the start and end position.
| `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ |
| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ |
| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+
+### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}
+
+> #### Example Config
+>
+> ```ini
+> [components.spancat.suggester]
+> @misc = "spacy.preset_spans_suggester.v1"
+> spans_key = "my_spans"
+> ```
+
+Suggest all spans that are already stored in `doc.spans[spans_key]`. This is
+useful when an upstream component such as a [`SpanRuler`](/api/spanruler) or
+[`SpanFinder`](/api/spanfinder) is used to set the spans on the `Doc`.
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------- |
+| `spans_key` | Key of [`Doc.spans`](/api/doc/#spans) that provides spans to suggest. ~~str~~ |
+| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
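Illustration, not part of the patch: wiring the suggester to an upstream component from Python rather than a config file; a hedged sketch assuming the key `my_spans` is shared by both components, so `spancat` labels exactly the spans that `span_finder` proposes.

```python
import spacy

nlp = spacy.blank("en")
# The finder writes unlabeled candidate spans to doc.spans["my_spans"]
nlp.add_pipe("span_finder", config={"spans_key": "my_spans"})
# The categorizer reuses those spans as its suggestions
nlp.add_pipe(
    "spancat",
    config={
        "spans_key": "my_spans",
        "suggester": {
            "@misc": "spacy.preset_spans_suggester.v1",
            "spans_key": "my_spans",
        },
    },
)
```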
diff --git a/website/docs/api/spanfinder.mdx b/website/docs/api/spanfinder.mdx
new file mode 100644
index 000000000..ca3104c85
--- /dev/null
+++ b/website/docs/api/spanfinder.mdx
@@ -0,0 +1,372 @@
+---
+title: SpanFinder
+tag: class,experimental
+source: spacy/pipeline/span_finder.py
+version: 3.6
+teaser:
+ 'Pipeline component for identifying potentially overlapping spans of text'
+api_base_class: /api/pipe
+api_string_name: span_finder
+api_trainable: true
+---
+
+The span finder identifies potentially overlapping, unlabeled spans. It
+identifies tokens that start or end spans and annotates unlabeled spans between
+starts and ends, with optional filters for min and max span length. It is
+intended for use in combination with a component like
+[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
+spans. Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the
+doc under `doc.spans[spans_key]`, where `spans_key` is a component config
+setting.
+
+## Assigned Attributes {id="assigned-attributes"}
+
+Predictions will be saved to `Doc.spans[spans_key]` as a
+[`SpanGroup`](/api/spangroup).
+
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The
+`span_finder` component will overwrite any existing spans under the spans key
+`doc.spans[spans_key]`.
+
+| Location | Value |
+| ---------------------- | ---------------------------------- |
+| `Doc.spans[spans_key]` | The unlabeled spans. ~~SpanGroup~~ |
+
+## Config and implementation {id="config"}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures) documentation for details on the
+architectures and their arguments and hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
+> config = {
+> "threshold": 0.5,
+> "spans_key": "my_spans",
+> "max_length": None,
+> "min_length": None,
+> "model": DEFAULT_SPAN_FINDER_MODEL,
+> }
+> nlp.add_pipe("span_finder", config=config)
+> ```
+
+| Setting | Description |
+| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
+| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
+| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+
+```python
+%%GITHUB_SPACY/spacy/pipeline/span_finder.py
+```
+
+## SpanFinder.\_\_init\_\_ {id="init",tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> span_finder = nlp.add_pipe("span_finder")
+>
+> # Construction via add_pipe with custom model
+> config = {"model": {"@architectures": "my_span_finder"}}
+> span_finder = nlp.add_pipe("span_finder", config=config)
+>
+> # Construction from class
+> from spacy.pipeline import SpanFinder
+> span_finder = SpanFinder(nlp.vocab, model)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#create_pipe).
+
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
+| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
+| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+
+## SpanFinder.\_\_call\_\_ {id="call",tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/spanfinder#call) and [`pipe`](/api/spanfinder#pipe) delegate
+to the [`predict`](/api/spanfinder#predict) and
+[`set_annotations`](/api/spanfinder#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> span_finder = nlp.add_pipe("span_finder")
+> # This usually happens under the hood
+> processed = span_finder(doc)
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------- |
+| `doc` | The document to process. ~~Doc~~ |
+| **RETURNS** | The processed document. ~~Doc~~ |
+
+## SpanFinder.pipe {id="pipe",tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/spanfinder#call) and
+[`pipe`](/api/spanfinder#pipe) delegate to the
+[`predict`](/api/spanfinder#predict) and
+[`set_annotations`](/api/spanfinder#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> for doc in span_finder.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------- |
+| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
+| **YIELDS** | The processed documents in order. ~~Doc~~ |
+
+## SpanFinder.initialize {id="initialize",tag="method"}
+
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. **At least one example
+should be supplied.** The data examples are used to **initialize the model** of
+the component and can either be the full training data or a representative
+sample. Initialization includes validating the network and
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation). This
+method is typically called by [`Language.initialize`](/api/language#initialize)
+and lets you customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.initialize(lambda: examples, nlp=nlp)
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+
+## SpanFinder.predict {id="predict",tag="method"}
+
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
+modifying them.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict([doc1, doc2])
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------- |
+| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
+| **RETURNS** | The model's prediction for each document. |
+
+## SpanFinder.set_annotations {id="set_annotations",tag="method"}
+
+Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict(docs)
+> span_finder.set_annotations(docs, scores)
+> ```
+
+| Name | Description |
+| -------- | ---------------------------------------------------- |
+| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
+| `scores` | The scores to set, produced by `SpanFinder.predict`. |
+
+## SpanFinder.update {id="update",tag="method"}
+
+Learn from a batch of [`Example`](/api/example) objects containing the
+predictions and gold-standard annotations, and update the component's model.
+Delegates to [`predict`](/api/spanfinder#predict) and
+[`get_loss`](/api/spanfinder#get_loss).
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> optimizer = nlp.initialize()
+> losses = span_finder.update(examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | The dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## SpanFinder.get_loss {id="get_loss",tag="method"}
+
+Find the loss and gradient of loss for the batch of documents and their
+predicted scores.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> scores = span_finder.predict([eg.predicted for eg in examples])
+> loss, d_loss = span_finder.get_loss(examples, scores)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------ |
+| `examples` | The batch of examples. ~~Iterable[Example]~~ |
+| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ |
+| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, Floats2d]~~ |
+
+## SpanFinder.create_optimizer {id="create_optimizer",tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> optimizer = span_finder.create_optimizer()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------- |
+| **RETURNS** | The optimizer. ~~Optimizer~~ |
+
+## SpanFinder.use_params {id="use_params",tag="method, contextmanager"}
+
+Modify the pipe's model to use the given parameter values.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> with span_finder.use_params(optimizer.averages):
+> span_finder.to_disk("/best_model")
+> ```
+
+| Name | Description |
+| -------- | -------------------------------------------------- |
+| `params` | The parameter values to use in the model. ~~dict~~ |
+
+## SpanFinder.to_disk {id="to_disk",tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.to_disk("/path/to/span_finder")
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+
+## SpanFinder.from_disk {id="from_disk",tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.from_disk("/path/to/span_finder")
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `SpanFinder` object. ~~SpanFinder~~ |
+
+## SpanFinder.to_bytes {id="to_bytes",tag="method"}
+
+> #### Example
+>
+> ```python
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder_bytes = span_finder.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The serialized form of the `SpanFinder` object. ~~bytes~~ |
+
+## SpanFinder.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_finder_bytes = span_finder.to_bytes()
+> span_finder = nlp.add_pipe("span_finder")
+> span_finder.from_bytes(span_finder_bytes)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `bytes_data` | The data to load from. ~~bytes~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The `SpanFinder` object. ~~SpanFinder~~ |
+
+## Serialization fields {id="serialization-fields"}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = span_finder.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 6de1acdf0..64ec342cd 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -469,7 +469,7 @@ factories.
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
-| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
+| `scorers`    | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
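+
+For instance, a scoring function matching this contract could be registered as
+follows (a minimal sketch; the registry name `"example_hits_scorer.v1"` and the
+toy metric are illustrative, not part of spaCy):
+
+```python
+from typing import Any, Dict, Iterable
+
+import spacy
+from spacy.training import Example
+
+
+@spacy.registry.scorers("example_hits_scorer.v1")
+def make_example_hits_scorer():
+    def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+        # Toy metric: fraction of documents with at least one predicted entity.
+        examples = list(examples)
+        hits = sum(1 for eg in examples if len(eg.predicted.ents) > 0)
+        return {"example_hits": hits / len(examples) if examples else 0.0}
+
+    return score
+```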
### spacy-transformers registry {id="registry-transformers"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index a5b7990d6..4b06178d5 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -259,6 +259,26 @@ source code and recompiling frequently.
$ python setup.py develop
```
+#### Visual Studio Code extension
+
+
+
+The [spaCy VSCode Extension](https://github.com/explosion/spacy-vscode) provides
+additional tooling and features for working with spaCy's config files. Version
+1.0.0 of the installable extension includes hover descriptions for registry
+functions, variables, and section names within the config.
+
+1. Install a supported version of Python on your system (`>=3.7`)
+2. Install the
+   [Python Extension for Visual Studio Code](https://code.visualstudio.com/docs/python/python-tutorial)
+3. Create a
+   [virtual Python environment](https://docs.python.org/3/library/venv.html)
+4. Install all Python requirements (`spaCy >= 3.4.0` & `pygls >= 1.0.0`)
+5. Install the
+   [spaCy extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension)
+6. Select your Python environment
+7. You are ready to work with `.cfg` files in spaCy!
+
### Building an executable {id="executable"}
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that
diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index c372744de..1ac931753 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -56,14 +56,19 @@ wrap. So if you come across this problem, especially when using custom labels,
you'll have to increase the `distance` setting in the `options` to allow longer
arcs.
+Moreover, you might need to modify the `offset_x` argument depending on the
+shape of your document. Otherwise, the left part of the visualization may
+overflow beyond the container's border.
+
-| Argument | Description |
-| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
-| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
-| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| Argument | Description |
+| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
+| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| `offset_x` | Spacing on the left side of the SVG in px. You might need to tweak this setting for long texts. Defaults to `50`. ~~int~~ |
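+
+As a short, hedged sketch (the pipeline and example text are placeholders),
+these options can be passed to `displacy.render` like so:
+
+```python
+import spacy
+from spacy import displacy
+
+# Assumes the small English pipeline has been downloaded beforehand.
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
+# A larger offset_x adds left-hand padding so the rendering doesn't overflow.
+options = {"compact": True, "offset_x": 100}
+html = displacy.render(doc, style="dep", options=options)
+```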
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 46c0d3adb..f88d2b7bf 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -264,6 +264,11 @@
"code": "mr",
"name": "Marathi"
},
+ {
+ "code": "ms",
+ "name": "Malay",
+ "has_examples": true
+ },
{
"code": "nb",
"name": "Norwegian Bokmål",
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index b5c555da6..12c3fce35 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -106,6 +106,7 @@
{ "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
{ "text": "Sentencizer", "url": "/api/sentencizer" },
{ "text": "SpanCategorizer", "url": "/api/spancategorizer" },
+ { "text": "SpanFinder", "url": "/api/spanfinder" },
{ "text": "SpanResolver", "url": "/api/span-resolver" },
{ "text": "SpanRuler", "url": "/api/spanruler" },
{ "text": "Tagger", "url": "/api/tagger" },
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 33185ca30..cd3bedbff 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,22 @@
{
"resources": [
+ {
+ "id": "spacy-vscode",
+ "title": "spaCy Visual Studio Code Extension",
+ "thumb": "https://raw.githubusercontent.com/explosion/spacy-vscode/main/icon.png",
+ "slogan": "Work with spaCy's config files in VS Code",
+ "description": "The spaCy VS Code Extension provides additional tooling and features for working with spaCy's config files. Version 1.0.0 includes hover descriptions for registry functions, variables, and section names within the config as an installable extension.",
+ "url": "https://marketplace.visualstudio.com/items?itemName=Explosion.spacy-extension",
+ "github": "explosion/spacy-vscode",
+ "code_language": "python",
+ "author": "Explosion",
+ "author_links": {
+ "twitter": "@explosion_ai",
+ "github": "explosion"
+ },
+ "category": ["extension"],
+ "tags": []
+ },
{
"id": "parsigs",
"title": "parsigs",
@@ -97,26 +114,30 @@
"id": "grecy",
"title": "greCy",
"slogan": "Ancient Greek pipelines for spaCy",
- "description": "greCy offers state-of-the-art pipelines for ancient Greek NLP. The repository makes language models available in various sizes, some of them containing floret word vectors and a BERT transformer layer.",
+      "description": "greCy offers state-of-the-art pipelines for ancient Greek NLP. It provides installable language models in various sizes, some of them containing either word vectors or the aristoBERTo transformer.",
"github": "jmyerston/greCy",
+ "pip": "grecy",
"code_example": [
- "import spacy",
- "#After installing the grc_ud_proiel_trf wheel package from the greCy repository",
+ "python -m grecy install grc_proiel_trf",
"",
- "nlp = spacy.load('grc_ud_proiel_trf')",
- "doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι.')",
+ "#After installing grc_proiel_trf or any other model",
+ "import spacy",
+ "",
+ "nlp = spacy.load('grc_proiel_trf')",
+ "doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι')",
"",
"for token in doc:",
- " print(token.text, token.norm_, token.lemma_, token.pos_, token.tag_)"
+ " print(f'{token.text}, lemma: {token.lemma_}, pos: {token.pos_}, dep: {token.dep_}')"
],
"code_language": "python",
+ "thumb": "https://jacobo-syntax.hf.space/media/03a5317fa660c142e41dd2870b4273ce4e668e6fcdee0a276891f563.png",
"author": "Jacobo Myerston",
"author_links": {
"twitter": "@jcbmyrstn",
"github": "jmyerston",
"website": "https://huggingface.co/spaces/Jacobo/syntax"
},
- "category": ["pipeline", "research"],
+      "category": ["pipeline", "research", "models"],
"tags": ["ancient Greek"]
},
{
@@ -384,7 +405,7 @@
},
{
"id": "spacypdfreader",
- "title": "spadypdfreader",
+ "title": "spacypdfreader",
"category": ["pipeline"],
"tags": ["PDF"],
"slogan": "Easy PDF to text to spaCy text extraction in Python.",
@@ -401,7 +422,7 @@
},
"code_example": [
"import spacy",
- "from spacypdfreader import pdf_reader",
+ "from spacypdfreader.spacypdfreader import pdf_reader",
"",
"nlp = spacy.load('en_core_web_sm')",
"doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
@@ -2718,10 +2739,9 @@
"description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).",
"github": "davidberenstein1957/classy-classification",
"pip": "classy-classification",
- "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/classy-classification/master/logo.png",
+ "thumb": "https://raw.githubusercontent.com/davidberenstein1957/classy-classification/master/logo.png",
"code_example": [
"import spacy",
- "import classy_classification",
"",
"data = {",
" \"furniture\": [\"This text is about chairs.\",",
@@ -2766,14 +2786,13 @@
"title": "Concise Concepts",
"slogan": "Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
"description": "When wanting to apply NER to concise concepts, it is really easy to come up with examples, but it takes some effort to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with easy!",
- "github": "pandora-intelligence/concise-concepts",
+ "github": "davidberenstein1957/concise-concepts",
"pip": "concise-concepts",
- "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/logo.png",
- "image": "https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png",
+ "thumb": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/logo.png",
+ "image": "https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/master/img/example.png",
"code_example": [
"import spacy",
"from spacy import displacy",
- "import concise_concepts",
"",
"data = {",
" \"fruit\": [\"apple\", \"pear\", \"orange\"],",
@@ -2813,13 +2832,12 @@
"title": "Crosslingual Coreference",
"slogan": "One multi-lingual coreference model to rule them all!",
"description": "Coreference is amazing but the data required for training a model is very scarce. In our case, the available training for non-English languages also data proved to be poorly annotated. Crosslingual Coreference therefore uses the assumption a trained model with English data and cross-lingual embeddings should work for other languages with similar sentence structure. Verified to work quite well for at least (EN, NL, DK, FR, DE).",
- "github": "pandora-intelligence/crosslingual-coreference",
+ "github": "davidberenstein1957/crosslingual-coreference",
"pip": "crosslingual-coreference",
- "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
- "image": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png",
+ "thumb": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/logo.png",
+ "image": "https://raw.githubusercontent.com/davidberenstein1957/crosslingual-coreference/master/img/example_total.png",
"code_example": [
"import spacy",
- "import crosslingual_coreference",
"",
"text = \"\"\"",
" Do not forget about Momofuku Ando!",
@@ -2865,33 +2883,35 @@
"title": "Adept Augmentations",
"slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
"description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
- "github": "davidberenstein1957/adept-augmentations",
+ "github": "argilla-io/adept-augmentations",
"pip": "adept-augmentations",
- "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
+ "thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png",
"code_example": [
- "import spacy",
- "from spacy.tokens import DocBin",
- "",
"from adept_augmentations import EntitySwapAugmenter",
+ "import spacy",
+ "from spacy.tokens import Doc, DocBin",
+ "nlp = spacy.blank(\"en\")",
"",
- "nlp = spacy.load(\"en_core_web_sm\")",
- "",
- "TRAIN_DATA = [",
- " \"Apple is looking at buying U.K. startup for $1 billion\",",
- " \"Microsoft acquires GitHub for $7.5 billion\"",
+ "# Create some example golden data",
+ "example_data = [",
+ " (\"Apple is looking at buying U.K. startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),",
+ " (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),",
"]",
- "docs = nlp.pipe(TRAIN_DATA)",
"",
"# Create a new DocBin",
- "doc_bin = DocBin(docs=docs)",
+ "nlp = spacy.blank(\"en\")",
+ "docs = []",
+ "for entry in example_data:",
+ " doc = Doc(nlp.vocab, words=entry[0].split())",
+ " doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]",
+ " docs.append(doc)",
+ "golden_dataset = DocBin(docs=docs)",
"",
"# Augment Data",
- "doc_bin = EntitySwapAugmenter(doc_bin).augment(4)",
- "for doc in doc_bin.get_docs(nlp.vocab):",
+ "augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)",
+ "for doc in augmented_dataset.get_docs(nlp.vocab):",
" print(doc.text)",
"",
- "# Output",
- "#",
"# GitHub is looking at buying U.K. startup for $ 7.5 billion",
"# Microsoft is looking at buying U.K. startup for $ 1 billion",
"# Microsoft is looking at buying U.K. startup for $ 7.5 billion",
@@ -2910,6 +2930,54 @@
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
"spacy_version": 3
},
+ {
+ "id": "spacysetfit",
+ "title": "spaCy-SetFit",
+      "slogan": "An easy and intuitive approach to using SetFit in combination with spaCy.",
+      "description": "spaCy-SetFit is a Python library that extends spaCy's text categorization capabilities by incorporating SetFit for few-shot classification. It allows you to train a text categorizer using an intuitive dictionary. \n\nThe library integrates with spaCy's pipeline architecture, enabling easy integration and configuration of the text categorizer component. You can provide a training dataset containing inlier and outlier examples, and spaCy-SetFit will use the paraphrase-MiniLM-L3-v2 model for training the text categorizer with SetFit. Once trained, you can use the categorizer to classify new text and obtain category probabilities.",
+ "github": "davidberenstein1957/spacy-setfit",
+ "pip": "spacy-setfit",
+ "thumb": "https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png",
+ "code_example": [
+ "import spacy",
+ "",
+ "# Create some example data",
+ "train_dataset = {",
+ " \"inlier\": [",
+ " \"Text about furniture\",",
+ " \"Couches, benches and televisions.\",",
+ " \"I really need to get a new sofa.\"",
+ " ],",
+ " \"outlier\": [",
+ " \"Text about kitchen equipment\",",
+ " \"This text is about politics\",",
+ " \"Comments about AI and stuff.\"",
+ " ]",
+ "}",
+ "",
+ "# Load the spaCy language model:",
+ "nlp = spacy.load(\"en_core_web_sm\")",
+ "",
+ "# Add the \"text_categorizer\" pipeline component to the spaCy model, and configure it with SetFit parameters:",
+ "nlp.add_pipe(\"text_categorizer\", config={",
+ " \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",",
+ " \"setfit_trainer_args\": {",
+ " \"train_dataset\": train_dataset",
+ " }",
+ "})",
+ "doc = nlp(\"I really need to get a new sofa.\")",
+ "doc.cats",
+ "# {'inlier': 0.902350975129, 'outlier': 0.097649024871}"
+ ],
+ "author": "David Berenstein",
+ "author_links": {
+ "github": "davidberenstein1957",
+ "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
+ },
+ "category": ["pipeline"],
+ "tags": ["few-shot", "SetFit", "training"],
+ "spacy_version": 3
+ },
{
"id": "blackstone",
"title": "Blackstone",
@@ -4262,6 +4330,68 @@
},
"category": ["pipeline", "research"],
"tags": ["Thai"]
+ },
+ {
+ "id": "vetiver",
+ "title": "Vetiver",
+ "slogan": "Version, share, deploy, and monitor models.",
+ "description": "The goal of vetiver is to provide fluent tooling to version, deploy, and monitor a trained model. Functions handle creating model objects, versioning models, predicting from a remote API endpoint, deploying Dockerfiles, and more.",
+ "github": "rstudio/vetiver-python",
+ "pip": "vetiver",
+ "code_example": [
+ "import spacy",
+ "from vetiver import VetiverModel, VetiverAPI",
+ "",
+ "# If you use this model, you'll need to download it first:",
+ "# python -m spacy download en_core_web_md",
+ "nlp = spacy.load('en_core_web_md')",
+ "# Create deployable model object with your nlp Language object",
+ "v = VetiverModel(nlp, model_name = 'my_model')",
+ "# Try out your API endpoint locally",
+ "VetiverAPI(v).run()"
+ ],
+ "code_language": "python",
+ "url": "https://vetiver.rstudio.com/",
+ "thumb": "https://raw.githubusercontent.com/rstudio/vetiver-python/main/docs/figures/square-logo.svg",
+ "author": "Posit, PBC",
+ "author_links": {
+ "twitter": "posit_pbc",
+ "github": "rstudio",
+ "website": "https://posit.co/"
+ },
+ "category": ["apis", "standalone"],
+ "tags": ["apis", "deployment"]
+ },
+ {
+ "id": "span_marker",
+ "title": "SpanMarker",
+ "slogan": "Effortless state-of-the-art NER in spaCy",
+ "description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.",
+ "github": "tomaarsen/SpanMarkerNER",
+ "pip": "span_marker",
+ "code_example": [
+ "import spacy",
+ "",
+ "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
+ "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
+ "",
+ "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
+ "Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\",
+ "death in 30 BCE.\"\"\"",
+ "doc = nlp(text)",
+ "print([(entity, entity.label_) for entity in doc.ents])",
+ "# [(Cleopatra VII, \"PERSON\"), (Cleopatra the Great, \"PERSON\"), (the Ptolemaic Kingdom of Egypt, \"GPE\"),",
+ "# (69 BCE, \"DATE\"), (Egypt, \"GPE\"), (51 BCE, \"DATE\"), (30 BCE, \"DATE\")]"
+ ],
+ "code_language": "python",
+ "url": "https://tomaarsen.github.io/SpanMarkerNER",
+ "author": "Tom Aarsen",
+ "author_links": {
+ "github": "tomaarsen",
+ "website": "https://www.linkedin.com/in/tomaarsen"
+ },
+ "category": ["pipeline", "standalone", "scientific"],
+ "tags": ["ner"]
}
],
diff --git a/website/public/images/spacy-extension-demo.gif b/website/public/images/spacy-extension-demo.gif
new file mode 100644
index 000000000..a857bbe2d
Binary files /dev/null and b/website/public/images/spacy-extension-demo.gif differ
diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js
index 160e5a778..2b5bfb5ba 100644
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@@ -215,15 +215,17 @@ const Quickstart = ({
}
)}
-
- {Children.toArray(children).flat().filter(isRelevant)}
+
+
+ {Children.toArray(children).flat().filter(isRelevant)}
+