commit 49747697a2

Merge branch 'v4' into feature/docwise-generator-batching

# Conflicts:
#   spacy/kb/kb.pyx
#   spacy/ml/models/entity_linker.py
#   spacy/pipeline/entity_linker.py
#   website/docs/api/inmemorylookupkb.mdx
#   website/docs/api/kb.mdx
@@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")


 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
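For orientation, a small sketch of how the two new error templates read once formatted; the keyword values below are made up purely for illustration and are not taken from this diff.

    # hypothetical values, only to show the shape of the new messages
    e4007 = ("Span {var} {value} must be {op} Span {existing_var} "
             "{existing_value}.").format(
        var="start", value=4, op="<=", existing_var="end", existing_value=3
    )
    # -> "Span start 4 must be <= Span end 3."
    e4008 = "Span {pos}_char {value} does not correspond to a token {pos}.".format(
        pos="start", value=7
    )
    # -> "Span start_char 7 does not correspond to a token start."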
@@ -2,5 +2,4 @@ from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
-

 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
@@ -43,18 +43,6 @@ cdef class KnowledgeBase:
             Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
         )

-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
-        mention (Span): Mention for which to get candidates.
-        RETURNS (Iterable[Candidate]): Identified candidates.
-        """
-        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
-        )
-
     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
         """
         Return vectors for entities.
@@ -1202,7 +1202,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1213,7 +1213,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
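Read together with the docs change further down, a minimal usage sketch of the three `sgd` modes; the pipeline `nlp`, the `train_examples` list and the call pattern below are assumptions for illustration, not part of this commit.

    # assumes an initialized pipeline `nlp` and a list of Example objects
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    # sgd left as None (default): update() creates/uses its own optimizer
    losses = nlp.update(train_examples, losses={})

    # an explicit optimizer is used to apply the weight updates
    losses = nlp.update(train_examples, sgd=optimizer, losses={})

    # sgd=False: gradients are computed but finish_update() is skipped,
    # so no optimizer step happens inside update()
    losses = nlp.update(train_examples, sgd=False, losses={})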
@@ -1272,6 +1274,7 @@ class Language:
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)

@@ -21,7 +21,6 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
 from thinc.api import set_dropout_rate

 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
@@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
@@ -157,6 +157,24 @@ def test_language_update_updates():
     )


+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
@@ -772,36 +772,61 @@ cdef class Span:
             return self.span_c().start

         def __set__(self, int start):
-            if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx

     property end:
         def __get__(self):
             return self.span_c().end

         def __set__(self, int end):
-            if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0

     property start_char:
         def __get__(self):
             return self.span_c().start_char

         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start

     property end_char:
         def __get__(self):
             return self.span_c().end_char

         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end

     property label:
         def __get__(self):
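Mirroring the new test above, a short sketch of the now-writable, kept-in-sync span boundaries; `spacy.blank("en")` is an assumption standing in for the test's `en_tokenizer` fixture.

    import spacy

    nlp = spacy.blank("en")  # assumption: plain English tokenizer
    doc = nlp("a bc def e fghij kl")
    span = doc[2:4]
    span.start_char = 2  # must land on a token start, otherwise E4008 (ValueError)
    span.end = 5         # token index; end_char is recomputed from the tokens
    assert span == doc[span.start : span.end]
    assert span == doc.char_span(span.start_char, span.end_char)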
@@ -210,7 +210,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 > nlp.update([example], sgd=optimizer)
 > ```

 | Name            | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_  | |
-| `drop`          | The dropout rate. ~~float~~ |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~ |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |

 ## Language.distill {id="distill",tag="method,experimental",version="4"}

@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.

+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}

 Create the `StringStore`.
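To illustrate the warning, a minimal sketch assuming a blank English pipeline; it is not part of the docs change itself.

    import spacy

    nlp = spacy.blank("en")            # assumption: blank "en" pipeline
    n_before = len(nlp.vocab.strings)  # number of strings currently interned
    nlp("zxqvbn floobargle")           # process text containing novel tokens
    n_after = len(nlp.vocab.strings)
    assert n_after > n_before          # the store grew with the new token strings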
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.

+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}

 Create the vocabulary.