Warn and document spangroup.doc weakref (#8980)

* test for error after Doc has been garbage collected * warn about using a SpanGroup when the Doc has been garbage collected * add warning to the docs * rephrase slightly * raise error instead of warning * update * move warning to doc property
2025-09-17 01:22:37 +03:00 · 2021-08-20 11:06:19 +02:00 · 2021-08-20 11:06:19 +02:00 · de025beb5f
commit de025beb5f
parent 37fe847af4
4 changed files with 41 additions and 2 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -521,6 +521,10 @@ class Errors:
    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
    # New errors added in v3.x
    E866 = ("A SpanGroup is not functional after the corresponding Doc has "
            "been garbage collected. To keep using the spans, make sure that "
            "the corresponding Doc object is still available in the scope of "
            "your function.")
    E867 = ("The 'textcat' component requires at least two labels because it "
            "uses mutually exclusive classes where exactly one label is True "
            "for each doc. For binary classification tasks, you can use two "
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@ -1,8 +1,10 @@
 import pytest
 import numpy
-from numpy.testing import assert_equal, assert_array_equal, assert_almost_equal
+from numpy.testing import assert_array_equal, assert_almost_equal
 from thinc.api import get_current_ops
 from spacy.language import Language
 from spacy.tokens.doc import SpanGroups
 from spacy.tokens import SpanGroup
 from spacy.training import Example
 from spacy.util import fix_random_seed, registry
@ -72,6 +74,23 @@ def test_explicit_labels():
    assert spancat.labels == ("PERSON", "LOC")
 def test_doc_gc():
    """Check that span groups raise once their Doc has been garbage collected.

    Only ``doc.spans`` is retained from each piped Doc, so the Docs themselves
    are expected to have been collected by the time the groups are inspected.
    """
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = ["Just a sentence.", "I like London and Berlin", "I like Berlin", "I eat ham."]
    # Keep only the span containers; no reference to the Doc objects is stored.
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for spangroups in all_spans:
        assert isinstance(spangroups, SpanGroups)
        for spangroup in spangroups.values():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            # Indexing is expected to fail because the owning Doc is gone.
            with pytest.raises(RuntimeError):
                spangroup[0]
@pytest.mark.parametrize(
    "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
 )
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@ -1,6 +1,8 @@
 import weakref
 import struct
 import srsly
 from spacy.errors import Errors
 from .span cimport Span
 from libc.stdint cimport uint64_t, uint32_t, int32_t
@ -58,7 +60,11 @@ cdef class SpanGroup:
        DOCS: https://spacy.io/api/spangroup#doc
        """
-        return self._doc_ref()
+        doc = self._doc_ref()
        if doc is None:
            # referent has been garbage collected
            raise RuntimeError(Errors.E866)
        return doc
    @property
    def has_overlap(self):
--- a/website/docs/api/spangroup.md
+++ b/website/docs/api/spangroup.md
@ -46,6 +46,16 @@ Create a `SpanGroup`.
 The [`Doc`](/api/doc) object the span group is referring to.
 <Infobox title="SpanGroup and Doc lifecycle" variant="warning">
 When a `Doc` object is garbage collected, any related `SpanGroup` object won't
 be functional anymore, as these objects use a `weakref` to refer to the
 document. Accessing the group's `doc` afterwards raises an error, because the
 weak reference no longer resolves to a `Doc`. To avoid this, make sure that the
 original `Doc` objects are still available in the scope of your function.
 </Infobox>
 > #### Example
 >
 > ```python