Warn and document spangroup.doc weakref (#8980)

* test for error after Doc has been garbage collected

* warn about using a SpanGroup when the Doc has been garbage collected

* add warning to the docs

* rephrase slightly

* raise error instead of warning

* update

* move warning to doc property
This commit is contained in:
Sofie Van Landeghem 2021-08-20 11:06:19 +02:00 committed by GitHub
parent 37fe847af4
commit de025beb5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 41 additions and 2 deletions

View File

@ -521,6 +521,10 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
"been garbage collected. To keep using the spans, make sure that "
"the corresponding Doc object is still available in the scope of "
"your function.")
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "

View File

@ -1,8 +1,10 @@
import pytest
import numpy
from numpy.testing import assert_equal, assert_array_equal, assert_almost_equal
from numpy.testing import assert_array_equal, assert_almost_equal
from thinc.api import get_current_ops
from spacy.language import Language
from spacy.tokens.doc import SpanGroups
from spacy.tokens import SpanGroup
from spacy.training import Example
from spacy.util import fix_random_seed, registry
@ -72,6 +74,23 @@ def test_explicit_labels():
assert spancat.labels == ("PERSON", "LOC")
def test_doc_gc():
    """Check that a SpanGroup raises once its Doc has been garbage collected.

    Only the ``doc.spans`` containers are kept from the pipeline output, so
    this test holds no reference to the Doc objects themselves. The span
    groups can still be type-checked and measured with ``len``, but indexing
    into one is expected to raise a RuntimeError.
    """
    nlp = Language()
    component = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    component.add_label("PERSON")
    nlp.initialize()
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    # Deliberately collect only the span containers: no Doc is stored, which
    # is the situation this test wants to exercise.
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for spangroups in all_spans:
        assert isinstance(spangroups, SpanGroups)
        for _, group in spangroups.items():
            assert isinstance(group, SpanGroup)
            assert len(group) > 0
            with pytest.raises(RuntimeError):
                # Indexing is the operation expected to fail here.
                group[0]
@pytest.mark.parametrize(
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
)

View File

@ -1,6 +1,8 @@
import weakref
import struct
import srsly
from spacy.errors import Errors
from .span cimport Span
from libc.stdint cimport uint64_t, uint32_t, int32_t
@ -58,7 +60,11 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup#doc
"""
return self._doc_ref()
doc = self._doc_ref()
if doc is None:
# referent has been garbage collected
raise RuntimeError(Errors.E866)
return doc
@property
def has_overlap(self):

View File

@ -46,6 +46,16 @@ Create a `SpanGroup`.
The [`Doc`](/api/doc) object the span group is referring to.
<Infobox title="SpanGroup and Doc lifecycle" variant="warning">
When a `Doc` object is garbage collected, any related `SpanGroup` object won't
be functional anymore, because a `SpanGroup` only holds a `weakref` to its
document. Accessing its `doc` afterwards (for example when indexing into the
group to retrieve a span) raises an error, as the weak reference then resolves
to `None`. To avoid this, make sure that the original `Doc` objects are still
available in the scope of your function.
</Infobox>
> #### Example
>
> ```python