mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Warn and document spangroup.doc weakref (#8980)
* test for error after Doc has been garbage collected
* warn about using a SpanGroup when the Doc has been garbage collected
* add warning to the docs
* rephrase slightly
* raise error instead of warning
* update
* move warning to doc property
This commit is contained in:
parent
37fe847af4
commit
de025beb5f
|
@ -521,6 +521,10 @@ class Errors:
|
|||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
# New errors added in v3.x
|
||||
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||
"been garbage collected. To keep using the spans, make sure that "
|
||||
"the corresponding Doc object is still available in the scope of "
|
||||
"your function.")
|
||||
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||
"uses mutually exclusive classes where exactly one label is True "
|
||||
"for each doc. For binary classification tasks, you can use two "
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import pytest
|
||||
import numpy
|
||||
from numpy.testing import assert_equal, assert_array_equal, assert_almost_equal
|
||||
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||
from thinc.api import get_current_ops
|
||||
from spacy.language import Language
|
||||
from spacy.tokens.doc import SpanGroups
|
||||
from spacy.tokens import SpanGroup
|
||||
from spacy.training import Example
|
||||
from spacy.util import fix_random_seed, registry
|
||||
|
||||
|
@ -72,6 +74,23 @@ def test_explicit_labels():
|
|||
assert spancat.labels == ("PERSON", "LOC")
|
||||
|
||||
|
||||
def test_doc_gc():
    """Check that a SpanGroup raises once its Doc has been garbage collected.

    Only the ``doc.spans`` containers are collected from the pipeline output;
    no reference to the Doc objects themselves is kept. Each resulting
    SpanGroup must still report its spans via ``len`` but is expected to raise
    ``RuntimeError`` when indexed.
    """
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()

    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    # Keep only the span containers so the Docs go out of scope right away.
    all_spans = [doc.spans for doc in nlp.pipe(texts)]

    for spangroups in all_spans:
        assert isinstance(spangroups, SpanGroups)
        for spangroup in spangroups.values():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            with pytest.raises(RuntimeError):
                # Indexing is expected to fail now that the Doc is gone.
                spangroup[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||
)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import weakref
|
||||
import struct
|
||||
import srsly
|
||||
|
||||
from spacy.errors import Errors
|
||||
from .span cimport Span
|
||||
from libc.stdint cimport uint64_t, uint32_t, int32_t
|
||||
|
||||
|
@ -58,7 +60,11 @@ cdef class SpanGroup:
|
|||
|
||||
DOCS: https://spacy.io/api/spangroup#doc
|
||||
"""
|
||||
return self._doc_ref()
|
||||
doc = self._doc_ref()
|
||||
if doc is None:
|
||||
# referent has been garbage collected
|
||||
raise RuntimeError(Errors.E866)
|
||||
return doc
|
||||
|
||||
@property
|
||||
def has_overlap(self):
|
||||
|
|
|
@ -46,6 +46,16 @@ Create a `SpanGroup`.
|
|||
|
||||
The [`Doc`](/api/doc) object the span group is referring to.
|
||||
|
||||
<Infobox title="SpanGroup and Doc lifecycle" variant="warning">
|
||||
|
||||
When a `Doc` object is garbage collected, any related `SpanGroup` object won't
|
||||
be functional anymore, as these objects use a `weakref` to refer to the
|
||||
document. An error will be raised because the weak reference to the `Doc` then resolves to `None`.
|
||||
To avoid this, make sure that the original `Doc` objects are still available in
|
||||
the scope of your function.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
|
Loading…
Reference in New Issue
Block a user