mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Warn and document spangroup.doc weakref (#8980)
* test for error after Doc has been garbage collected * warn about using a SpanGroup when the Doc has been garbage collected * add warning to the docs * rephrase slightly * raise error instead of warning * update * move warning to doc property
This commit is contained in:
parent
37fe847af4
commit
de025beb5f
|
@ -521,6 +521,10 @@ class Errors:
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
|
||||||
|
"been garbage collected. To keep using the spans, make sure that "
|
||||||
|
"the corresponding Doc object is still available in the scope of "
|
||||||
|
"your function.")
|
||||||
E867 = ("The 'textcat' component requires at least two labels because it "
|
E867 = ("The 'textcat' component requires at least two labels because it "
|
||||||
"uses mutually exclusive classes where exactly one label is True "
|
"uses mutually exclusive classes where exactly one label is True "
|
||||||
"for each doc. For binary classification tasks, you can use two "
|
"for each doc. For binary classification tasks, you can use two "
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from numpy.testing import assert_equal, assert_array_equal, assert_almost_equal
|
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||||
from thinc.api import get_current_ops
|
from thinc.api import get_current_ops
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
|
from spacy.tokens.doc import SpanGroups
|
||||||
|
from spacy.tokens import SpanGroup
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import fix_random_seed, registry
|
from spacy.util import fix_random_seed, registry
|
||||||
|
|
||||||
|
@ -72,6 +74,23 @@ def test_explicit_labels():
|
||||||
assert spancat.labels == ("PERSON", "LOC")
|
assert spancat.labels == ("PERSON", "LOC")
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_gc():
    """Check that span groups stop working once their Doc has been collected.

    Only the `doc.spans` containers produced by the pipeline are kept; no
    reference to the Doc objects themselves is retained. Indexing into any of
    the surviving span groups is then expected to raise a RuntimeError.
    """
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = ["Just a sentence.", "I like London and Berlin", "I like Berlin", "I eat ham."]
    # Keep the SpanGroups but deliberately drop every Doc produced by the pipe.
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for spangroups in all_spans:
        assert isinstance(spangroups, SpanGroups)
        for spangroup in spangroups.values():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            # The length is still available, but accessing a span should fail
            # because the owning Doc is gone.
            with pytest.raises(RuntimeError):
                spangroup[0]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
import weakref
|
import weakref
|
||||||
import struct
|
import struct
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
from spacy.errors import Errors
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
from libc.stdint cimport uint64_t, uint32_t, int32_t
|
from libc.stdint cimport uint64_t, uint32_t, int32_t
|
||||||
|
|
||||||
|
@ -58,7 +60,11 @@ cdef class SpanGroup:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spangroup#doc
|
DOCS: https://spacy.io/api/spangroup#doc
|
||||||
"""
|
"""
|
||||||
return self._doc_ref()
|
doc = self._doc_ref()
|
||||||
|
if doc is None:
|
||||||
|
# referent has been garbage collected
|
||||||
|
raise RuntimeError(Errors.E866)
|
||||||
|
return doc
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def has_overlap(self):
|
def has_overlap(self):
|
||||||
|
|
|
@ -46,6 +46,16 @@ Create a `SpanGroup`.
|
||||||
|
|
||||||
The [`Doc`](/api/doc) object the span group is referring to.
|
The [`Doc`](/api/doc) object the span group is referring to.
|
||||||
|
|
||||||
|
<Infobox title="SpanGroup and Doc lifecycle" variant="warning">
|
||||||
|
|
||||||
|
When a `Doc` object is garbage collected, any related `SpanGroup` object won't
|
||||||
|
be functional anymore, as these objects use a `weakref` to refer to the
|
||||||
|
document. Accessing `SpanGroup.doc` then raises a `RuntimeError`, because the weak reference no longer resolves to a `Doc`.
|
||||||
|
To avoid this, make sure that the original `Doc` objects are still available in
|
||||||
|
the scope of your function.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
|
Loading…
Reference in New Issue
Block a user