mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Warn and document spangroup.doc weakref (#8980)
* test for error after Doc has been garbage collected
* warn about using a SpanGroup when the Doc has been garbage collected
* add warning to the docs
* rephrase slightly
* raise error instead of warning
* update
* move warning to doc property
This commit is contained in:
		
							parent
							
								
									37fe847af4
								
							
						
					
					
						commit
						de025beb5f
					
				| 
						 | 
					@ -521,6 +521,10 @@ class Errors:
 | 
				
			||||||
    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 | 
					    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # New errors added in v3.x
 | 
					    # New errors added in v3.x
 | 
				
			||||||
 | 
					    E866 = ("A SpanGroup is not functional after the corresponding Doc has "
 | 
				
			||||||
 | 
					            "been garbage collected. To keep using the spans, make sure that "
 | 
				
			||||||
 | 
					            "the corresponding Doc object is still available in the scope of "
 | 
				
			||||||
 | 
					            "your function.")
 | 
				
			||||||
    E867 = ("The 'textcat' component requires at least two labels because it "
 | 
					    E867 = ("The 'textcat' component requires at least two labels because it "
 | 
				
			||||||
            "uses mutually exclusive classes where exactly one label is True "
 | 
					            "uses mutually exclusive classes where exactly one label is True "
 | 
				
			||||||
            "for each doc. For binary classification tasks, you can use two "
 | 
					            "for each doc. For binary classification tasks, you can use two "
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,10 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
from numpy.testing import assert_equal, assert_array_equal, assert_almost_equal
 | 
					from numpy.testing import assert_array_equal, assert_almost_equal
 | 
				
			||||||
from thinc.api import get_current_ops
 | 
					from thinc.api import get_current_ops
 | 
				
			||||||
from spacy.language import Language
 | 
					from spacy.language import Language
 | 
				
			||||||
 | 
					from spacy.tokens.doc import SpanGroups
 | 
				
			||||||
 | 
					from spacy.tokens import SpanGroup
 | 
				
			||||||
from spacy.training import Example
 | 
					from spacy.training import Example
 | 
				
			||||||
from spacy.util import fix_random_seed, registry
 | 
					from spacy.util import fix_random_seed, registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -72,6 +74,23 @@ def test_explicit_labels():
 | 
				
			||||||
    assert spancat.labels == ("PERSON", "LOC")
 | 
					    assert spancat.labels == ("PERSON", "LOC")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_doc_gc():
 | 
				
			||||||
 | 
					    # If the Doc object is garbage collected, the spans won't be functional afterwards
 | 
				
			||||||
 | 
					    nlp = Language()
 | 
				
			||||||
 | 
					    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
 | 
				
			||||||
 | 
					    spancat.add_label("PERSON")
 | 
				
			||||||
 | 
					    nlp.initialize()
 | 
				
			||||||
 | 
					    texts = ["Just a sentence.", "I like London and Berlin", "I like Berlin", "I eat ham."]
 | 
				
			||||||
 | 
					    all_spans = [doc.spans for doc in nlp.pipe(texts)]
 | 
				
			||||||
 | 
					    for text, spangroups in zip(texts, all_spans):
 | 
				
			||||||
 | 
					        assert isinstance(spangroups, SpanGroups)
 | 
				
			||||||
 | 
					        for key, spangroup in spangroups.items():
 | 
				
			||||||
 | 
					            assert isinstance(spangroup, SpanGroup)
 | 
				
			||||||
 | 
					            assert len(spangroup) > 0
 | 
				
			||||||
 | 
					            with pytest.raises(RuntimeError):
 | 
				
			||||||
 | 
					                span = spangroup[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
    "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
 | 
					    "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,8 @@
 | 
				
			||||||
import weakref
 | 
					import weakref
 | 
				
			||||||
import struct
 | 
					import struct
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.errors import Errors
 | 
				
			||||||
from .span cimport Span
 | 
					from .span cimport Span
 | 
				
			||||||
from libc.stdint cimport uint64_t, uint32_t, int32_t
 | 
					from libc.stdint cimport uint64_t, uint32_t, int32_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -58,7 +60,11 @@ cdef class SpanGroup:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/spangroup#doc
 | 
					        DOCS: https://spacy.io/api/spangroup#doc
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        return self._doc_ref()
 | 
					        doc = self._doc_ref()
 | 
				
			||||||
 | 
					        if doc is None:
 | 
				
			||||||
 | 
					            # referent has been garbage collected
 | 
				
			||||||
 | 
					            raise RuntimeError(Errors.E866)
 | 
				
			||||||
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def has_overlap(self):
 | 
					    def has_overlap(self):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -46,6 +46,16 @@ Create a `SpanGroup`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The [`Doc`](/api/doc) object the span group is referring to.
 | 
					The [`Doc`](/api/doc) object the span group is referring to.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<Infobox title="SpanGroup and Doc lifecycle" variant="warning">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					When a `Doc` object is garbage collected, any related `SpanGroup` object won't
 | 
				
			||||||
 | 
					be functional anymore, as these objects use a `weakref` to refer to the
 | 
				
			||||||
 | 
					document. Once the `Doc` is gone, that reference resolves to `None` and accessing the group's `doc` raises a `RuntimeError`.
 | 
				
			||||||
 | 
					To avoid this, make sure that the original `Doc` objects are still available in
 | 
				
			||||||
 | 
					the scope of your function.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> ```python
 | 
					> ```python
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user