Check whether doc is instantiated in Example.get_gold_parses() (#5167)

* Check whether doc is instantiated

  When creating docs to pair with gold parses, modify test to check whether a doc is unset rather than whether it contains tokens.

* Restore test of evaluate on an empty doc

* Set a minimal gold.orig for the scorer

  Without a minimal gold.orig the scorer can't evaluate empty docs.

This is the v3 equivalent of #4925.
2025-07-31 10:29:46 +03:00 · 2020-03-29 13:57:00 +02:00 · 2020-03-29 13:57:00 +02:00 · ce0e538068
commit ce0e538068
parent d6d95674c1
3 changed files with 13 additions and 4 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -834,7 +834,7 @@ cdef class Example:
        if merge:
            t = self.token_annotation
            doc = self.doc
-            if not self.doc:
+            if self.doc is None:
                if not vocab:
                    raise ValueError(Errors.E998)
                doc = Doc(vocab, words=t.words)
@@ -993,7 +993,10 @@ cdef class GoldParse:
        self.links = {} if links is None else dict(links)

        # avoid allocating memory if the doc does not contain any tokens
-        if self.length > 0:
+        if self.length == 0:
+            # set a minimal orig so that the scorer can score an empty doc
+            self.orig = TokenAnnotation(ids=[])
+        else:
            if not words:
                words = [token.text for token in doc]
            if not tags:
--- a/spacy/tests/regression/test_issue4924.py
+++ b/spacy/tests/regression/test_issue4924.py
@@ -5,5 +5,4 @@ from spacy.language import Language
 def test_issue4924():
    nlp = Language()
    docs_golds = [("", {})]
-    with pytest.raises(ValueError):
-        nlp.evaluate(docs_golds)
+    nlp.evaluate(docs_golds)
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -480,3 +480,10 @@ def test_tuples_to_example(merged_dict):
    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
    assert ex_dict["doc_annotation"]["cats"] == cats
+
+
+def test_empty_example_goldparse():
+    nlp = English()
+    doc = nlp("")
+    example = Example(doc=doc)
+    assert len(example.get_gold_parses()) == 1