Check whether doc is instantiated in Example.get_gold_parses() (#5167)

* Check whether doc is instantiated

When creating docs to pair with gold parses, change the condition to check
whether the doc is unset (`None`) rather than whether it contains tokens.

* Restore test of evaluate on an empty doc

* Set a minimal gold.orig for the scorer

Without a minimal gold.orig the scorer can't evaluate empty docs. This
is the v3 equivalent of #4925.
This commit is contained in:
adrianeboyd 2020-03-29 13:57:00 +02:00 committed by GitHub
parent d6d95674c1
commit ce0e538068
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 4 deletions

View File

@ -834,7 +834,7 @@ cdef class Example:
if merge:
t = self.token_annotation
doc = self.doc
if not self.doc:
if self.doc is None:
if not vocab:
raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words)
@ -993,7 +993,10 @@ cdef class GoldParse:
self.links = {} if links is None else dict(links)
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if self.length == 0:
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
if not words:
words = [token.text for token in doc]
if not tags:

View File

@ -5,5 +5,4 @@ from spacy.language import Language
# Test named for issue 4924: runs Language.evaluate on a single
# (empty text, empty annotations) pair using a blank Language pipeline.
def test_issue4924():
nlp = Language()
docs_golds = [("", {})]
# NOTE(review): the commit message says this change restores evaluating an
# empty doc, and the hunk header shrinks from 5 to 4 lines, so the
# pytest.raises wrapper below looks like the removed side of the diff rather
# than the resulting code -- confirm against the file itself.
with pytest.raises(ValueError):
nlp.evaluate(docs_golds)

View File

@ -480,3 +480,10 @@ def test_tuples_to_example(merged_dict):
assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
assert ex_dict["doc_annotation"]["cats"] == cats
# Checks that an Example built around an existing Doc made from an empty
# string still produces exactly one gold parse from get_gold_parses().
def test_empty_example_goldparse():
nlp = English()
# The text is the empty string, so the doc is presumably token-less; the
# Example is given this doc object directly rather than raw words.
doc = nlp("")
example = Example(doc=doc)
assert len(example.get_gold_parses()) == 1