From 2f9b28c21847318f5d14e361505216b1e911ba17 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 15 Aug 2019 18:08:28 +0200 Subject: [PATCH] Provide more info in cycle error message E069 (#4123) Provide the tokens in the cycle and the first 50 tokens from the document in the error message so it's easier to track down the location of the cycle in the data. Addresses feature request in #3698. --- spacy/errors.py | 3 ++- spacy/gold.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 25a170bdb..0a4875d96 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -243,7 +243,8 @@ class Errors(object): "Tag sequence:\n{tags}") E068 = ("Invalid BILUO tag: '{tag}'.") E069 = ("Invalid gold-standard parse tree. Found cycle between word " - "IDs: {cycle}") + "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting " + "with tokens: {doc_tokens}.") E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " "does not align with number of annotations ({n_annots}).") E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " diff --git a/spacy/gold.pyx b/spacy/gold.pyx index ce1648ccd..dab65f48e 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -590,7 +590,7 @@ cdef class GoldParse: cycle = nonproj.contains_cycle(self.heads) if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle)) + raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) def __len__(self): """Get the number of gold-standard tokens.