From 135e3de5310497f96cc8f8e877557391f7ff75ab Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 18 Oct 2019 10:59:16 +0200 Subject: [PATCH] Check for docs with 2+ sentences in debug-data (#4467) --- spacy/cli/debug_data.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index b649e6666..fe6cccf81 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -360,6 +360,16 @@ def debug_data( ) ) + # check for documents with multiple sentences + sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"]) + if sents_per_doc < 1.1: + msg.warn( + "The training data contains {:.2f} sentences per " + "document. When there are very few documents containing more " + "than one sentence, the parser will not learn how to segment " + "longer texts into sentences.".format(sents_per_doc) + ) + # profile labels labels_train = [label for label in gold_train_data["deps"]] labels_train_unpreprocessed = [