From 135e3de5310497f96cc8f8e877557391f7ff75ab Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Fri, 18 Oct 2019 10:59:16 +0200
Subject: [PATCH] Check for docs with 2+ sentences in debug-data (#4467)

---
 spacy/cli/debug_data.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b649e6666..fe6cccf81 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -360,6 +360,16 @@ def debug_data(
             )
         )
 
+        # warn when the average number of sentences per document is close to one
+        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
+        if sents_per_doc < 1.1:
+            msg.warn(
+                "The training data contains {:.2f} sentences per "
+                "document. When there are very few documents containing more "
+                "than one sentence, the parser will not learn how to segment "
+                "longer texts into sentences.".format(sents_per_doc)
+            )
+
         # profile labels
         labels_train = [label for label in gold_train_data["deps"]]
         labels_train_unpreprocessed = [