From a58cb023d7312d218f45c566f02bf59e708ec4fb Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 16 Aug 2019 10:52:46 +0200
Subject: [PATCH] WIP: Extending debug-data (#4114)

* Extending debug-data with dependency checks, etc.

* Modify debug-data to load with GoldCorpus to iterate over .json/.jsonl files within directories

* Add GoldCorpus iterator train_docs_without_preprocessing to load original train docs without shuffling and projectivizing

* Report number of misaligned tokens

* Add more dependency checks and messages

* Update spacy/cli/debug_data.py

Co-Authored-By: Ines Montani

* Fixed conflict

* Move counts to _compile_gold()

* Move all dependency nonproj/sent/head/cycle counting to _compile_gold()

* Unclobber previous merges

* Update variable names

* Update more variable names, fix misspelling

* Don't clobber loading error messages

* Only warn about misaligned tokens if present
---
 spacy/cli/debug_data.py | 223 ++++++++++++++++++++++++++++++++++------
 spacy/gold.pyx          |   4 +
 2 files changed, 194 insertions(+), 33 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index afedb933e..290131c76 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -9,11 +9,14 @@ import srsly
 from wasabi import Printer, MESSAGES
 
 from ..gold import GoldCorpus, read_json_object
+from ..syntax import nonproj
 from ..util import load_model, get_lang_class
 
-# Minimum number of expected occurences of label in data to train new label
+# Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
+# Minimum number of expected occurrences of dependency labels
+DEP_LABEL_THRESHOLD = 20
 # Minimum number of expected examples to train a blank model
 BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000
 
@@ -68,12 +71,10 @@ def debug_data(
         nlp = lang_cls()
 
     msg.divider("Data format validation")
 
-    # Load the data in one – might take a while but okay in this case
-    train_data = _load_file(train_path, msg)
-    dev_data = _load_file(dev_path, msg)
     # Validate data format using the JSON schema
     # TODO: update once the new format is ready
+    # TODO: move validation to GoldCorpus in order to be able to load from dir
     train_data_errors = []  # TODO: validate_json
     dev_data_errors = []  # TODO: validate_json
     if not train_data_errors:
@@ -88,18 +89,34 @@ def debug_data(
         sys.exit(1)
 
     # Create the gold corpus to be able to better analyze data
-    with msg.loading("Analyzing corpus..."):
-        train_data = read_json_object(train_data)
-        dev_data = read_json_object(dev_data)
-        corpus = GoldCorpus(train_data, dev_data)
-        train_docs = list(corpus.train_docs(nlp))
-        dev_docs = list(corpus.dev_docs(nlp))
+    loading_train_error_message = ""
+    loading_dev_error_message = ""
+    with msg.loading("Loading corpus..."):
+        corpus = GoldCorpus(train_path, dev_path)
+        try:
+            train_docs = list(corpus.train_docs(nlp))
+            train_docs_unpreprocessed = list(corpus.train_docs_without_preprocessing(nlp))
+        except ValueError as e:
+            loading_train_error_message = "Training data cannot be loaded: {}".format(str(e))
+        try:
+            dev_docs = list(corpus.dev_docs(nlp))
+        except ValueError as e:
+            loading_dev_error_message = "Development data cannot be loaded: {}".format(str(e))
+    if loading_train_error_message or loading_dev_error_message:
+        if loading_train_error_message:
+            msg.fail(loading_train_error_message)
+        if loading_dev_error_message:
+            msg.fail(loading_dev_error_message)
+        sys.exit(1)
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_data = _compile_gold(train_docs, pipeline)
-    train_texts = gold_data["texts"]
-    dev_texts = set([doc.text for doc, gold in dev_docs])
+    gold_train_data = _compile_gold(train_docs, pipeline)
+    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
+    gold_dev_data = _compile_gold(dev_docs, pipeline)
+
+    train_texts = gold_train_data["texts"]
+    dev_texts = gold_dev_data["texts"]
 
     msg.divider("Training stats")
     msg.text("Training pipeline: {}".format(", ".join(pipeline)))
@@ -133,13 +150,21 @@ def debug_data(
         )
     )
     msg.divider("Vocab & Vectors")
-    n_words = gold_data["n_words"]
+    n_words = gold_train_data["n_words"]
     msg.info(
         "{} total {} in the data ({} unique)".format(
-            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
+            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
        )
     )
-    most_common_words = gold_data["words"].most_common(10)
+    if gold_train_data["n_misaligned_words"] > 0:
+        msg.warn(
+            "{} misaligned tokens in the training data".format(gold_train_data["n_misaligned_words"])
+        )
+    if gold_dev_data["n_misaligned_words"] > 0:
+        msg.warn(
+            "{} misaligned tokens in the dev data".format(gold_dev_data["n_misaligned_words"])
+        )
+    most_common_words = gold_train_data["words"].most_common(10)
     msg.text(
         "10 most common words: {}".format(
             _format_labels(most_common_words, counts=True)
@@ -159,8 +184,8 @@ def debug_data(
 
     if "ner" in pipeline:
         # Get all unique NER labels present in the data
-        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
-        label_counts = gold_data["ner"]
+        labels = set(label for label in gold_train_data["ner"] if label not in ("O", "-"))
+        label_counts = gold_train_data["ner"]
         model_labels = _get_labels_from_model(nlp, "ner")
         new_labels = [l for l in labels if l not in model_labels]
         existing_labels = [l for l in labels if l in model_labels]
@@ -196,8 +221,8 @@ def debug_data(
             "Existing: {}".format(_format_labels(existing_labels)), show=verbose
         )
 
-        if gold_data["ws_ents"]:
-            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
+        if gold_train_data["ws_ents"]:
+            msg.fail("{} invalid whitespace entity spans".format(gold_train_data["ws_ents"]))
             has_ws_ents_error = True
 
         for label in new_labels:
@@ -227,7 +252,7 @@ def debug_data(
         if has_low_data_warning:
             msg.text(
                 "To train a new entity type, your data should include at "
-                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
+                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
                 show=verbose,
             )
         if has_no_neg_warning:
@@ -245,7 +270,7 @@ def debug_data(
 
     if "textcat" in pipeline:
         msg.divider("Text Classification")
-        labels = [label for label in gold_data["textcat"]]
+        labels = [label for label in gold_train_data["textcat"]]
         model_labels = _get_labels_from_model(nlp, "textcat")
         new_labels = [l for l in labels if l not in model_labels]
         existing_labels = [l for l in labels if l in model_labels]
@@ -256,7 +281,7 @@ def debug_data(
         )
         if new_labels:
             labels_with_counts = _format_labels(
-                gold_data["textcat"].most_common(), counts=True
+                gold_train_data["textcat"].most_common(), counts=True
             )
             msg.text("New: {}".format(labels_with_counts), show=verbose)
         if existing_labels:
@@ -266,7 +291,7 @@ def debug_data(
 
     if "tagger" in pipeline:
         msg.divider("Part-of-speech Tagging")
-        labels = [label for label in gold_data["tags"]]
+        labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.Defaults.tag_map
         msg.info(
             "{} {} in data ({} {} in tag map)".format(
@@ -277,7 +302,7 @@ def debug_data(
             )
         )
         labels_with_counts = _format_labels(
-            gold_data["tags"].most_common(), counts=True
+            gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
         non_tagmap = [l for l in labels if l not in tag_map]
@@ -292,17 +317,132 @@ def debug_data(
 
     if "parser" in pipeline:
         msg.divider("Dependency Parsing")
-        labels = [label for label in gold_data["deps"]]
+
+        # profile sentence length
         msg.info(
-            "{} {} in data".format(
-                len(labels), "label" if len(labels) == 1 else "labels"
+            "Found {} sentence{} with an average length of {:.1f} words.".format(
+                gold_train_data["n_sents"],
+                "s" if len(train_docs) > 1 else "",
+                gold_train_data["n_words"] / gold_train_data["n_sents"]
             )
         )
+
+        # profile labels
+        labels_train = [label for label in gold_train_data["deps"]]
+        labels_train_unpreprocessed = [label for label in gold_train_unpreprocessed_data["deps"]]
+        labels_dev = [label for label in gold_dev_data["deps"]]
+
+        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
+            msg.info(
+                "Found {} nonprojective train sentence{}".format(
+                    gold_train_unpreprocessed_data["n_nonproj"],
+                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else ""
+                )
+            )
+        if gold_dev_data["n_nonproj"] > 0:
+            msg.info(
+                "Found {} nonprojective dev sentence{}".format(
+                    gold_dev_data["n_nonproj"],
+                    "s" if gold_dev_data["n_nonproj"] > 1 else ""
+                )
+            )
+
+        msg.info(
+            "{} {} in train data".format(
+                len(labels_train_unpreprocessed), "label" if len(labels_train) == 1 else "labels"
+            )
+        )
+        msg.info(
+            "{} {} in projectivized train data".format(
+                len(labels_train), "label" if len(labels_train) == 1 else "labels"
+            )
+        )
+
         labels_with_counts = _format_labels(
-            gold_data["deps"].most_common(), counts=True
+            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
 
+        # rare labels in train
+        for label in gold_train_unpreprocessed_data["deps"]:
+            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
+                msg.warn(
+                    "Low number of examples for label '{}' ({})".format(
+                        label, gold_train_unpreprocessed_data["deps"][label]
+                    )
+                )
+                has_low_data_warning = True
+
+
+        # rare labels in projectivized train
+        rare_projectivized_labels = []
+        for label in gold_train_data["deps"]:
+            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
+                rare_projectivized_labels.append("{}: {}".format(label, str(gold_train_data["deps"][label])))
+
+        if len(rare_projectivized_labels) > 0:
+            msg.warn(
+                "Low number of examples for {} label{} in the "
+                "projectivized dependency trees used for training. You may "
+                "want to projectivize labels such as punct before "
+                "training in order to improve parser performance.".format(
+                    len(rare_projectivized_labels),
+                    "s" if len(rare_projectivized_labels) > 1 else "")
+            )
+            msg.warn(
+                "Projectivized labels with low numbers of examples: "
+                "{}".format("\n".join(rare_projectivized_labels)),
+                show=verbose
+            )
+            has_low_data_warning = True
+
+        # labels only in train
+        if set(labels_train) - set(labels_dev):
+            msg.warn(
+                "The following labels were found only in the train data: "
+                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
+                show=verbose
+            )
+
+        # labels only in dev
+        if set(labels_dev) - set(labels_train):
+            msg.warn(
+                "The following labels were found only in the dev data: " +
+                ", ".join(set(labels_dev) - set(labels_train)),
+                show=verbose
+            )
+
+        if has_low_data_warning:
+            msg.text(
+                "To train a parser, your data should include at "
+                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
+                show=verbose,
+            )
+
+        # multiple root labels
+        if len(gold_train_unpreprocessed_data["roots"]) > 1:
+            msg.warn(
+                "Multiple root labels ({}) ".format(", ".join(gold_train_unpreprocessed_data["roots"])) +
+                "found in training data. spaCy's parser uses a single root "
+                "label ROOT so this distinction will not be available."
+            )
+
+        # these should not happen, but just in case
+        if gold_train_data["n_nonproj"] > 0:
+            msg.fail(
+                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
+                    "s" if gold_train_data["n_nonproj"] > 1 else ""
+                )
+            )
+        if gold_train_data["n_cycles"] > 0:
+            msg.fail(
+                "Found {} projectivized train sentence{} with cycles".format(
+                    gold_train_data["n_cycles"],
+                    "s" if gold_train_data["n_cycles"] > 1 else ""
+                )
+            )
+
     msg.divider("Summary")
     good_counts = msg.counts[MESSAGES.GOOD]
     warn_counts = msg.counts[MESSAGES.WARN]
@@ -350,16 +490,25 @@ def _compile_gold(train_docs, pipeline):
         "tags": Counter(),
         "deps": Counter(),
         "words": Counter(),
+        "roots": Counter(),
         "ws_ents": 0,
         "n_words": 0,
+        "n_misaligned_words": 0,
+        "n_sents": 0,
+        "n_nonproj": 0,
+        "n_cycles": 0,
         "texts": set(),
     }
     for doc, gold in train_docs:
-        data["words"].update(gold.words)
-        data["n_words"] += len(gold.words)
+        valid_words = [x for x in gold.words if x is not None]
+        data["words"].update(valid_words)
+        data["n_words"] += len(valid_words)
+        data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
+                if label is None:
+                    continue
                 if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
                     # "Illegal" whitespace entity
                     data["ws_ents"] += 1
@@ -371,9 +520,17 @@ def _compile_gold(train_docs, pipeline):
         if "textcat" in pipeline:
             data["cats"].update(gold.cats)
         if "tagger" in pipeline:
-            data["tags"].update(gold.tags)
+            data["tags"].update([x for x in gold.tags if x is not None])
         if "parser" in pipeline:
-            data["deps"].update(gold.labels)
+            data["deps"].update([x for x in gold.labels if x is not None])
+            for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
+                if head == i:
+                    data["roots"].update([dep])
+                    data["n_sents"] += 1
+            if nonproj.is_nonproj_tree(gold.heads):
+                data["n_nonproj"] += 1
+            if nonproj.contains_cycle(gold.heads):
+                data["n_cycles"] += 1
     return data
 
 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 64c2d9772..f6ec8d3fa 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -216,6 +216,10 @@ class GoldCorpus(object):
                                         make_projective=True)
         yield from gold_docs
 
+    def train_docs_without_preprocessing(self, nlp, gold_preproc=False):
+        gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc)
+        yield from gold_docs
+
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc)
         yield from gold_docs
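
For reference, a minimal sketch of how the GoldCorpus iterator added in spacy/gold.pyx above might be used outside the debug-data command. The blank "en" pipeline and the train.json/dev.json paths are illustrative placeholders, not taken from this patch:

    import spacy
    from spacy.gold import GoldCorpus

    nlp = spacy.blank("en")  # assumed language; any pipeline with a tokenizer works
    corpus = GoldCorpus("train.json", "dev.json")  # illustrative paths to training/dev data

    # Preprocessed training docs (shuffled, projectivized) as used for training;
    # per the patch, loading may raise ValueError if the data cannot be read.
    train_docs = list(corpus.train_docs(nlp))

    # Original training docs without shuffling or projectivization,
    # via the iterator introduced by this patch.
    train_docs_unpreprocessed = list(corpus.train_docs_without_preprocessing(nlp))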