From 66016ac289e2947b1929fc25ddfb7c2176e514d2 Mon Sep 17 00:00:00 2001
From: Sofie
Date: Sun, 27 Jan 2019 06:01:02 +0100
Subject: [PATCH] Batch UD evaluation script (#3174)

* running UD eval
* printing timing of tokenizer: tokens per second
* timing of default English model
* structured output and parameterization to compare different runs
* additional flag to allow evaluation without parsing info
* printing verbose log of errors for manual inspection
* printing over- and undersegmented cases (and combos)
* add under- and oversegmented numbers to Score and structured output
* print high-freq over/undersegmented words and word shapes
* printing examples as part of the structured output
* print the results to file
* batch run of different models and treebanks per language
* cleaning up code
* command-line script to process all languages in spaCy & UD
* heuristic to remove blinded corpora and option to run only the single best treebank per language
* pathlib instead of os for file paths
---
 spacy/cli/ud/conll17_ud_eval.py |  82 ++++++---
 spacy/cli/ud/run_eval.py        | 287 ++++++++++++++++++++++++++++++++
 2 files changed, 349 insertions(+), 20 deletions(-)
 create mode 100644 spacy/cli/ud/run_eval.py
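For orientation: the conll17_ud_eval.py changes below make each metric report four fields (precision, recall, F1 and, for aligned words, an aligned accuracy), plus the new under- and oversegmentation statistics. A minimal sketch of how the four fields relate, with made-up counts (illustration only, not part of the patch):

    # hypothetical counts: 90 predicted spans, 100 gold spans, 80 exact matches,
    # and 85 of 95 aligned words carrying the correct annotation
    system_total, gold_total, correct = 90, 100, 80
    aligned_total, aligned_correct = 95, 85
    precision = correct / system_total                      # ~0.89
    recall = correct / gold_total                           # 0.80
    f1 = 2 * correct / (system_total + gold_total)          # ~0.84
    aligned_accuracy = aligned_correct / aligned_total      # ~0.89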
diff --git a/spacy/cli/ud/conll17_ud_eval.py b/spacy/cli/ud/conll17_ud_eval.py
index 2f8e632f0..78a976a6d 100644
--- a/spacy/cli/ud/conll17_ud_eval.py
+++ b/spacy/cli/ud/conll17_ud_eval.py
@@ -51,7 +51,8 @@
 # - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
 # - raises UDError if the concatenated tokens of gold and system file do not match
 # - returns a dictionary with the metrics described above, each metric having
-#   three fields: precision, recall and f1
+#   four fields: precision, recall, f1 and aligned_accuracy (when using aligned
+#   words, otherwise this is None)
 
 # Description of token matching
 # -----------------------------
@@ -97,7 +98,7 @@ class UDError(Exception):
     pass
 
 # Load given CoNLL-U file into internal representation
-def load_conllu(file):
+def load_conllu(file, check_parse=True):
     # Internal representation classes
     class UDRepresentation:
         def __init__(self):
@@ -181,8 +182,9 @@ def load_conllu(file):
                 process_word(word)
 
             # Check there is a single root node
-            if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
-                raise UDError("There are multiple roots in a sentence")
+            if check_parse:
+                if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
+                    raise UDError("There are multiple roots in a sentence")
 
             # End the sentence
             ud.sentences[-1].end = index
@@ -198,7 +200,7 @@ def load_conllu(file):
         if "." in columns[ID]:
             continue
 
-        # Delete spaces from FORM so gold.characters == system.characters 
+        # Delete spaces from FORM so gold.characters == system.characters
         # even if one of them tokenizes the space.
         columns[FORM] = columns[FORM].replace(" ", "")
         if not columns[FORM]:
@@ -247,13 +249,17 @@ def load_conllu(file):
     return ud
 
 # Evaluate the gold and system treebanks (loaded using load_conllu).
-def evaluate(gold_ud, system_ud, deprel_weights=None):
+def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
     class Score:
-        def __init__(self, gold_total, system_total, correct, aligned_total=None):
+        def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None):
             self.precision = correct / system_total if system_total else 0.0
             self.recall = correct / gold_total if gold_total else 0.0
             self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
             self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
+            self.undersegmented = undersegmented
+            self.oversegmented = oversegmented
+            self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0
+            self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0
     class AlignmentWord:
         def __init__(self, gold_word, system_word):
             self.gold_word = gold_word
@@ -286,17 +292,43 @@ def evaluate(gold_ud, system_ud, deprel_weights=None):
 
     def spans_score(gold_spans, system_spans):
         correct, gi, si = 0, 0, 0
+        undersegmented = list()
+        oversegmented = list()
+        combo = 0
+        previous_end_si_earlier = False
+        previous_end_gi_earlier = False
         while gi < len(gold_spans) and si < len(system_spans):
+            previous_si = system_spans[si-1] if si > 0 else None
+            previous_gi = gold_spans[gi-1] if gi > 0 else None
             if system_spans[si].start < gold_spans[gi].start:
+                # avoid counting the same mistake twice
+                if not previous_end_si_earlier:
+                    combo += 1
+                    oversegmented.append(str(previous_gi).strip())
                 si += 1
             elif gold_spans[gi].start < system_spans[si].start:
+                # avoid counting the same mistake twice
+                if not previous_end_gi_earlier:
+                    combo += 1
+                    undersegmented.append(str(previous_si).strip())
                 gi += 1
             else:
                 correct += gold_spans[gi].end == system_spans[si].end
+                if gold_spans[gi].end < system_spans[si].end:
+                    undersegmented.append(str(system_spans[si]).strip())
+                    previous_end_gi_earlier = True
+                    previous_end_si_earlier = False
+                elif gold_spans[gi].end > system_spans[si].end:
+                    oversegmented.append(str(gold_spans[gi]).strip())
+                    previous_end_si_earlier = True
+                    previous_end_gi_earlier = False
+                else:
+                    previous_end_gi_earlier = False
+                    previous_end_si_earlier = False
                 si += 1
                 gi += 1
-        return Score(len(gold_spans), len(system_spans), correct)
+        return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented)
 
     def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
         gold, system, aligned, correct = 0, 0, 0, 0
@@ -425,18 +457,28 @@ def evaluate(gold_ud, system_ud, deprel_weights=None):
     alignment = align_words(gold_ud.words, system_ud.words)
 
     # Compute the F1-scores
-    result = {
-        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
-        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
-        "Words": alignment_score(alignment, None),
-        "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
-        "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
-        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
-        "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
-        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
-        "UAS": alignment_score(alignment, lambda w, parent: parent),
-        "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
-    }
+    if check_parse:
+        result = {
+            "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
+            "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
+            "Words": alignment_score(alignment, None),
+            "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
+            "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
+            "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
+            "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
+            "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
+            "UAS": alignment_score(alignment, lambda w, parent: parent),
+            "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
+        }
+    else:
+        result = {
+            "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
+            "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
+            "Words": alignment_score(alignment, None),
+            "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
+            "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
+        }
 
     # Add WeightedLAS if weights are given
     if deprel_weights is not None:
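A minimal sketch of driving the extended API directly, assuming two hypothetical files gold.conllu and system.conllu that tokenize the same underlying text (with check_parse=False, files without dependency annotation no longer trip the single-root check in load_conllu):

    from spacy.cli.ud import conll17_ud_eval

    with open("gold.conllu", "r", encoding="utf-8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file, check_parse=False)
    with open("system.conllu", "r", encoding="utf-8") as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=False)

    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=False)
    tokens = scores["Tokens"]
    # under_perc/over_perc are the new Score fields: fractions of gold tokens
    print("Tokens F1 %.2f, undersegmented %.4f, oversegmented %.4f"
          % (tokens.f1, tokens.under_perc, tokens.over_perc))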
diff --git a/spacy/cli/ud/run_eval.py b/spacy/cli/ud/run_eval.py
new file mode 100644
index 000000000..171687980
--- /dev/null
+++ b/spacy/cli/ud/run_eval.py
@@ -0,0 +1,287 @@
+import spacy
+import time
+import re
+import plac
+import operator
+import datetime
+from pathlib import Path
+import xml.etree.ElementTree as ET
+
+from spacy.cli.ud import conll17_ud_eval
+from spacy.cli.ud.ud_train import write_conllu
+from spacy.lang.lex_attrs import word_shape
+from spacy.util import get_lang_class
+
+# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
+ALL_LANGUAGES = "ar, ca, da, de, el, en, es, fa, fi, fr, ga, he, hi, hr, hu, id, " \
+                "it, ja, no, nl, pl, pt, ro, ru, sv, tr, ur, vi, zh"
+
+# Non-parsing tasks that will be evaluated (works for default models)
+EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
+
+# Tasks that will be evaluated if check_parse=True (does not work for default models)
+EVAL_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats', 'UPOS', 'XPOS', 'AllTags', 'UAS', 'LAS']
+
+# Minimum frequency an error should have to be printed
+PRINT_FREQ = 20
+
+# Maximum number of errors printed per category
+PRINT_TOTAL = 10
+
+space_re = re.compile(r"\s+")
+
+
+def load_model(modelname, add_sentencizer=False):
+    """ Load a specific spaCy model """
+    loading_start = time.time()
+    nlp = spacy.load(modelname)
+    if add_sentencizer:
+        nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    loading_end = time.time()
+    loading_time = loading_end - loading_start
+    if add_sentencizer:
+        return nlp, loading_time, modelname + '_sentencizer'
+    return nlp, loading_time, modelname
+
+
+def load_default_model_sentencizer(lang):
+    """ Load a generic spaCy model and add the sentencizer for sentence tokenization """
+    loading_start = time.time()
+    lang_class = get_lang_class(lang)
+    nlp = lang_class()
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    loading_end = time.time()
+    loading_time = loading_end - loading_start
+    return nlp, loading_time, lang + "_default_" + 'sentencizer'
+
+
+def split_text(text):
+    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
+
+
+def get_freq_tuples(my_list, print_total_threshold):
+    """ Turn a list of errors into frequency-sorted tuples, keeping at most a certain total number """
+    d = {}
+    for token in my_list:
+        d.setdefault(token, 0)
+        d[token] += 1
+    return sorted(d.items(), key=operator.itemgetter(1),
+                  reverse=True)[:print_total_threshold]
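For reference, get_freq_tuples() above is a plain frequency counter over the collected error strings; a quick illustration (not part of the patch):

    get_freq_tuples(['of', 'the', 'of', 'to', 'of'], 2)
    # -> [('of', 3), ('the', 1)]

It behaves like collections.Counter(my_list).most_common(print_total_threshold).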
+
+
+def _contains_blinded_text(stats_xml):
+    """ Heuristic to determine whether the treebank has blinded texts or not """
+    tree = ET.parse(stats_xml)
+    root = tree.getroot()
+    total_tokens = int(root.find('size/total/tokens').text)
+    unique_lemmas = int(root.find('lemmas').get('unique'))
+
+    # assume the corpus is largely blinded when unique lemmas make up fewer than 1% of the tokens
+    return (unique_lemmas / total_tokens) < 0.01
+
+
+def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language):
+    """ Fetch the txt files for all treebanks for a given set of languages """
+    all_treebanks = dict()
+    treebank_size = dict()
+    for l in languages:
+        all_treebanks[l] = []
+        treebank_size[l] = 0
+
+    for treebank_dir in ud_dir.iterdir():
+        if treebank_dir.is_dir():
+            for txt_path in treebank_dir.iterdir():
+                if txt_path.name.endswith('-ud-' + corpus + '.txt'):
+                    file_lang = txt_path.name.split('_')[0]
+                    if file_lang in languages:
+                        gold_path = treebank_dir / txt_path.name.replace('.txt', '.conllu')
+                        stats_xml = treebank_dir / "stats.xml"
+                        # ignore treebanks where the texts are not publicly available
+                        if not _contains_blinded_text(stats_xml):
+                            if not best_per_language:
+                                all_treebanks[file_lang].append(txt_path)
+                            # check the tokens in the gold annotation to keep only the biggest treebank per language
+                            else:
+                                with gold_path.open(mode='r', encoding='utf-8') as gold_file:
+                                    gold_ud = conll17_ud_eval.load_conllu(gold_file)
+                                    gold_tokens = len(gold_ud.tokens)
+                                if treebank_size[file_lang] < gold_tokens:
+                                    all_treebanks[file_lang] = [txt_path]
+                                    treebank_size[file_lang] = gold_tokens
+
+    return all_treebanks
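fetch_all_treebanks() above assumes the standard layout of a Universal Dependencies release: each treebank directory contains parallel .txt/.conllu files plus a stats.xml, with the language code preceding the first underscore in the file name. A hypothetical example of what the code expects under ud_dir:

    ud-treebanks-v2.x/
        UD_English-EWT/
            en_ewt-ud-train.txt      # raw text, matched by '-ud-' + corpus + '.txt'
            en_ewt-ud-train.conllu   # gold annotation with the same stem
            stats.xml                # read by _contains_blinded_text()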
+
+
+def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path, out_file, print_header,
+                    check_parse, print_freq_tasks):
+    """ Run an evaluation of a model on a specified treebank """
+    with text_path.open(mode='r', encoding='utf-8') as f:
+        flat_text = f.read()
+
+    # STEP 1: tokenize text
+    tokenization_start = time.time()
+    texts = split_text(flat_text)
+    docs = list(nlp.pipe(texts))
+    tokenization_end = time.time()
+    tokenization_time = tokenization_end - tokenization_start
+
+    # STEP 2: record stats and timings
+    tokens_per_s = int(len(gold_ud.tokens) / tokenization_time)
+
+    print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time', 'tokenization_time', 'tokens_per_s']
+    print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens),
+                      print_name, "%.2f" % loading_time, "%.2f" % tokenization_time, tokens_per_s]
+
+    # STEP 3: evaluate predicted tokens and features
+    with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file:
+        write_conllu(docs, tmp_out_file)
+    with tmp_output_path.open(mode="r", encoding="utf8") as sys_file:
+        sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse)
+    tmp_output_path.unlink()
+    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse)
+
+    # STEP 4: format the scoring results
+    eval_headers = EVAL_PARSE
+    if not check_parse:
+        eval_headers = EVAL_NO_PARSE
+
+    for score_name in eval_headers:
+        score = scores[score_name]
+        print_string_1.extend(["%.2f" % score.precision,
+                               "%.2f" % score.recall,
+                               "%.2f" % score.f1])
+        print_string_1.append("-" if score.aligned_accuracy is None else "%.2f" % score.aligned_accuracy)
+        print_string_1.append("-" if score.undersegmented is None else "%.4f" % score.under_perc)
+        print_string_1.append("-" if score.oversegmented is None else "%.4f" % score.over_perc)
+
+        print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F', score_name + '_acc',
+                               score_name + '_under', score_name + '_over'])
+
+        if score_name in print_freq_tasks:
+            print_header_1.extend([score_name + '_word_under_ex', score_name + '_shape_under_ex',
+                                   score_name + '_word_over_ex', score_name + '_shape_over_ex'])
+
+            d_under_words = get_freq_tuples(score.undersegmented, PRINT_TOTAL)
+            d_under_shapes = get_freq_tuples([word_shape(x) for x in score.undersegmented], PRINT_TOTAL)
+            d_over_words = get_freq_tuples(score.oversegmented, PRINT_TOTAL)
+            d_over_shapes = get_freq_tuples([word_shape(x) for x in score.oversegmented], PRINT_TOTAL)
+
+            # the CSV uses ';' as separator, so mask any ';' occurring in the example output
+            print_string_1.append(
+                str({k: v for k, v in d_under_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
+            print_string_1.append(
+                str({k: v for k, v in d_under_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
+            print_string_1.append(
+                str({k: v for k, v in d_over_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
+            print_string_1.append(
+                str({k: v for k, v in d_over_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
+
+    # STEP 5: print the formatted results to CSV
+    if print_header:
+        out_file.write(';'.join(map(str, print_header_1)) + '\n')
+    out_file.write(';'.join(map(str, print_string_1)) + '\n')
+
+
+def run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks):
+    """ Run an evaluation for each language with its specified models and treebanks """
+    print_header = True
+
+    for tb_lang, treebank_list in treebanks.items():
+        print()
+        print("Language", tb_lang)
+        for text_path in treebank_list:
+            print(" Evaluating on", text_path)
+
+            gold_path = text_path.parent / (text_path.stem + '.conllu')
+            print("  Gold data from", gold_path)
+
+            # nested try blocks to ensure the code can continue with the next iteration after a failure
+            try:
+                with gold_path.open(mode='r', encoding='utf-8') as gold_file:
+                    gold_ud = conll17_ud_eval.load_conllu(gold_file)
+
+                for nlp, nlp_loading_time, nlp_name in models[tb_lang]:
+                    try:
+                        print("   Benchmarking", nlp_name)
+                        tmp_output_path = text_path.parent / str('tmp_' + nlp_name + '.conllu')
+                        run_single_eval(nlp, nlp_loading_time, nlp_name, text_path, gold_ud, tmp_output_path, out_file,
+                                        print_header, check_parse, print_freq_tasks)
+                        print_header = False
+                    except Exception as e:
+                        print("    Ran into trouble: ", str(e))
+            except Exception as e:
+                print("    Ran into trouble: ", str(e))
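run_all_evals() above expects models to map each UD language code to a list of (nlp, loading_time, name) tuples, which is exactly what load_model() and load_default_model_sentencizer() return. A minimal hand-rolled invocation, as a sketch with a hypothetical treebank path:

    from pathlib import Path

    models = {'en': [load_model('en_core_web_sm')]}
    treebanks = {'en': [Path('ud-treebanks-v2.x/UD_English-EWT/en_ewt-ud-train.txt')]}
    with Path('results.csv').open(mode='w', encoding='utf-8') as out_file:
        run_all_evals(models, treebanks, out_file, check_parse=False, print_freq_tasks=['Tokens'])

The main() entry point defined next builds these two dictionaries automatically.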
+
+
+@plac.annotations(
+    out_path=("Path to output CSV file", "positional", None, Path),
+    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    check_parse=("Set flag to evaluate parsing performance", "flag", "p", bool),
+    langs=("Enumeration of languages to evaluate (default: all)", "option", "l", str),
+    exclude_trained_models=("Set flag to exclude trained models", "flag", "t", bool),
+    exclude_multi=("Set flag to exclude the multi-language model as default baseline", "flag", "m", bool),
+    hide_freq=("Set flag to avoid printing out more detailed high-freq tokenization errors", "flag", "f", bool),
+    corpus=("Whether to run on train, dev or test", "option", "c", str),
+    best_per_language=("Set flag to only keep the largest treebank for each language", "flag", "b", bool)
+)
+def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_trained_models=False, exclude_multi=False,
+         hide_freq=False, corpus='train', best_per_language=False):
+    """
+    Assemble all treebanks and models to run evaluations with.
+    When setting check_parse to True, the default models are not evaluated, as they don't have parsing functionality.
+    """
+    languages = [lang.strip() for lang in langs.split(",")]
+
+    print_freq_tasks = []
+    if not hide_freq:
+        print_freq_tasks = ['Tokens']
+
+    # fetch all relevant treebanks from the directory
+    treebanks = fetch_all_treebanks(ud_dir, languages, corpus, best_per_language)
+
+    print()
+    print("Loading all relevant models for", languages)
+    models = dict()
+
+    # multi-lang model
+    multi = None
+    if not exclude_multi and not check_parse:
+        multi = load_model('xx_ent_wiki_sm', add_sentencizer=True)
+
+    # initialize all models with the multi-lang model
+    for lang in languages:
+        models[lang] = [multi] if multi else []
+        # add default models if we don't want to evaluate parsing info
+        if not check_parse:
+            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
+            if lang == 'no':
+                models['no'].append(load_default_model_sentencizer('nb'))
+            else:
+                models[lang].append(load_default_model_sentencizer(lang))
+
+    # language-specific trained models
+    if not exclude_trained_models:
+        if 'de' in models:
+            models['de'].append(load_model('de_core_news_sm'))
+        if 'es' in models:
+            models['es'].append(load_model('es_core_news_sm'))
+            models['es'].append(load_model('es_core_news_md'))
+        if 'pt' in models:
+            models['pt'].append(load_model('pt_core_news_sm'))
+        if 'it' in models:
+            models['it'].append(load_model('it_core_news_sm'))
+        if 'nl' in models:
+            models['nl'].append(load_model('nl_core_news_sm'))
+        if 'en' in models:
+            models['en'].append(load_model('en_core_web_sm'))
+            models['en'].append(load_model('en_core_web_md'))
+            models['en'].append(load_model('en_core_web_lg'))
+        if 'fr' in models:
+            models['fr'].append(load_model('fr_core_news_sm'))
+            models['fr'].append(load_model('fr_core_news_md'))
+
+    with out_path.open(mode='w', encoding='utf-8') as out_file:
+        run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
+
+
+if __name__ == "__main__":
+    plac.call(main)
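Given the plac annotations above, a typical invocation of the script might look like this (paths and language selection are illustrative):

    python run_eval.py results.csv /data/ud-treebanks-v2.x -c dev -b -l "en, de, fr"

This writes one semicolon-separated row per evaluated (model, treebank) pair to results.csv; adding -p restricts the run to models with a parser and additionally scores UPOS, XPOS, AllTags, UAS and LAS.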