mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 13:47:13 +03:00
615 lines
27 KiB
Python
615 lines
27 KiB
Python
#!/usr/bin/env python
|
|
# flake8: noqa
|
|
|
|
# CoNLL 2017 UD Parsing evaluation script.
|
|
#
|
|
# Compatible with Python 2.7 and 3.2+, can be used either as a module
|
|
# or a standalone executable.
|
|
#
|
|
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
|
|
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
|
|
#
|
|
# Changelog:
|
|
# - [02 Jan 2017] Version 0.9: Initial release
|
|
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
|
|
# - [10 Mar 2017] Version 1.0: Add documentation and test
|
|
# Compare HEADs correctly using aligned words
|
|
# Allow evaluation with erroneous spaces in forms
|
|
# Compare forms in LCS case insensitively
|
|
# Detect cycles and multiple root nodes
|
|
# Compute AlignedAccuracy
|
|
|
|
# Command line usage
|
|
# ------------------
|
|
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
|
|
#
|
|
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metric
|
|
# is printed
|
|
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
|
|
# and in case the metric is computed on aligned words also accuracy on these):
|
|
# - Tokens: how well do the gold tokens match system tokens
|
|
# - Sentences: how well do the gold sentences match system sentences
|
|
# - Words: how well can the gold words be aligned to system words
|
|
# - UPOS: using aligned words, how well does UPOS match
|
|
# - XPOS: using aligned words, how well does XPOS match
|
|
# - Feats: using aligned words, how well does FEATS match
|
|
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
|
|
# - Lemmas: using aligned words, how well does LEMMA match
|
|
# - UAS: using aligned words, how well does HEAD match
|
|
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
|
|
# - if weights_file is given (with lines containing deprel-weight pairs),
|
|
# one more metric is shown:
|
|
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
|
|
|
|
# API usage
|
|
# ---------
|
|
# - load_conllu(file)
|
|
# - loads CoNLL-U file from given file object to an internal representation
|
|
# - the file object should return str on both Python 2 and Python 3
|
|
# - raises UDError exception if the given file cannot be loaded
|
|
# - evaluate(gold_ud, system_ud)
|
|
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
|
|
# - raises UDError if the concatenated tokens of gold and system file do not match
|
|
# - returns a dictionary with the metrics described above, each metric having
|
|
# four fields: precision, recall, f1 and aligned_accuracy (when using aligned
|
|
# words, otherwise this is None)
|
|
|
|
# Description of token matching
|
|
# -----------------------------
|
|
# In order to match tokens of gold file and system file, we consider the text
|
|
# resulting from concatenation of gold tokens and text resulting from
|
|
# concatenation of system tokens. These texts should match -- if they do not,
|
|
# the evaluation fails.
|
|
#
|
|
# If the texts do match, every token is represented as a range in this original
|
|
# text, and tokens are equal only if their range is the same.
|
|
|
|
# Description of word matching
|
|
# ----------------------------
|
|
# When matching words of gold file and system file, we first match the tokens.
|
|
# The words which are also tokens are matched as tokens, but words in multi-word
|
|
# tokens have to be handled differently.
|
|
#
|
|
# To handle multi-word tokens, we start by finding "multi-word spans".
|
|
# Multi-word span is a span in the original text such that
|
|
# - it contains at least one multi-word token
|
|
# - all multi-word tokens in the span (considering both gold and system ones)
|
|
# are completely inside the span (i.e., they do not "stick out")
|
|
# - the multi-word span is as small as possible
|
|
#
|
|
# For every multi-word span, we align the gold and system words completely
|
|
# inside this span using LCS on their FORMs. The words not intersecting
|
|
# (even partially) any multi-word span are then aligned as tokens.
|
|
|
|
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import argparse
|
|
import io
|
|
import sys
|
|
import unittest
|
|
|
|
# CoNLL-U column names: each constant is the zero-based index of that column
# in a tab-separated CoNLL-U line (ID is 0, MISC is 9).
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
|
|
|
|
# UD Error is used when raising exceptions in this module
|
|
class UDError(Exception):
    """Error raised by this module when CoNLL-U data cannot be loaded or evaluated."""
|
|
|
|
# Load given CoNLL-U file into internal representation
|
|
def load_conllu(file, check_parse=True):
    """Load a CoNLL-U file into the internal representation used by `evaluate`.

    Args:
        file: Text file object; it is consumed line by line via `readline()`.
        check_parse: When true, every sentence must contain exactly one root
            word (a word without a parent), otherwise UDError is raised.
            Cycle detection is performed regardless of this flag.

    Returns:
        A UDRepresentation whose `characters`, `tokens`, `words` and
        `sentences` attributes describe the whole file.

    Raises:
        UDError: If a line does not have 10 tab-separated columns, an ID or
            HEAD cannot be parsed, a HEAD is negative or points outside of
            its sentence, a FORM is empty, a sentence contains a cycle or
            (with `check_parse`) does not have exactly one root, or the file
            does not end with an empty line.
    """
    # Internal representation classes
    class UDRepresentation:
        def __init__(self):
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.sentences = []

    class UDSpan:
        def __init__(self, start, end, characters):
            self.start = start
            # Note that self.end marks the first position **after the end** of span,
            # so we can use characters[start:end] or range(start, end).
            self.end = end
            self.characters = characters

        @property
        def text(self):
            return ''.join(self.characters[self.start:self.end])

        def __str__(self):
            return self.text

        def __repr__(self):
            return self.text

    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of this word (or MWT, see below) within ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word token.
            # In that case, self.span marks the span of the whole multi-word token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None if root).
            self.parent = None
            # Let's ignore language-specific deprel subtypes.
            self.columns[DEPREL] = columns[DEPREL].split(':')[0]

    def check_head(columns):
        # Validate the HEAD column of a word line. This is applied both to
        # plain words and to words inside multi-word tokens, so that a bad
        # HEAD is always reported as UDError instead of surfacing later as a
        # ValueError or as a lookup of the wrong word.
        try:
            head_id = int(columns[HEAD])
        except ValueError:
            raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
        if head_id < 0:
            raise UDError("HEAD cannot be negative")

    def process_word(word, sentence_start, linenum):
        # Add the parent UDWord link of `word`, recursing through its
        # ancestors. While an ancestor chain is being resolved, the word's
        # parent is temporarily the marker "remapping"; meeting that marker
        # again means the chain loops back on itself.
        if word.parent == "remapping":
            raise UDError("There is a cycle in a sentence")
        if word.parent is None:
            head = int(word.columns[HEAD])
            if head > len(ud.words) - sentence_start:
                raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
                    linenum, word.columns[HEAD]))
            if head:
                parent = ud.words[sentence_start + head - 1]
                word.parent = "remapping"
                process_word(parent, sentence_start, linenum)
                word.parent = parent

    ud = UDRepresentation()

    # Load the CoNLL-U file
    index, sentence_start = 0, None
    linenum = 0
    while True:
        line = file.readline()
        linenum += 1
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries
        if sentence_start is None:
            # Skip comments
            if line.startswith("#"):
                continue
            # Start a new sentence; its end is filled in when the sentence closes.
            ud.sentences.append(UDSpan(index, 0, ud.characters))
            sentence_start = len(ud.words)

        if not line:
            # Add parent UDWord links and check there are no cycles
            for word in ud.words[sentence_start:]:
                process_word(word, sentence_start, linenum)

            # Check there is a single root node
            if check_parse:
                if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
                    raise UDError("There are multiple roots in a sentence")

            # End the sentence
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))

        # Skip empty nodes
        if "." in columns[ID]:
            continue

        # Delete spaces from FORM so gold.characters == system.characters
        # even if one of them tokenizes the space.
        columns[FORM] = columns[FORM].replace(" ", "")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)

        # Save token
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
        index += len(columns[FORM])

        # Handle multi-word tokens to save word(s)
        if "-" in columns[ID]:
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                # Keep the line counter in sync with the lines consumed here,
                # so that later error messages report the right line.
                linenum += 1
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
                check_head(word_columns)
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic tokens/words
        else:
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))

            check_head(columns)

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud
|
|
|
|
# Evaluate the gold and system treebanks (loaded using load_conllu).
|
|
def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
    """Score a system treebank against a gold treebank.

    Args:
        gold_ud: Gold data as returned by `load_conllu`.
        system_ud: System data as returned by `load_conllu`.
        deprel_weights: Optional dict mapping a DEPREL to its weight. When
            given, a "WeightedLAS" metric is added; relations missing from
            the dict get weight 1.0.
        check_parse: When false, the metrics "UPOS", "XPOS", "AllTags", "UAS"
            and "LAS" are left out of the result.

    Returns:
        A dict mapping metric names to Score objects (attributes `precision`,
        `recall`, `f1`, `aligned_accuracy`, `undersegmented`,
        `oversegmented`, `under_perc`, `over_perc`).

    Raises:
        UDError: If the concatenated token characters of the two treebanks
            differ.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None):
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            # Stays None (or 0) when no aligned total is available.
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
            self.undersegmented = undersegmented
            self.oversegmented = oversegmented
            self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0
            self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word
            self.gold_parent = None
            self.system_parent_gold_aligned = None

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            # Maps an aligned system word to its gold counterpart.
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

        def fill_parents(self):
            # We represent root parents in both gold and system data by '0'.
            # For gold data, we represent non-root parent by corresponding gold word.
            # For system data, we represent non-root parent by either gold word aligned
            # to parent system nodes, or by None if no gold words is aligned to the parent.
            for words in self.matched_words:
                words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
                words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
                    if words.system_word.parent is not None else 0

    def lower(text):
        # On Python 2 a byte string is decoded first so that lowercasing is
        # applied to the decoded text rather than to raw bytes.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        # Walk both span lists in parallel; a span is correct when both its
        # start and its end agree. Texts of spans that the system merged
        # (undersegmented) or split (oversegmented) are collected on the way.
        correct, gi, si = 0, 0, 0
        undersegmented = list()
        oversegmented = list()
        previous_end_si_earlier = False
        previous_end_gi_earlier = False
        while gi < len(gold_spans) and si < len(system_spans):
            previous_si = system_spans[si-1] if si > 0 else None
            previous_gi = gold_spans[gi-1] if gi > 0 else None
            if system_spans[si].start < gold_spans[gi].start:
                # avoid counting the same mistake twice
                if not previous_end_si_earlier:
                    oversegmented.append(str(previous_gi).strip())
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                # avoid counting the same mistake twice
                if not previous_end_gi_earlier:
                    undersegmented.append(str(previous_si).strip())
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                if gold_spans[gi].end < system_spans[si].end:
                    undersegmented.append(str(system_spans[si]).strip())
                    previous_end_gi_earlier = True
                    previous_end_si_earlier = False
                elif gold_spans[gi].end > system_spans[si].end:
                    oversegmented.append(str(gold_spans[gi]).strip())
                    previous_end_si_earlier = True
                    previous_end_gi_earlier = False
                else:
                    previous_end_gi_earlier = False
                    previous_end_si_earlier = False
                si += 1
                gi += 1

        return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented)

    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
        gold, system, aligned, correct = 0, 0, 0, 0

        for word in alignment.gold_words:
            gold += weight_fn(word)

        for word in alignment.system_words:
            system += weight_fn(word)

        for words in alignment.matched_words:
            aligned += weight_fn(words.gold_word)

        if key_fn is None:
            # Return score for whole aligned words
            return Score(gold, system, aligned)

        for words in alignment.matched_words:
            if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
                correct += weight_fn(words.gold_word)

        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else:  # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
                not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # lcs[g][s] is the length of the longest common subsequence of the
        # (case-insensitive) FORMs of gold words from gs+g and system words
        # from ss+s, both taken up to the end of the span.
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        alignment.fill_parents()

        return alignment

    # Check that underlying character sequences do match
    if gold_ud.characters != system_ud.characters:
        # Locate the first differing position. The scan is bounded by the
        # shorter sequence: when one text is a strict prefix of the other
        # there is no differing character inside the common part.
        index = 0
        common_length = min(len(gold_ud.characters), len(system_ud.characters))
        while index < common_length and gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_ud.characters[index:index + 20]),
                "".join(system_ud.characters[index:index + 20])
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    result = {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment, None),
        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
    }
    if check_parse:
        result.update({
            "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
            "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
            "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
            "UAS": alignment_score(alignment, lambda w, parent: parent),
            "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
        })

    # Add WeightedLAS if weights are given
    if deprel_weights is not None:
        def weighted_las(word):
            return deprel_weights.get(word.columns[DEPREL], 1.0)
        result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)

    return result
|
|
|
|
def load_deprel_weights(weights_file):
    """Read deprel weights from an iterable of text lines.

    Lines starting with "#" and blank lines are skipped; every other line
    must hold exactly two whitespace-separated fields, a relation name and a
    number.

    Args:
        weights_file: Iterable yielding lines, or None.

    Returns:
        None when `weights_file` is None, otherwise a dict mapping each
        relation name to its weight as a float.

    Raises:
        ValueError: If a data line does not have exactly two fields, or the
            second field is not a valid float.
    """
    if weights_file is None:
        return None

    weights = {}
    for raw_line in weights_file:
        is_comment = raw_line.startswith("#")
        is_blank = not raw_line.strip()
        if is_comment or is_blank:
            continue

        fields = raw_line.rstrip("\r\n").split()
        if len(fields) == 2:
            relation, weight = fields
            weights[relation] = float(weight)
        else:
            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(raw_line))

    return weights
|
|
|
|
def load_conllu_file(path):
    """Open the CoNLL-U file at `path` and load it with `load_conllu`.

    Args:
        path: Path of the CoNLL-U file to read.

    Returns:
        The representation returned by `load_conllu`.

    Raises:
        UDError: Propagated from `load_conllu` when the content is malformed.
    """
    # The `encoding` argument is passed only on Python 3, matching the
    # version check below; on Python 2 the file is opened without it.
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    # The context manager closes the file even when loading raises.
    with open(path, mode="r", **open_kwargs) as conllu_file:
        return load_conllu(conllu_file)
|
|
|
|
def evaluate_wrapper(args):
    """Load the files named in `args` and evaluate them.

    Args:
        args: Object with the attributes `gold_file` and `system_file`
            (paths) and `weights` (passed to `load_deprel_weights`).

    Returns:
        The metrics dictionary produced by `evaluate`.
    """
    # Gold first, then system, then the optional weights.
    gold_treebank = load_conllu_file(args.gold_file)
    system_treebank = load_conllu_file(args.system_file)
    weights = load_deprel_weights(args.weights)

    return evaluate(gold_treebank, system_treebank, deprel_weights=weights)
|
|
|
|
def main():
    """Command-line entry point: parse arguments, evaluate and print metrics.

    Without -v only the LAS F1 score is printed; with -v (or whenever a
    weights file is given) a table of all metrics is printed.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
                        metavar="deprel_weights_file",
                        help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
    parser.add_argument("--verbose", "-v", default=0, action="count",
                        help="Print all metrics.")
    args = parser.parse_args()

    # Use verbose if weights are supplied
    if args.weights is not None and not args.verbose:
        args.verbose = 1

    # Evaluate; the weights file was opened by argparse, so close it here
    # once it has been consumed (or evaluation has failed).
    try:
        evaluation = evaluate_wrapper(args)
    finally:
        if args.weights is not None:
            args.weights.close()

    # Print the evaluation
    if not args.verbose:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
    else:
        metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
        if args.weights is not None:
            metrics.append("WeightedLAS")

        # Header cells are padded to the 11-character columns used by the
        # separator row and by the data rows below.
        print("Metrics    | Precision |    Recall |  F1 Score | AligndAcc")
        print("-----------+-----------+-----------+-----------+-----------")
        for metric in metrics:
            score = evaluation[metric]
            # Metrics not computed on aligned words have no aligned accuracy.
            if score.aligned_accuracy is not None:
                aligned_accuracy = "{:10.2f}".format(100 * score.aligned_accuracy)
            else:
                aligned_accuracy = ""
            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                metric,
                100 * score.precision,
                100 * score.recall,
                100 * score.f1,
                aligned_accuracy
            ))
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
|
|
class TestAlignment(unittest.TestCase):
    """Checks of the word alignment, expressed through the "Words" metric."""

    @staticmethod
    def _load_words(words):
        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
        word_row = "{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_"
        range_row = "{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_"

        rows = []
        count = 0
        for spec in words:
            pieces = spec.split(" ")
            if len(pieces) > 1:
                # "token w1 w2 ...": emit the multi-word token range line,
                # followed by one line per contained word.
                rows.append(range_row.format(count + 1, count + len(pieces) - 1, pieces[0]))
                forms = pieces[1:]
            else:
                forms = pieces
            for form in forms:
                count += 1
                # The first word is the root (HEAD 0), all others attach to it.
                rows.append(word_row.format(count, form, int(count > 1)))

        buffer_cls = io.StringIO if sys.version_info >= (3, 0) else io.BytesIO
        return load_conllu(buffer_cls("\n".join(rows + ["\n"])))

    def _test_exception(self, gold, system):
        """Assert that evaluating `system` against `gold` raises UDError."""
        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))

    def _test_ok(self, gold, system, correct):
        """Assert the "Words" precision, recall and F1 for `correct` aligned words."""
        def count_words(specs):
            # A plain entry is one word; "token w1 w2 ..." contributes its words only.
            return sum((max(1, len(spec.split(" ")) - 1) for spec in specs))

        metrics = evaluate(self._load_words(gold), self._load_words(system))
        gold_words = count_words(gold)
        system_words = count_words(system)

        words_score = metrics["Words"]
        observed = (words_score.precision, words_score.recall, words_score.f1)
        expected = (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))
        self.assertEqual(observed, expected)

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|