partial annotation support

This commit is contained in:
Peter Baumgartner 2023-01-12 09:10:39 -05:00
parent 28c31048c9
commit a944e55291

View File

@ -708,11 +708,23 @@ def debug_data(
n = gold_train_data["no_lemma_annotations"] n = gold_train_data["no_lemma_annotations"]
msg.warn(f"{n} docs with no lemma annotations.") msg.warn(f"{n} docs with no lemma annotations.")
else: else:
msg.good("All training docs have complete lemma annotations.") msg.good("All training docs have lemma annotations.")
if gold_dev_data["no_lemma_annotations"] > 0: if gold_dev_data["no_lemma_annotations"] > 0:
n = gold_dev_data["no_lemma_annotations"] n = gold_dev_data["no_lemma_annotations"]
msg.warn(f"{n} docs with no lemma annotations.") msg.warn(f"{n} docs with no lemma annotations.")
else:
msg.good("All dev docs have lemma annotations.")
if gold_train_data["partial_lemma_annotations"] > 0:
n = gold_train_data["partial_lemma_annotations"]
msg.info(f"{n} docs with partial lemma annotations.")
else:
msg.good("All training docs have complete lemma annotations.")
if gold_dev_data["partial_lemma_annotations"] > 0:
n = gold_dev_data["partial_lemma_annotations"]
msg.info(f"{n} docs with partial lemma annotations.")
else: else:
msg.good("All dev docs have complete lemma annotations.") msg.good("All dev docs have complete lemma annotations.")
@ -779,6 +791,7 @@ def _compile_gold(
"texts": set(), "texts": set(),
"lemmatizer_trees": set(), "lemmatizer_trees": set(),
"no_lemma_annotations": 0, "no_lemma_annotations": 0,
"partial_lemma_annotations": 0,
"n_low_cardinality_lemmas": 0, "n_low_cardinality_lemmas": 0,
} }
if "trainable_lemmatizer" in factory_names: if "trainable_lemmatizer" in factory_names:
@ -916,19 +929,21 @@ def _compile_gold(
# from EditTreeLemmatizer._labels_from_data # from EditTreeLemmatizer._labels_from_data
if all(token.lemma == 0 for token in gold): if all(token.lemma == 0 for token in gold):
data["no_lemma_annotations"] += 1 data["no_lemma_annotations"] += 1
else: continue
lemma_set = set() if any(token.lemma == 0 for token in gold):
for token in gold: data["partial_lemma_annotations"] += 1
if token.lemma != 0: lemma_set = set()
lemma_set.add(token.lemma) for token in gold:
tree_id = trees.add(token.text, token.lemma_) if token.lemma != 0:
tree_str = trees.tree_to_str(tree_id) lemma_set.add(token.lemma)
data["lemmatizer_trees"].add(tree_str) tree_id = trees.add(token.text, token.lemma_)
# We want to identify cases where lemmas aren't assigned tree_str = trees.tree_to_str(tree_id)
# or are all assigned the same value, as this would indicate data["lemmatizer_trees"].add(tree_str)
# an issue since we're expecting a large set of lemmas # We want to identify cases where lemmas aren't assigned
if len(lemma_set) < 2 and len(gold) > 1: # or are all assigned the same value, as this would indicate
data["n_low_cardinality_lemmas"] += 1 # an issue since we're expecting a large set of lemmas
if len(lemma_set) < 2 and len(gold) > 1:
data["n_low_cardinality_lemmas"] += 1
return data return data