additional fixes

- use a set-based approach to identify unique trees
- adjust line length on messages
- add logic for detecting docs without annotations
This commit is contained in:
Peter Baumgartner 2023-01-11 14:36:28 -05:00
parent c53d57a54e
commit bfb9fa44a0

View File

@ -675,16 +675,17 @@ def debug_data(
msg.divider("Trainable Lemmatizer") msg.divider("Trainable Lemmatizer")
trees_train: Set[str] = gold_train_data["lemmatizer_trees"] trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"] trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
# This is necessary context when someone is attempting to interpret whether the # This is necessary context when someone is attempting to interpret whether the
# number of trees exclusively in the dev set is meaningful. # number of trees exclusively in the dev set is meaningful.
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data.") msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data.") msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
dev_not_train = trees_dev - trees_train dev_not_train = trees_dev - trees_train
if len(dev_not_train) != 0: if len(dev_not_train) != 0:
msg.warn( pct = len(dev_not_train) / len(trees_dev)
f"{len(dev_not_train)} lemmatizer trees were found exclusively in the dev data." msg.info(
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
" were found exclusively in the dev data."
) )
else: else:
# Would we ever expect this case? It seems like it would be pretty rare, # Would we ever expect this case? It seems like it would be pretty rare,
@ -692,19 +693,29 @@ def debug_data(
msg.good("All trees in dev data present in training data.") msg.good("All trees in dev data present in training data.")
if gold_train_data["n_low_cardinality_lemmas"] > 0: if gold_train_data["n_low_cardinality_lemmas"] > 0:
msg.warn( n = gold_train_data["n_low_cardinality_lemmas"]
f"{gold_train_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
)
else: else:
msg.good("All training docs meet lemma uniqueness requirements.") msg.good("All training docs meet lemma uniqueness requirements.")
if gold_dev_data["n_low_cardinality_lemmas"] > 0: if gold_dev_data["n_low_cardinality_lemmas"] > 0:
msg.warn( n = gold_dev_data["n_low_cardinality_lemmas"]
f"{gold_dev_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
)
else: else:
msg.good("All dev docs meet lemma uniqueness requirements.") msg.good("All dev docs meet lemma uniqueness requirements.")
if gold_train_data["no_lemma_annotations"] > 0:
n = gold_train_data["no_lemma_annotations"]
msg.warn(f"{n} docs with no lemma annotations.")
else:
msg.good("All training docs have complete lemma annotations.")
if gold_dev_data["no_lemma_annotations"] > 0:
n = gold_dev_data["no_lemma_annotations"]
msg.warn(f"{n} docs with no lemma annotations.")
else:
msg.good("All dev docs have complete lemma annotations.")
msg.divider("Summary") msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD] good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN] warn_counts = msg.counts[MESSAGES.WARN]
@ -767,6 +778,7 @@ def _compile_gold(
"n_cats_bad_values": 0, "n_cats_bad_values": 0,
"texts": set(), "texts": set(),
"lemmatizer_trees": set(), "lemmatizer_trees": set(),
"no_lemma_annotations": 0,
"n_low_cardinality_lemmas": 0, "n_low_cardinality_lemmas": 0,
} }
if "trainable_lemmatizer" in factory_names: if "trainable_lemmatizer" in factory_names:
@ -902,10 +914,13 @@ def _compile_gold(
data["n_cycles"] += 1 data["n_cycles"] += 1
if "trainable_lemmatizer" in factory_names: if "trainable_lemmatizer" in factory_names:
# from EditTreeLemmatizer._labels_from_data # from EditTreeLemmatizer._labels_from_data
if all(token.lemma is None for token in gold):
data["no_lemma_annotations"] += 1
else:
lemma_set = set() lemma_set = set()
for token in gold: for token in gold:
lemma_set.add(token.lemma)
if token.lemma != 0: if token.lemma != 0:
lemma_set.add(token.lemma)
tree_id = trees.add(token.text, token.lemma_) tree_id = trees.add(token.text, token.lemma_)
tree_str = trees.tree_to_str(tree_id) tree_str = trees.tree_to_str(tree_id)
data["lemmatizer_trees"].add(tree_str) data["lemmatizer_trees"].add(tree_str)