mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-13 07:55:49 +03:00
additional fixes
- set approach to identifying unique trees
- adjust line length on messages
- add logic for detecting docs without annotations
This commit is contained in:
parent
c53d57a54e
commit
bfb9fa44a0
|
@ -675,16 +675,17 @@ def debug_data(
|
||||||
msg.divider("Trainable Lemmatizer")
|
msg.divider("Trainable Lemmatizer")
|
||||||
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||||
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||||
|
|
||||||
# This is necessary context when someone is attempting to interpret whether the
|
# This is necessary context when someone is attempting to interpret whether the
|
||||||
# number of trees exclusively in the dev set is meaningful.
|
# number of trees exclusively in the dev set is meaningful.
|
||||||
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data.")
|
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||||
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data.")
|
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||||
|
|
||||||
dev_not_train = trees_dev - trees_train
|
dev_not_train = trees_dev - trees_train
|
||||||
|
|
||||||
if len(dev_not_train) != 0:
|
if len(dev_not_train) != 0:
|
||||||
msg.warn(
|
pct = len(dev_not_train) / len(trees_dev)
|
||||||
f"{len(dev_not_train)} lemmatizer trees were found exclusively in the dev data."
|
msg.info(
|
||||||
|
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||||
|
" were found exclusively in the dev data."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Would we ever expect this case? It seems like it would be pretty rare,
|
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||||
|
@ -692,19 +693,29 @@ def debug_data(
|
||||||
msg.good("All trees in dev data present in training data.")
|
msg.good("All trees in dev data present in training data.")
|
||||||
|
|
||||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||||
msg.warn(
|
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||||
f"{gold_train_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas."
|
msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
msg.good("All training docs meet lemma uniqueness requirements.")
|
msg.good("All training docs meet lemma uniqueness requirements.")
|
||||||
|
|
||||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||||
msg.warn(
|
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||||
f"{gold_dev_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas."
|
msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
msg.good("All dev docs meet lemma uniqueness requirements.")
|
msg.good("All dev docs meet lemma uniqueness requirements.")
|
||||||
|
|
||||||
|
if gold_train_data["no_lemma_annotations"] > 0:
|
||||||
|
n = gold_train_data["no_lemma_annotations"]
|
||||||
|
msg.warn(f"{n} docs with no lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All training docs have complete lemma annotations.")
|
||||||
|
|
||||||
|
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||||
|
n = gold_dev_data["no_lemma_annotations"]
|
||||||
|
msg.warn(f"{n} docs with no lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All dev docs have complete lemma annotations.")
|
||||||
|
|
||||||
msg.divider("Summary")
|
msg.divider("Summary")
|
||||||
good_counts = msg.counts[MESSAGES.GOOD]
|
good_counts = msg.counts[MESSAGES.GOOD]
|
||||||
warn_counts = msg.counts[MESSAGES.WARN]
|
warn_counts = msg.counts[MESSAGES.WARN]
|
||||||
|
@ -767,6 +778,7 @@ def _compile_gold(
|
||||||
"n_cats_bad_values": 0,
|
"n_cats_bad_values": 0,
|
||||||
"texts": set(),
|
"texts": set(),
|
||||||
"lemmatizer_trees": set(),
|
"lemmatizer_trees": set(),
|
||||||
|
"no_lemma_annotations": 0,
|
||||||
"n_low_cardinality_lemmas": 0,
|
"n_low_cardinality_lemmas": 0,
|
||||||
}
|
}
|
||||||
if "trainable_lemmatizer" in factory_names:
|
if "trainable_lemmatizer" in factory_names:
|
||||||
|
@ -902,10 +914,13 @@ def _compile_gold(
|
||||||
data["n_cycles"] += 1
|
data["n_cycles"] += 1
|
||||||
if "trainable_lemmatizer" in factory_names:
|
if "trainable_lemmatizer" in factory_names:
|
||||||
# from EditTreeLemmatizer._labels_from_data
|
# from EditTreeLemmatizer._labels_from_data
|
||||||
|
if all(token.lemma is None for token in gold):
|
||||||
|
data["no_lemma_annotations"] += 1
|
||||||
|
else:
|
||||||
lemma_set = set()
|
lemma_set = set()
|
||||||
for token in gold:
|
for token in gold:
|
||||||
lemma_set.add(token.lemma)
|
|
||||||
if token.lemma != 0:
|
if token.lemma != 0:
|
||||||
|
lemma_set.add(token.lemma)
|
||||||
tree_id = trees.add(token.text, token.lemma_)
|
tree_id = trees.add(token.text, token.lemma_)
|
||||||
tree_str = trees.tree_to_str(tree_id)
|
tree_str = trees.tree_to_str(tree_id)
|
||||||
data["lemmatizer_trees"].add(tree_str)
|
data["lemmatizer_trees"].add(tree_str)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user