mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
trainable_lemmatizer in debug data (#11419)
* WIP
* rm ipython embeds
* rm total
* WIP
* cleanup
* cleanup + reword
* rm component function
* remove migration support form
* fix reference dataset for dev data
* additional fixes
- set approach to identifying unique trees
- adjust line length on messages
- add logic for detecting docs without annotations
* use 0 instead of none for no annotation
* partial annotation support
* initial tests for _compile_gold lemma attributes
Using the example data from the edit tree lemmatizer tests for:
- lemmatizer_trees
- partial_lemma_annotations
- n_low_cardinality_lemmas
- no_lemma_annotations
* adds output test for cli app
* switch msg level
* rm unclear uniqueness check
* Revert "rm unclear uniqueness check"
This reverts commit 6ea2b3524b.
* remove good message on uniqueness
* formatting
* use en_vocab fixture
* clarify data set source in messages
* remove unnecessary import
Co-authored-by: svlandeg <svlandeg@github.com>
This commit is contained in:
parent
8d69874afb
commit
c68e6b8a96
|
@ -17,6 +17,7 @@ from ..pipeline import TrainablePipe
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..pipeline import Morphologizer, SpanCategorizer
|
from ..pipeline import Morphologizer, SpanCategorizer
|
||||||
|
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||||
from ..morphology import Morphology
|
from ..morphology import Morphology
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
|
@ -671,6 +672,59 @@ def debug_data(
|
||||||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if "trainable_lemmatizer" in factory_names:
|
||||||
|
msg.divider("Trainable Lemmatizer")
|
||||||
|
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||||
|
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||||
|
# This is necessary context when someone is attempting to interpret whether the
|
||||||
|
# number of trees exclusively in the dev set is meaningful.
|
||||||
|
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||||
|
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||||
|
dev_not_train = trees_dev - trees_train
|
||||||
|
|
||||||
|
if len(dev_not_train) != 0:
|
||||||
|
pct = len(dev_not_train) / len(trees_dev)
|
||||||
|
msg.info(
|
||||||
|
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||||
|
" were found exclusively in the dev data."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||||
|
# and we might actually want a warning?
|
||||||
|
msg.info("All trees in dev data present in training data.")
|
||||||
|
|
||||||
|
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||||
|
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||||
|
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
|
||||||
|
|
||||||
|
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||||
|
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||||
|
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
|
||||||
|
|
||||||
|
if gold_train_data["no_lemma_annotations"] > 0:
|
||||||
|
n = gold_train_data["no_lemma_annotations"]
|
||||||
|
msg.warn(f"{n} training docs with no lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All training docs have lemma annotations.")
|
||||||
|
|
||||||
|
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||||
|
n = gold_dev_data["no_lemma_annotations"]
|
||||||
|
msg.warn(f"{n} dev docs with no lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All dev docs have lemma annotations.")
|
||||||
|
|
||||||
|
if gold_train_data["partial_lemma_annotations"] > 0:
|
||||||
|
n = gold_train_data["partial_lemma_annotations"]
|
||||||
|
msg.info(f"{n} training docs with partial lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All training docs have complete lemma annotations.")
|
||||||
|
|
||||||
|
if gold_dev_data["partial_lemma_annotations"] > 0:
|
||||||
|
n = gold_dev_data["partial_lemma_annotations"]
|
||||||
|
msg.info(f"{n} dev docs with partial lemma annotations.")
|
||||||
|
else:
|
||||||
|
msg.good("All dev docs have complete lemma annotations.")
|
||||||
|
|
||||||
msg.divider("Summary")
|
msg.divider("Summary")
|
||||||
good_counts = msg.counts[MESSAGES.GOOD]
|
good_counts = msg.counts[MESSAGES.GOOD]
|
||||||
warn_counts = msg.counts[MESSAGES.WARN]
|
warn_counts = msg.counts[MESSAGES.WARN]
|
||||||
|
@ -732,7 +786,13 @@ def _compile_gold(
|
||||||
"n_cats_multilabel": 0,
|
"n_cats_multilabel": 0,
|
||||||
"n_cats_bad_values": 0,
|
"n_cats_bad_values": 0,
|
||||||
"texts": set(),
|
"texts": set(),
|
||||||
|
"lemmatizer_trees": set(),
|
||||||
|
"no_lemma_annotations": 0,
|
||||||
|
"partial_lemma_annotations": 0,
|
||||||
|
"n_low_cardinality_lemmas": 0,
|
||||||
}
|
}
|
||||||
|
if "trainable_lemmatizer" in factory_names:
|
||||||
|
trees = EditTrees(nlp.vocab.strings)
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
gold = eg.reference
|
gold = eg.reference
|
||||||
doc = eg.predicted
|
doc = eg.predicted
|
||||||
|
@ -862,6 +922,25 @@ def _compile_gold(
|
||||||
data["n_nonproj"] += 1
|
data["n_nonproj"] += 1
|
||||||
if nonproj.contains_cycle(aligned_heads):
|
if nonproj.contains_cycle(aligned_heads):
|
||||||
data["n_cycles"] += 1
|
data["n_cycles"] += 1
|
||||||
|
if "trainable_lemmatizer" in factory_names:
|
||||||
|
# from EditTreeLemmatizer._labels_from_data
|
||||||
|
if all(token.lemma == 0 for token in gold):
|
||||||
|
data["no_lemma_annotations"] += 1
|
||||||
|
continue
|
||||||
|
if any(token.lemma == 0 for token in gold):
|
||||||
|
data["partial_lemma_annotations"] += 1
|
||||||
|
lemma_set = set()
|
||||||
|
for token in gold:
|
||||||
|
if token.lemma != 0:
|
||||||
|
lemma_set.add(token.lemma)
|
||||||
|
tree_id = trees.add(token.text, token.lemma_)
|
||||||
|
tree_str = trees.tree_to_str(tree_id)
|
||||||
|
data["lemmatizer_trees"].add(tree_str)
|
||||||
|
# We want to identify cases where lemmas aren't assigned
|
||||||
|
# or are all assigned the same value, as this would indicate
|
||||||
|
# an issue since we're expecting a large set of lemmas
|
||||||
|
if len(lemma_set) < 2 and len(gold) > 1:
|
||||||
|
data["n_low_cardinality_lemmas"] += 1
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1207,3 +1207,69 @@ def test_walk_directory():
|
||||||
assert (len(walk_directory(d, suffix="iob"))) == 2
|
assert (len(walk_directory(d, suffix="iob"))) == 2
|
||||||
assert (len(walk_directory(d, suffix="conll"))) == 3
|
assert (len(walk_directory(d, suffix="conll"))) == 3
|
||||||
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_debug_data_trainable_lemmatizer_basic():
    """Check the number of edit trees `_compile_gold` collects for fully
    lemma-annotated examples."""
    annotated = [
        ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
        ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
    ]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in annotated
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    # Same data as test_edit_tree_lemmatizer::test_initialize_from_labels,
    # which yields 4 distinct trees.
    assert len(data["lemmatizer_trees"]) == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_debug_data_trainable_lemmatizer_partial():
    """Check that docs with some empty lemmas are counted as partially
    annotated by `_compile_gold`."""
    # First entry: partial annotation with default tokenization.
    aligned = ("She likes green eggs", {"lemmas": ["", "like", "green", ""]})
    # Second entry: partial annotation whose "words" differ from the
    # tokenization of the raw text (misaligned).
    misaligned = (
        "He hates green eggs",
        {
            "words": ["He", "hat", "es", "green", "eggs"],
            "lemmas": ["", "hat", "e", "green", ""],
        },
    )
    partial_examples = [aligned, misaligned]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in partial_examples
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["partial_lemma_annotations"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_debug_data_trainable_lemmatizer_low_cardinality():
    """Check that docs whose tokens all share one lemma are counted as
    low-cardinality by `_compile_gold`."""
    low_cardinality_examples = [
        ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
        ("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
    ]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in low_cardinality_examples
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["n_low_cardinality_lemmas"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_debug_data_trainable_lemmatizer_not_annotated():
    """Check that docs without any lemma annotation are counted by
    `_compile_gold`."""
    unannotated_examples = [
        ("She likes green eggs", {}),
        ("Eat blue ham", {}),
    ]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in unannotated_examples
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["no_lemma_annotations"] == 2
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typer.testing import CliRunner
|
from typer.testing import CliRunner
|
||||||
|
from spacy.tokens import DocBin, Doc
|
||||||
|
|
||||||
from spacy.cli._util import app
|
from spacy.cli._util import app
|
||||||
from .util import make_tempdir
|
from .util import make_tempdir
|
||||||
|
@ -40,3 +41,51 @@ def test_benchmark_accuracy_alias():
|
||||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
||||||
"spacy evaluate", "spacy benchmark accuracy"
|
"spacy evaluate", "spacy benchmark accuracy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """Run `debug data` through the CLI on a tiny lemma-annotated corpus and
    check that the trainable lemmatizer section shows up in the output."""
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        DocBin(docs=train_docs).to_disk(d_in / "train.spacy")
        DocBin(docs=dev_docs).to_disk(d_in / "dev.spacy")

        # `debug data` requires an input pipeline config, so generate one first.
        init_config_args = [
            "init",
            "config",
            f"{d_in}/config.cfg",
            "--lang",
            "en",
            "--pipeline",
            "trainable_lemmatizer",
        ]
        CliRunner().invoke(app, init_config_args)

        debug_data_args = [
            "debug",
            "data",
            f"{d_in}/config.cfg",
            "--paths.train",
            f"{d_in}/train.spacy",
            "--paths.dev",
            f"{d_in}/dev.spacy",
        ]
        result_debug_data = CliRunner().invoke(app, debug_data_args)

        # The exact wording of the output may change, so only check that the
        # section divider for this component is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||||
|
|
Loading…
Reference in New Issue
Block a user