mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
rm unclear uniqueness check
This commit is contained in:
parent
4c45dfabae
commit
6ea2b3524b
|
@ -693,18 +693,6 @@ def debug_data(
|
||||||
# and we might actually want a warning?
|
# and we might actually want a warning?
|
||||||
msg.info("All trees in dev data present in training data.")
|
msg.info("All trees in dev data present in training data.")
|
||||||
|
|
||||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
|
||||||
n = gold_train_data["n_low_cardinality_lemmas"]
|
|
||||||
msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
|
|
||||||
else:
|
|
||||||
msg.good("All training docs meet lemma uniqueness requirements.")
|
|
||||||
|
|
||||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
|
||||||
n = gold_dev_data["n_low_cardinality_lemmas"]
|
|
||||||
msg.warn(f"{n} docs with 1 or 0 unique lemmas.")
|
|
||||||
else:
|
|
||||||
msg.good("All dev docs meet lemma uniqueness requirements.")
|
|
||||||
|
|
||||||
if gold_train_data["no_lemma_annotations"] > 0:
|
if gold_train_data["no_lemma_annotations"] > 0:
|
||||||
n = gold_train_data["no_lemma_annotations"]
|
n = gold_train_data["no_lemma_annotations"]
|
||||||
msg.warn(f"{n} docs with no lemma annotations.")
|
msg.warn(f"{n} docs with no lemma annotations.")
|
||||||
|
@ -793,7 +781,6 @@ def _compile_gold(
|
||||||
"lemmatizer_trees": set(),
|
"lemmatizer_trees": set(),
|
||||||
"no_lemma_annotations": 0,
|
"no_lemma_annotations": 0,
|
||||||
"partial_lemma_annotations": 0,
|
"partial_lemma_annotations": 0,
|
||||||
"n_low_cardinality_lemmas": 0,
|
|
||||||
}
|
}
|
||||||
if "trainable_lemmatizer" in factory_names:
|
if "trainable_lemmatizer" in factory_names:
|
||||||
trees = EditTrees(nlp.vocab.strings)
|
trees = EditTrees(nlp.vocab.strings)
|
||||||
|
@ -933,18 +920,11 @@ def _compile_gold(
|
||||||
continue
|
continue
|
||||||
if any(token.lemma == 0 for token in gold):
|
if any(token.lemma == 0 for token in gold):
|
||||||
data["partial_lemma_annotations"] += 1
|
data["partial_lemma_annotations"] += 1
|
||||||
lemma_set = set()
|
|
||||||
for token in gold:
|
for token in gold:
|
||||||
if token.lemma != 0:
|
if token.lemma != 0:
|
||||||
lemma_set.add(token.lemma)
|
|
||||||
tree_id = trees.add(token.text, token.lemma_)
|
tree_id = trees.add(token.text, token.lemma_)
|
||||||
tree_str = trees.tree_to_str(tree_id)
|
tree_str = trees.tree_to_str(tree_id)
|
||||||
data["lemmatizer_trees"].add(tree_str)
|
data["lemmatizer_trees"].add(tree_str)
|
||||||
# We want to identify cases where lemmas aren't assigned
|
|
||||||
# or are all assigned the same value, as this would indicate
|
|
||||||
# an issue since we're expecting a large set of lemmas
|
|
||||||
if len(lemma_set) < 2 and len(gold) > 1:
|
|
||||||
data["n_low_cardinality_lemmas"] += 1
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1244,19 +1244,6 @@ def test_debug_data_trainable_lemmatizer_partial():
|
||||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
||||||
assert data["partial_lemma_annotations"] == 2
|
assert data["partial_lemma_annotations"] == 2
|
||||||
|
|
||||||
def test_debug_data_trainable_lemmatizer_low_cardinality():
|
|
||||||
low_cardinality_examples = [
|
|
||||||
("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
|
|
||||||
("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
|
|
||||||
]
|
|
||||||
nlp = Language()
|
|
||||||
train_examples = []
|
|
||||||
for t in low_cardinality_examples:
|
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
|
||||||
|
|
||||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
|
||||||
assert data["n_low_cardinality_lemmas"] == 2
|
|
||||||
|
|
||||||
def test_debug_data_trainable_lemmatizer_not_annotated():
|
def test_debug_data_trainable_lemmatizer_not_annotated():
|
||||||
unannotated_examples = [
|
unannotated_examples = [
|
||||||
("She likes green eggs", {}),
|
("She likes green eggs", {}),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user