Fix alignment and vector checks in debug data

* Update token alignment check to use Example alignment
* Further update the missing vector check to account for changes in v3
This commit is contained in:
Adriane Boyd 2020-12-15 09:43:14 +01:00
parent 8656a08777
commit 20e18cc246

View File

@@ -504,13 +504,18 @@ def _compile_gold(
for eg in examples: for eg in examples:
gold = eg.reference gold = eg.reference
doc = eg.predicted doc = eg.predicted
valid_words = [x for x in gold if x is not None] valid_words = [x.text for x in gold]
data["words"].update(valid_words) data["words"].update(valid_words)
data["n_words"] += len(valid_words) data["n_words"] += len(valid_words)
data["n_misaligned_words"] += len(gold) - len(valid_words) align = eg.alignment
for token in doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] != 1:
data["n_misaligned_words"] += 1
data["texts"].add(doc.text) data["texts"].add(doc.text)
if len(nlp.vocab.vectors): if len(nlp.vocab.vectors):
for word in valid_words: for word in [t.text for t in doc]:
if nlp.vocab.strings[word] not in nlp.vocab.vectors: if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word]) data["words_missing_vectors"].update([word])
if "ner" in factory_names: if "ner" in factory_names: