Merge pull request #6571 from adrianeboyd/bugfix/debug-data-missing-vectors

Fix alignment and vector checks in debug data
This commit is contained in:
Ines Montani 2020-12-17 10:10:47 +11:00 committed by GitHub
commit 3f90bffa27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -504,13 +504,18 @@ def _compile_gold(
     for eg in examples:
         gold = eg.reference
         doc = eg.predicted
-        valid_words = [x for x in gold if x is not None]
+        valid_words = [x.text for x in gold]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
-        data["n_misaligned_words"] += len(gold) - len(valid_words)
+        align = eg.alignment
+        for token in doc:
+            if token.orth_.isspace():
+                continue
+            if align.x2y.lengths[token.i] != 1:
+                data["n_misaligned_words"] += 1
         data["texts"].add(doc.text)
         if len(nlp.vocab.vectors):
-            for word in valid_words:
+            for word in [t.text for t in doc]:
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in factory_names: