updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
Sofie Van Landeghem 2020-09-24 16:53:59 +02:00 committed by GitHub
parent d0ef4a4cf5
commit c7eedd3534
8 changed files with 273 additions and 42 deletions


@@ -517,8 +517,8 @@ class Errors:
            "instead.")
    E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
            "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
-            "provided argument {loc} is an existing directory.")
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
+            "but the provided argument {loc} points to a file.")
    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
            "not seem to exist.")
    E930 = ("Received invalid get_examples callback in {name}.begin_training. "


@@ -10,6 +10,8 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings

+from spacy.strings import StringStore
+
from spacy import util
from .typedefs cimport hash_t
@@ -83,6 +85,9 @@ cdef class KnowledgeBase:
    DOCS: https://nightly.spacy.io/api/kb
    """

+    contents_loc = "contents"
+    strings_loc = "strings.json"
+
    def __init__(self, Vocab vocab, entity_vector_length):
        """Create a KnowledgeBase."""
        self.mem = Pool()
@@ -319,15 +324,29 @@ cdef class KnowledgeBase:
        return 0.0

    def to_disk(self, path):
        path = util.ensure_path(path)
-        if path.is_dir():
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
-        if not path.parent.exists():
-            path.parent.mkdir(parents=True)
-        cdef Writer writer = Writer(path)
+        self.write_contents(path / self.contents_loc)
+        self.vocab.strings.to_disk(path / self.strings_loc)
+
+    def from_disk(self, path):
+        path = util.ensure_path(path)
+        if not path.exists():
+            raise ValueError(Errors.E929.format(loc=path))
+        if not path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        self.read_contents(path / self.contents_loc)
+        kb_strings = StringStore()
+        kb_strings.from_disk(path / self.strings_loc)
+        for string in kb_strings:
+            self.vocab.strings.add(string)
+
+    def write_contents(self, file_path):
+        cdef Writer writer = Writer(file_path)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)

        # dumping the entity vectors in their original order
@@ -366,13 +385,7 @@ cdef class KnowledgeBase:
        writer.close()

-    def from_disk(self, path):
-        path = util.ensure_path(path)
-        if path.is_dir():
-            raise ValueError(Errors.E928.format(loc=path))
-        if not path.exists():
-            raise ValueError(Errors.E929.format(loc=path))
+    def read_contents(self, file_path):
        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index
@@ -382,7 +395,7 @@ cdef class KnowledgeBase:
        cdef AliasC alias
        cdef float vector_element
-        cdef Reader reader = Reader(path)
+        cdef Reader reader = Reader(file_path)

        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities
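
A minimal usage sketch of the reworked serialization from the caller's side (not part of the diff; the entity, alias, and "my_kb" path are illustrative): `to_disk` now expects, and if necessary creates, a directory and writes the binary contents file alongside `strings.json`, so a KB loaded into a brand-new `Vocab` can still resolve its entity and alias strings.

```python
# Hypothetical round-trip through the new directory-based KB serialization.
from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.vocab import Vocab

nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="douglas", entities=["Q42"], probabilities=[0.8])

# Writes <dir>/contents (binary entries) and <dir>/strings.json (KB strings)
kb.to_disk("my_kb")

# Loading into a KB backed by a fresh Vocab still resolves the strings,
# because strings.json is merged back into that vocab on from_disk()
kb2 = KnowledgeBase(Vocab(), entity_vector_length=3)
kb2.from_disk("my_kb")
print([c.entity_ for c in kb2.get_alias_candidates("douglas")])  # ['Q42']
```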


@@ -16,6 +16,7 @@ from ..training import Example, validate_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from .. import util
+from ..scorer import Scorer

default_model_config = """
@@ -47,6 +48,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
        "incl_context": True,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
    },
+    scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
+    default_score_weights={"nel_micro_f": 1.0},
)
def make_entity_linker(
    nlp: Language,
@@ -209,12 +212,11 @@ class EntityLinker(Pipe):
        # it does run the model twice :(
        predictions = self.model.predict(docs)
        for eg in examples:
-            sentences = [s for s in eg.predicted.sents]
+            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[
-                    ent.start
-                ]  # KB ID of the first token is the same as the whole span
+            for ent in eg.reference.ents:
+                # KB ID of the first token is the same as the whole span
+                kb_id = kb_ids[ent.start]
                if kb_id:
                    try:
                        # find the sentence in the list of sentences.
@@ -253,7 +255,7 @@ class EntityLinker(Pipe):
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
@@ -415,6 +417,18 @@ class EntityLinker(Pipe):
                for token in ent:
                    token.ent_kb_id_ = kb_id

+    def score(self, examples, **kwargs):
+        """Score a batch of examples.
+
+        examples (Iterable[Example]): The examples to score.
+        RETURNS (Dict[str, Any]): The scores.
+
+        DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
+        """
+        validate_examples(examples, "EntityLinker.score")
+        return Scorer.score_links(examples, negative_labels=[self.NIL])
+
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
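
Because the new `score` method only wraps `Scorer.score_links` with the component's `NIL` label, it can be exercised without a trained model. A sketch under that assumption (the pipeline, sentence, and KB ID below are illustrative, not taken from the diff):

```python
# Assumed usage of EntityLinker.score() on hand-built gold/predicted docs.
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker")

ref = nlp.make_doc("Russ Cochran published EC Comics.")
ref.ents = [Span(ref, 0, 2, label="PERSON", kb_id="Q7381115")]
pred = nlp.make_doc("Russ Cochran published EC Comics.")
pred.ents = [Span(pred, 0, 2, label="PERSON", kb_id="Q7381115")]

scores = entity_linker.score([Example(pred, ref)])
print(scores["nel_micro_f"])  # 1.0 for this single correct link
```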


@@ -451,6 +451,74 @@ class Scorer:
            results[f"{attr}_score_desc"] = "macro AUC"
        return results

+    @staticmethod
+    def score_links(
+        examples: Iterable[Example], *, negative_labels: Iterable[str]
+    ) -> Dict[str, Any]:
+        """Returns PRF for predicted links on the entity level.
+        To disentangle the performance of the NEL from the NER,
+        this method only evaluates NEL links for entities that overlap
+        between the gold reference and the predictions.
+
+        examples (Iterable[Example]): Examples to score
+        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
+        RETURNS (Dict[str, Any]): A dictionary containing the scores.
+
+        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        """
+        f_per_type = {}
+        for example in examples:
+            gold_ent_by_offset = {}
+            for gold_ent in example.reference.ents:
+                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
+            for pred_ent in example.predicted.ents:
+                gold_span = gold_ent_by_offset.get(
+                    (pred_ent.start_char, pred_ent.end_char), None
+                )
+                label = gold_span.label_
+                if label not in f_per_type:
+                    f_per_type[label] = PRFScore()
+                gold = gold_span.kb_id_
+                # only evaluating entities that overlap between gold and pred,
+                # to disentangle the performance of the NEL from the NER
+                if gold is not None:
+                    pred = pred_ent.kb_id_
+                    if gold in negative_labels and pred in negative_labels:
+                        # ignore true negatives
+                        pass
+                    elif gold == pred:
+                        f_per_type[label].tp += 1
+                    elif gold in negative_labels:
+                        f_per_type[label].fp += 1
+                    elif pred in negative_labels:
+                        f_per_type[label].fn += 1
+                    else:
+                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                        f_per_type[label].fp += 1
+                        f_per_type[label].fn += 1
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
+        n_labels = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
+        results = {
+            "nel_score": micro_prf.fscore,
+            "nel_score_desc": "micro F",
+            "nel_micro_p": micro_prf.precision,
+            "nel_micro_r": micro_prf.recall,
+            "nel_micro_f": micro_prf.fscore,
+            "nel_macro_p": macro_p,
+            "nel_macro_r": macro_r,
+            "nel_macro_f": macro_f,
+            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+        }
+        return results
+
    @staticmethod
    def score_deps(
        examples: Iterable[Example],
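
To make the counting rules above concrete, a small self-contained sketch (sentence, spans, and KB IDs are made up, not from the diff): a correct link is a true positive, while a wrong link counts as both a false positive and a false negative, so one of each gives micro precision and recall of 0.5.

```python
# Illustrative call to the new Scorer.score_links static method.
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
ref = nlp("Ada Lovelace was born in London.")
ref.ents = [
    Span(ref, 0, 2, label="PERSON", kb_id="Q7259"),
    Span(ref, 5, 6, label="LOC", kb_id="Q84"),
]
pred = nlp("Ada Lovelace was born in London.")
pred.ents = [
    Span(pred, 0, 2, label="PERSON", kb_id="Q7259"),  # correct link -> TP
    Span(pred, 5, 6, label="LOC", kb_id="Q60"),       # wrong link   -> FP + FN
]

scores = Scorer.score_links([Example(pred, ref)], negative_labels=["NIL"])
print(scores["nel_micro_p"], scores["nel_micro_r"])  # 0.5 0.5
```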


@@ -2,8 +2,10 @@ from typing import Callable, Iterable
import pytest

from spacy.kb import KnowledgeBase, get_candidates, Candidate
+from spacy.vocab import Vocab
from spacy import util, registry
+from spacy.scorer import Scorer
from spacy.training import Example
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
@@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb.from_disk(d / "kb")
-        mykb.to_disk(d / "kb.file")
-        mykb.from_disk(d / "kb.file")
        mykb.to_disk(d / "new" / "kb")
        mykb.from_disk(d / "new" / "kb")
        # allow overwriting an existing file
-        mykb.to_disk(d / "kb.file")
-        with pytest.raises(ValueError):
-            # can not write to a directory
-            mykb.to_disk(d)
-        with pytest.raises(ValueError):
-            # can not read from a directory
-            mykb.from_disk(d)
+        mykb.to_disk(d / "kb")
        with pytest.raises(ValueError):
            # can not read from an unknown file
            mykb.from_disk(d / "unknown" / "kb")


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
    assert doc[2].ent_kb_id_ == "Q2"


+def test_vocab_serialization(nlp):
+    """Test that string information is retained across storage"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+    # adding entities
+    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+    # adding aliases
+    douglas_hash = mykb.add_alias(
+        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
+    )
+    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    candidates = mykb.get_alias_candidates("adam")
+    assert len(candidates) == 1
+    assert candidates[0].entity == q2_hash
+    assert candidates[0].entity_ == "Q2"
+    assert candidates[0].alias == adam_hash
+    assert candidates[0].alias_ == "adam"
+
+    with make_tempdir() as d:
+        mykb.to_disk(d / "kb")
+        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+        kb_new_vocab.from_disk(d / "kb")
+
+        candidates = kb_new_vocab.get_alias_candidates("adam")
+        assert len(candidates) == 1
+        assert candidates[0].entity == q2_hash
+        assert candidates[0].entity_ == "Q2"
+        assert candidates[0].alias == adam_hash
+        assert candidates[0].alias_ == "adam"
+
+
def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
TRAIN_DATA = [
    ("Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran has been publishing comic art.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
@@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
-    nlp.add_pipe("sentencizer")
    vector_length = 3
-    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
@@ -446,6 +472,16 @@ def test_overfitting_IO():
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
@@ -465,3 +501,46 @@ def test_overfitting_IO():
        for ent in doc2.ents:
            predictions.append(ent.kb_id_)
    assert predictions == GOLD_entities
+
+
+def test_scorer_links():
+    train_examples = []
+    nlp = English()
+    ref1 = nlp("Julia lives in London happily.")
+    ref1.ents = [
+        Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    pred1 = nlp("Julia lives in London happily.")
+    pred1.ents = [
+        Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
+        Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    train_examples.append(Example(pred1, ref1))
+
+    ref2 = nlp("She loves London.")
+    ref2.ents = [
+        Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
+    ]
+    pred2 = nlp("She loves London.")
+    pred2.ents = [
+        Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
+    ]
+    train_examples.append(Example(pred2, ref2))
+
+    ref3 = nlp("London is great.")
+    ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
+    pred3 = nlp("London is great.")
+    pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
+    train_examples.append(Example(pred3, ref3))
+
+    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
+    assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
+    assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
+    assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
+    assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
+    assert scores["nel_micro_p"] == 2 / 3
+    assert scores["nel_micro_r"] == 2 / 4


@@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
+
+
+def test_Example_from_dict_sentences():
+    vocab = Vocab()
+    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
+    annots = {"sent_starts": [1, 0, 0, 1, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 2
+
+    # this currently throws an error - bug or feature?
+    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
+    # ex = Example.from_dict(predicted, annots)
+    # assert len(list(ex.reference.sents)) == 1
+
+    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    annots = {"sent_starts": [1, -1, 0, 0, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 1


@@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
| `losses`    | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                     |

+## EntityLinker.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = entity_linker.score(examples)
+> ```
+
+| Name        | Description                                                                                    |
+| ----------- | ---------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~                                                   |
+| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
+
## EntityLinker.create_optimizer {#create_optimizer tag="method"}

Create an optimizer for the pipeline component.


@@ -206,3 +206,26 @@ depends on the scorer settings:
| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                             |
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~     |
| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
+
+## Scorer.score_links {#score_links tag="staticmethod" new="3"}
+
+Returns PRF for predicted links on the entity level. To disentangle the
+performance of the NEL from the NER, this method only evaluates NEL links for
+entities that overlap between the gold reference and the predictions.
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_links(
+>     examples,
+>     negative_labels=["NIL", ""]
+> )
+> print(scores["nel_micro_f"])
+> ```
+
+| Name              | Description                                                                                                          |
+| ----------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `examples`        | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~  |
+| _keyword-only_    |                                                                                                                      |
+| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~                                        |
+| **RETURNS**       | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~                                                   |