mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-03 11:50:19 +03:00
fix EL scorer
This commit is contained in:
parent
76d77f0f2e
commit
7ea8c4aaa5
|
@ -235,7 +235,6 @@ class EntityLinker(TrainablePipe):
|
||||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||||
self.scorer = scorer
|
|
||||||
self.use_gold_ents = use_gold_ents
|
self.use_gold_ents = use_gold_ents
|
||||||
self.candidates_batch_size = candidates_batch_size
|
self.candidates_batch_size = candidates_batch_size
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
|
@ -243,6 +242,33 @@ class EntityLinker(TrainablePipe):
|
||||||
if candidates_batch_size < 1:
|
if candidates_batch_size < 1:
|
||||||
raise ValueError(Errors.E1044)
|
raise ValueError(Errors.E1044)
|
||||||
|
|
||||||
|
def _score_augmented(examples, **kwargs):
    """Score `examples` with the wrapped `scorer`, re-predicting first if needed.

    When `self.use_gold_ents` is false the examples are handed to `scorer`
    unchanged. Otherwise the examples are passed through
    `self._augment_examples`, the predicted docs are run through `self.pipe`
    again, and each example's `predicted` doc is replaced by the result
    before scoring.
    """
    # Because of how spaCy works, we can't just score immediately:
    # Language.evaluate calls pipe() on the predicted docs, which won't
    # have entities if there is no NER in the pipeline.
    if not self.use_gold_ents:
        return scorer(examples, **kwargs)

    augmented = self._augment_examples(examples)
    relinked_docs = self.pipe(example.predicted for example in augmented)
    for example, relinked in zip(augmented, relinked_docs):
        example.predicted = relinked
    return scorer(augmented, **kwargs)
|
||||||
|
|
||||||
|
self.scorer = _score_augmented
|
||||||
|
|
||||||
|
def _augment_examples(self, examples: Iterable[Example]) -> Iterable[Example]:
    """Collect `examples` into a list, setting entities on the predicted docs.

    If `self.use_gold_ents` is true, each example's `predicted.ents` is
    overwritten in place with the first element returned by
    `eg.get_aligned_ents_and_ner()`. If it is false, the examples are
    returned untouched.

    examples (Iterable[Example]): The examples to process.
    RETURNS (Iterable[Example]): A list holding the same example objects.
    """
    augmented = []
    for example in examples:
        if self.use_gold_ents:
            # Only the aligned entities are needed; the second element of
            # the returned pair is discarded.
            aligned_ents, _ = example.get_aligned_ents_and_ner()
            example.predicted.ents = aligned_ents
        augmented.append(example)
    return augmented
|
||||||
|
|
||||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||||
"""Define the KB of this pipe by providing a function that will
|
"""Define the KB of this pipe by providing a function that will
|
||||||
create it using this object's vocab."""
|
create it using this object's vocab."""
|
||||||
|
@ -284,13 +310,9 @@ class EntityLinker(TrainablePipe):
|
||||||
nO = self.kb.entity_vector_length
|
nO = self.kb.entity_vector_length
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
vector_sample = []
|
vector_sample = []
|
||||||
orig_ents = []
|
examples = self._augment_examples(islice(get_examples(), 10))
|
||||||
for eg in islice(get_examples(), 10):
|
for eg in examples:
|
||||||
doc = eg.x
|
doc = eg.x
|
||||||
if self.use_gold_ents:
|
|
||||||
orig_ents.append(doc.ents)
|
|
||||||
ents, _ = eg.get_aligned_ents_and_ner()
|
|
||||||
doc.ents = ents
|
|
||||||
doc_sample.append(doc)
|
doc_sample.append(doc)
|
||||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
|
@ -315,10 +337,6 @@ class EntityLinker(TrainablePipe):
|
||||||
if not has_annotations:
|
if not has_annotations:
|
||||||
# Clean up dummy annotation
|
# Clean up dummy annotation
|
||||||
doc.ents = []
|
doc.ents = []
|
||||||
if self.use_gold_ents:
|
|
||||||
assert len(doc_sample) == len(orig_ents)
|
|
||||||
for doc, orig_ent in zip(doc_sample, orig_ents):
|
|
||||||
doc.ents = orig_ent
|
|
||||||
|
|
||||||
def batch_has_learnable_example(self, examples):
|
def batch_has_learnable_example(self, examples):
|
||||||
"""Check if a batch contains a learnable example.
|
"""Check if a batch contains a learnable example.
|
||||||
|
@ -360,25 +378,15 @@ class EntityLinker(TrainablePipe):
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
if not examples:
|
if not examples:
|
||||||
return losses
|
return losses
|
||||||
|
examples = self._augment_examples(examples)
|
||||||
validate_examples(examples, "EntityLinker.update")
|
validate_examples(examples, "EntityLinker.update")
|
||||||
|
|
||||||
set_dropout_rate(self.model, drop)
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
# save to restore later
|
|
||||||
old_ents = [doc.ents for doc in docs]
|
|
||||||
|
|
||||||
for doc, ex in zip(docs, examples):
|
|
||||||
if self.use_gold_ents:
|
|
||||||
ents, _ = ex.get_aligned_ents_and_ner()
|
|
||||||
doc.ents = ents
|
|
||||||
else:
|
|
||||||
# only keep matching ents
|
|
||||||
doc.ents = ex.get_matching_ents()
|
|
||||||
|
|
||||||
# make sure we have something to learn from, if not, short-circuit
|
# make sure we have something to learn from, if not, short-circuit
|
||||||
if not self.batch_has_learnable_example(examples):
|
if not self.batch_has_learnable_example(examples):
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
|
set_dropout_rate(self.model, drop)
|
||||||
|
docs = [eg.predicted for eg in examples]
|
||||||
sentence_encodings, bp_context = self.model.begin_update(docs)
|
sentence_encodings, bp_context = self.model.begin_update(docs)
|
||||||
|
|
||||||
loss, d_scores = self.get_loss(
|
loss, d_scores = self.get_loss(
|
||||||
|
@ -389,14 +397,10 @@ class EntityLinker(TrainablePipe):
|
||||||
self.finish_update(sgd)
|
self.finish_update(sgd)
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
|
||||||
# now restore the ents
|
|
||||||
assert len(docs) == len(old_ents)
|
|
||||||
for doc, old in zip(docs, old_ents):
|
|
||||||
doc.ents = old
|
|
||||||
|
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
|
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
|
||||||
|
""" Here, we assume that get_loss is called with augmented examples if need be"""
|
||||||
validate_examples(examples, "EntityLinker.get_loss")
|
validate_examples(examples, "EntityLinker.get_loss")
|
||||||
entity_encodings = []
|
entity_encodings = []
|
||||||
eidx = 0 # indices in gold entities to keep
|
eidx = 0 # indices in gold entities to keep
|
||||||
|
|
|
@ -807,6 +807,103 @@ def test_overfitting_IO_gold_entities():
|
||||||
assert_equal(batch_deps_1, batch_deps_2)
|
assert_equal(batch_deps_1, batch_deps_2)
|
||||||
assert_equal(batch_deps_1, no_batch_deps)
|
assert_equal(batch_deps_1, no_batch_deps)
|
||||||
|
|
||||||
|
eval = nlp.evaluate(train_examples)
|
||||||
|
assert "nel_macro_p" in eval
|
||||||
|
assert "nel_macro_r" in eval
|
||||||
|
assert "nel_macro_f" in eval
|
||||||
|
assert "nel_micro_p" in eval
|
||||||
|
assert "nel_micro_r" in eval
|
||||||
|
assert "nel_micro_f" in eval
|
||||||
|
assert "nel_f_per_type" in eval
|
||||||
|
assert "PERSON" in eval["nel_f_per_type"]
|
||||||
|
|
||||||
|
assert eval["nel_macro_f"] > 0
|
||||||
|
assert eval["nel_micro_f"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_overfitting_IO_with_ner():
    """Overfit NER and NEL in one pipeline, then check predictions, IO and scores.

    The entity linker is configured with `use_gold_ents=False`. After
    training, the test checks the linked entity on a sample sentence, checks
    that the result survives a to_disk / load round trip, and checks that
    `nlp.evaluate` reports both NEL and NER scores.
    """
    # Simple test to try and overfit the NER and NEL component in combination
    # - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    def create_kb(vocab):
        # create artificial KB - assign same prior weight to the two russ cochran's
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return mykb

    # Create the NER and EL components and add them to the pipeline
    ner = nlp.add_pipe("ner", first=True)
    entity_linker = nlp.add_pipe(
        "entity_linker", last=True, config={"use_gold_ents": False}
    )
    entity_linker.set_kb(create_kb)

    # Build the training examples from unannotated docs and register every
    # entity label with the NER component.
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    # train the NER and NEL pipes
    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.001
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # test the trained model
    test_text = "Russ Cochran was a member of a golf team."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "Russ Cochran"
    assert ents[0].label_ == "PERSON"
    assert ents[0].kb_id_ == "Q2146908"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "Russ Cochran"
        assert ents2[0].label_ == "PERSON"
        assert ents2[0].kb_id_ == "Q2146908"

    # Named `scores` rather than `eval` so the builtin is not shadowed.
    scores = nlp.evaluate(train_examples)
    assert "nel_macro_f" in scores
    assert "nel_micro_f" in scores
    assert "ents_f" in scores
    assert "nel_f_per_type" in scores
    assert "ents_per_type" in scores
    assert "PERSON" in scores["nel_f_per_type"]
    assert "PERSON" in scores["ents_per_type"]

    assert scores["nel_macro_f"] > 0
    assert scores["nel_micro_f"] > 0
    assert scores["ents_f"] > 0
|
||||||
|
|
||||||
|
|
||||||
def test_kb_serialization():
|
def test_kb_serialization():
|
||||||
# Test that the KB can be used in a pipeline with a different vocab
|
# Test that the KB can be used in a pipeline with a different vocab
|
||||||
|
|
Loading…
Reference in New Issue
Block a user