mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
updates to NEL functionality (#6132)
* NEL: read sentences and ents from reference * fiddling with sent_start annotations * add KB serialization test * KB write additional file with strings.json * score_links function to calculate NEL P/R/F * formatting * documentation
This commit is contained in:
parent
d0ef4a4cf5
commit
c7eedd3534
|
@ -517,8 +517,8 @@ class Errors:
|
||||||
"instead.")
|
"instead.")
|
||||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||||
"property or default function argument?")
|
"property or default function argument?")
|
||||||
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
|
E928 = ("A 'KnowledgeBase' can only be serialized to/from from a directory, "
|
||||||
"provided argument {loc} is an existing directory.")
|
"but the provided argument {loc} points to a file.")
|
||||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
||||||
"not seem to exist.")
|
"not seem to exist.")
|
||||||
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
|
||||||
|
|
39
spacy/kb.pyx
39
spacy/kb.pyx
|
@ -10,6 +10,8 @@ from libcpp.vector cimport vector
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
@ -83,6 +85,9 @@ cdef class KnowledgeBase:
|
||||||
DOCS: https://nightly.spacy.io/api/kb
|
DOCS: https://nightly.spacy.io/api/kb
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
contents_loc = "contents"
|
||||||
|
strings_loc = "strings.json"
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, entity_vector_length):
|
def __init__(self, Vocab vocab, entity_vector_length):
|
||||||
"""Create a KnowledgeBase."""
|
"""Create a KnowledgeBase."""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -319,15 +324,29 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path):
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if path.is_dir():
|
if not path.exists():
|
||||||
|
path.mkdir(parents=True)
|
||||||
|
if not path.is_dir():
|
||||||
raise ValueError(Errors.E928.format(loc=path))
|
raise ValueError(Errors.E928.format(loc=path))
|
||||||
if not path.parent.exists():
|
self.write_contents(path / self.contents_loc)
|
||||||
path.parent.mkdir(parents=True)
|
self.vocab.strings.to_disk(path / self.strings_loc)
|
||||||
|
|
||||||
cdef Writer writer = Writer(path)
|
def from_disk(self, path):
    """Load the KnowledgeBase from a directory on disk.

    The directory is expected to hold the two entries that `to_disk`
    writes: the KB contents (`contents_loc`) and the strings file
    (`strings_loc`).

    path (Union[str, Path]): Directory to read from.
    RAISES (ValueError): E929 if the path does not exist, E928 if it exists
        but is not a directory.
    """
    kb_dir = util.ensure_path(path)
    if not kb_dir.exists():
        raise ValueError(Errors.E929.format(loc=kb_dir))
    if not kb_dir.is_dir():
        raise ValueError(Errors.E928.format(loc=kb_dir))
    self.read_contents(kb_dir / self.contents_loc)
    # The strings are loaded into a scratch store first and then merged into
    # the vocab of this KB, so existing vocab entries are kept.
    stored_strings = StringStore()
    stored_strings.from_disk(kb_dir / self.strings_loc)
    add_to_vocab = self.vocab.strings.add
    for text in stored_strings:
        add_to_vocab(text)
|
||||||
|
|
||||||
|
def write_contents(self, file_path):
|
||||||
|
cdef Writer writer = Writer(file_path)
|
||||||
writer.write_header(self.get_size_entities(), self.entity_vector_length)
|
writer.write_header(self.get_size_entities(), self.entity_vector_length)
|
||||||
|
|
||||||
# dumping the entity vectors in their original order
|
# dumping the entity vectors in their original order
|
||||||
|
@ -366,13 +385,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
writer.close()
|
writer.close()
|
||||||
|
|
||||||
def from_disk(self, path):
|
def read_contents(self, file_path):
|
||||||
path = util.ensure_path(path)
|
|
||||||
if path.is_dir():
|
|
||||||
raise ValueError(Errors.E928.format(loc=path))
|
|
||||||
if not path.exists():
|
|
||||||
raise ValueError(Errors.E929.format(loc=path))
|
|
||||||
|
|
||||||
cdef hash_t entity_hash
|
cdef hash_t entity_hash
|
||||||
cdef hash_t alias_hash
|
cdef hash_t alias_hash
|
||||||
cdef int64_t entry_index
|
cdef int64_t entry_index
|
||||||
|
@ -382,7 +395,7 @@ cdef class KnowledgeBase:
|
||||||
cdef AliasC alias
|
cdef AliasC alias
|
||||||
cdef float vector_element
|
cdef float vector_element
|
||||||
|
|
||||||
cdef Reader reader = Reader(path)
|
cdef Reader reader = Reader(file_path)
|
||||||
|
|
||||||
# STEP 0: load header and initialize KB
|
# STEP 0: load header and initialize KB
|
||||||
cdef int64_t nr_entities
|
cdef int64_t nr_entities
|
||||||
|
|
|
@ -16,6 +16,7 @@ from ..training import Example, validate_examples
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..scorer import Scorer
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
|
@ -47,6 +48,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"incl_context": True,
|
"incl_context": True,
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
},
|
},
|
||||||
|
scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
|
||||||
|
default_score_weights={"nel_micro_f": 1.0},
|
||||||
)
|
)
|
||||||
def make_entity_linker(
|
def make_entity_linker(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
|
@ -209,12 +212,11 @@ class EntityLinker(Pipe):
|
||||||
# it does run the model twice :(
|
# it does run the model twice :(
|
||||||
predictions = self.model.predict(docs)
|
predictions = self.model.predict(docs)
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
sentences = [s for s in eg.predicted.sents]
|
sentences = [s for s in eg.reference.sents]
|
||||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||||
for ent in eg.predicted.ents:
|
for ent in eg.reference.ents:
|
||||||
kb_id = kb_ids[
|
# KB ID of the first token is the same as the whole span
|
||||||
ent.start
|
kb_id = kb_ids[ent.start]
|
||||||
] # KB ID of the first token is the same as the whole span
|
|
||||||
if kb_id:
|
if kb_id:
|
||||||
try:
|
try:
|
||||||
# find the sentence in the list of sentences.
|
# find the sentence in the list of sentences.
|
||||||
|
@ -253,7 +255,7 @@ class EntityLinker(Pipe):
|
||||||
entity_encodings = []
|
entity_encodings = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||||
for ent in eg.predicted.ents:
|
for ent in eg.reference.ents:
|
||||||
kb_id = kb_ids[ent.start]
|
kb_id = kb_ids[ent.start]
|
||||||
if kb_id:
|
if kb_id:
|
||||||
entity_encoding = self.kb.get_vector(kb_id)
|
entity_encoding = self.kb.get_vector(kb_id)
|
||||||
|
@ -415,6 +417,18 @@ class EntityLinker(Pipe):
|
||||||
for token in ent:
|
for token in ent:
|
||||||
token.ent_kb_id_ = kb_id
|
token.ent_kb_id_ = kb_id
|
||||||
|
|
||||||
|
def score(self, examples, **kwargs):
    """Score a batch of examples.

    The examples are validated and then handed to `Scorer.score_links`,
    with this component's NIL value as the only negative label.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted but not used by this method.
    RETURNS (Dict[str, Any]): The scores.

    DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
    """
    validate_examples(examples, "EntityLinker.score")
    nil_labels = [self.NIL]
    link_scores = Scorer.score_links(examples, negative_labels=nil_labels)
    return link_scores
|
||||||
|
|
||||||
|
|
||||||
def to_disk(
|
def to_disk(
|
||||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
|
@ -451,6 +451,74 @@ class Scorer:
|
||||||
results[f"{attr}_score_desc"] = "macro AUC"
|
results[f"{attr}_score_desc"] = "macro AUC"
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@staticmethod
def score_links(
    examples: Iterable[Example], *, negative_labels: Iterable[str]
) -> Dict[str, Any]:
    """Returns PRF for predicted links on the entity level.
    To disentangle the performance of the NEL from the NER,
    this method only evaluates NEL links for entities that overlap
    between the gold reference and the predictions.

    A predicted entity is matched to a gold entity only when both have
    exactly the same character offsets. Predicted entities without such a
    gold counterpart are skipped.

    examples (Iterable[Example]): Examples to score
    negative_labels (Iterable[str]): The string values that refer to no
        annotation (e.g. "NIL"). Membership is tested repeatedly with `in`,
        so pass a re-iterable container such as a list or set.
    RETURNS (Dict[str, Any]): A dictionary containing the scores: micro and
        macro P/R/F over all entity labels, plus the per-label scores under
        "nel_f_per_type".

    DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
    """
    f_per_type = {}
    for example in examples:
        # Index the gold entities by their character span for exact lookup.
        gold_ent_by_offset = {
            (gold_ent.start_char, gold_ent.end_char): gold_ent
            for gold_ent in example.reference.ents
        }

        for pred_ent in example.predicted.ents:
            gold_span = gold_ent_by_offset.get(
                (pred_ent.start_char, pred_ent.end_char)
            )
            if gold_span is None:
                # only evaluating entities that overlap between gold and pred,
                # to disentangle the performance of the NEL from the NER
                continue
            label = gold_span.label_
            if label not in f_per_type:
                f_per_type[label] = PRFScore()
            gold = gold_span.kb_id_
            if gold is None:
                continue
            pred = pred_ent.kb_id_
            if gold in negative_labels and pred in negative_labels:
                # ignore true negatives
                pass
            elif gold == pred:
                f_per_type[label].tp += 1
            elif gold in negative_labels:
                f_per_type[label].fp += 1
            elif pred in negative_labels:
                f_per_type[label].fn += 1
            else:
                # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                f_per_type[label].fp += 1
                f_per_type[label].fn += 1

    # Micro scores: pool the raw counts of all labels before computing PRF.
    micro_prf = PRFScore()
    for label_prf in f_per_type.values():
        micro_prf.tp += label_prf.tp
        micro_prf.fn += label_prf.fn
        micro_prf.fp += label_prf.fp

    # Macro scores: unweighted mean of the per-label scores. The tiny epsilon
    # avoids a division by zero when no label was scored at all.
    n_labels = len(f_per_type) + 1e-100
    macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
    macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
    macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
    results = {
        "nel_score": micro_prf.fscore,
        "nel_score_desc": "micro F",
        "nel_micro_p": micro_prf.precision,
        "nel_micro_r": micro_prf.recall,
        "nel_micro_f": micro_prf.fscore,
        "nel_macro_p": macro_p,
        "nel_macro_r": macro_r,
        "nel_macro_f": macro_f,
        "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
    }
    return results
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def score_deps(
|
def score_deps(
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
|
|
|
@ -2,8 +2,10 @@ from typing import Callable, Iterable
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.kb import KnowledgeBase, get_candidates, Candidate
|
from spacy.kb import KnowledgeBase, get_candidates, Candidate
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
from spacy import util, registry
|
from spacy import util, registry
|
||||||
|
from spacy.scorer import Scorer
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tests.util import make_tempdir
|
from spacy.tests.util import make_tempdir
|
||||||
|
@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
|
||||||
# normal read-write behaviour
|
# normal read-write behaviour
|
||||||
mykb.to_disk(d / "kb")
|
mykb.to_disk(d / "kb")
|
||||||
mykb.from_disk(d / "kb")
|
mykb.from_disk(d / "kb")
|
||||||
mykb.to_disk(d / "kb.file")
|
|
||||||
mykb.from_disk(d / "kb.file")
|
|
||||||
mykb.to_disk(d / "new" / "kb")
|
mykb.to_disk(d / "new" / "kb")
|
||||||
mykb.from_disk(d / "new" / "kb")
|
mykb.from_disk(d / "new" / "kb")
|
||||||
# allow overwriting an existing file
|
# allow overwriting an existing file
|
||||||
mykb.to_disk(d / "kb.file")
|
mykb.to_disk(d / "kb")
|
||||||
with pytest.raises(ValueError):
|
|
||||||
# can not write to a directory
|
|
||||||
mykb.to_disk(d)
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
# can not read from a directory
|
|
||||||
mykb.from_disk(d)
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# can not read from an unknown file
|
# can not read from an unknown file
|
||||||
mykb.from_disk(d / "unknown" / "kb")
|
mykb.from_disk(d / "unknown" / "kb")
|
||||||
|
|
||||||
|
|
||||||
def test_candidate_generation(nlp):
|
def test_candidate_generation(nlp):
|
||||||
"""Test correct candidate generation"""
|
"""Test correct candidate generation"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||||
|
@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
|
||||||
assert doc[2].ent_kb_id_ == "Q2"
|
assert doc[2].ent_kb_id_ == "Q2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    entity_hashes = {}
    for name, freq, value in (("Q1", 27, 1), ("Q2", 12, 2), ("Q3", 5, 3)):
        entity_hashes[name] = source_kb.add_entity(
            entity=name, freq=freq, entity_vector=[value]
        )
    q2_hash = entity_hashes["Q2"]

    # adding aliases
    source_kb.add_alias(
        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
    )
    adam_hash = source_kb.add_alias(
        alias="adam", entities=["Q2"], probabilities=[0.9]
    )

    def check_adam_candidates(kb):
        # "adam" must resolve to exactly one candidate, Q2, with both the
        # hashes and the string forms intact.
        found = kb.get_alias_candidates("adam")
        assert len(found) == 1
        only = found[0]
        assert only.entity == q2_hash
        assert only.entity_ == "Q2"
        assert only.alias == adam_hash
        assert only.alias_ == "adam"

    check_adam_candidates(source_kb)

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        source_kb.to_disk(kb_path)
        # Reload into a KB with a fresh Vocab, so that the strings can only
        # come from what was stored on disk.
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)

        check_adam_candidates(restored_kb)
|
||||||
|
|
||||||
|
|
||||||
def test_append_alias(nlp):
|
def test_append_alias(nlp):
|
||||||
"""Test that we can append additional alias-entity pairs"""
|
"""Test that we can append additional alias-entity pairs"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||||
|
@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("Russ Cochran captured his first major title with his son as caddie.",
|
("Russ Cochran captured his first major title with his son as caddie.",
|
||||||
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
||||||
"entities": [(0, 12, "PERSON")]}),
|
"entities": [(0, 12, "PERSON")],
|
||||||
|
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
|
||||||
("Russ Cochran his reprints include EC Comics.",
|
("Russ Cochran his reprints include EC Comics.",
|
||||||
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
||||||
"entities": [(0, 12, "PERSON")]}),
|
"entities": [(0, 12, "PERSON")],
|
||||||
|
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
|
||||||
("Russ Cochran has been publishing comic art.",
|
("Russ Cochran has been publishing comic art.",
|
||||||
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
||||||
"entities": [(0, 12, "PERSON")]}),
|
"entities": [(0, 12, "PERSON")],
|
||||||
|
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
|
||||||
("Russ Cochran was a member of University of Kentucky's golf team.",
|
("Russ Cochran was a member of University of Kentucky's golf team.",
|
||||||
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
||||||
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
|
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
|
||||||
|
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
|
||||||
]
|
]
|
||||||
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("sentencizer")
|
|
||||||
vector_length = 3
|
vector_length = 3
|
||||||
|
|
||||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
|
||||||
patterns = [
|
|
||||||
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
|
||||||
]
|
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
|
|
||||||
# Convert the texts to docs to make sure we have doc.ents set for the training examples
|
# Convert the texts to docs to make sure we have doc.ents set for the training examples
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotation in TRAIN_DATA:
|
for text, annotation in TRAIN_DATA:
|
||||||
|
@ -446,6 +472,16 @@ def test_overfitting_IO():
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
assert losses["entity_linker"] < 0.001
|
assert losses["entity_linker"] < 0.001
|
||||||
|
|
||||||
|
# adding additional components that are required for the entity_linker
|
||||||
|
nlp.add_pipe("sentencizer", first=True)
|
||||||
|
|
||||||
|
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
||||||
|
patterns = [
|
||||||
|
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
||||||
|
]
|
||||||
|
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
predictions = []
|
predictions = []
|
||||||
for text, annotation in TRAIN_DATA:
|
for text, annotation in TRAIN_DATA:
|
||||||
|
@ -465,3 +501,46 @@ def test_overfitting_IO():
|
||||||
for ent in doc2.ents:
|
for ent in doc2.ents:
|
||||||
predictions.append(ent.kb_id_)
|
predictions.append(ent.kb_id_)
|
||||||
assert predictions == GOLD_entities
|
assert predictions == GOLD_entities
|
||||||
|
|
||||||
|
|
||||||
|
def test_scorer_links():
    """Check the per-type and micro P/R of Scorer.score_links on three docs."""
    nlp = English()

    def annotated(text, ents):
        # Build a doc whose entities carry the given (start, end, label, kb_id).
        doc = nlp(text)
        doc.ents = [
            Span(doc, start, end, label=label, kb_id=kb_id)
            for start, end, label, kb_id in ents
        ]
        return doc

    # (text, gold entities, predicted entities)
    cases = [
        (
            "Julia lives in London happily.",
            [(0, 1, "PERSON", "Q2"), (3, 4, "LOC", "Q3")],
            [(0, 1, "PERSON", "Q70"), (3, 4, "LOC", "Q3")],
        ),
        (
            "She loves London.",
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "Q13")],
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "NIL")],
        ),
        (
            "London is great.",
            [(0, 1, "LOC", "NIL")],
            [(0, 1, "LOC", "NIL")],
        ),
    ]

    train_examples = []
    for text, gold_ents, pred_ents in cases:
        reference = annotated(text, gold_ents)
        predicted = annotated(text, pred_ents)
        train_examples.append(Example(predicted, reference))

    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
    per_type = scores["nel_f_per_type"]
    assert per_type["PERSON"]["p"] == 1 / 2
    assert per_type["PERSON"]["r"] == 1 / 2
    assert per_type["LOC"]["p"] == 1 / 1
    assert per_type["LOC"]["r"] == 1 / 2

    assert scores["nel_micro_p"] == 2 / 3
    assert scores["nel_micro_r"] == 2 / 4
|
||||||
|
|
|
@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
|
||||||
predicted = Doc(vocab, words=annots["words"])
|
predicted = Doc(vocab, words=annots["words"])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
Example.from_dict(predicted, annots)
|
Example.from_dict(predicted, annots)
|
||||||
|
|
||||||
|
|
||||||
|
def test_Example_from_dict_sentences():
    """Check that "sent_starts" annotations set the reference sentences."""
    vocab = Vocab()

    def count_reference_sents(words, sent_starts):
        # Build an Example from the annotation and count its gold sentences.
        predicted = Doc(vocab, words=words)
        example = Example.from_dict(predicted, {"sent_starts": sent_starts})
        return len(list(example.reference.sents))

    two_sentence_words = ["One", "sentence", ".", "one", "more"]
    assert count_reference_sents(two_sentence_words, [1, 0, 0, 1, 0]) == 2

    # this currently throws an error - bug or feature?
    # count_reference_sents(
    #     ["One", "sentence", "not", "one", "more"], [1, 0, 0, 0, 0]
    # ) == 1

    one_sentence_words = ["One", "sentence", "not", "one", "more"]
    assert count_reference_sents(one_sentence_words, [1, -1, 0, 0, 0]) == 1
|
|
@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
|
||||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||||
|
|
||||||
|
## EntityLinker.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
|
Score a batch of examples.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> scores = entity_linker.score(examples)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ---------------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||||
|
| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
|
||||||
|
|
||||||
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
Create an optimizer for the pipeline component.
|
Create an optimizer for the pipeline component.
|
||||||
|
|
|
@ -206,3 +206,26 @@ depends on the scorer settings:
|
||||||
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
|
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
|
||||||
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
|
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
|
||||||
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
|
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
|
||||||
|
|
||||||
|
## Scorer.score_links {#score_links tag="staticmethod" new="3"}
|
||||||
|
|
||||||
|
Returns PRF for predicted links on the entity level. To disentangle the
|
||||||
|
performance of the NEL from the NER, this method only evaluates NEL links for
|
||||||
|
entities that overlap between the gold reference and the predictions.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> scores = Scorer.score_links(
|
||||||
|
> examples,
|
||||||
|
> negative_labels=["NIL", ""]
|
||||||
|
> )
|
||||||
|
> print(scores["nel_micro_f"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||||
|
| _keyword-only_ | |
|
||||||
|
| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user