2023-03-01 18:02:55 +03:00
|
|
|
from pathlib import Path
|
2023-06-26 12:41:03 +03:00
|
|
|
from typing import Any, Callable, Dict, Iterable
|
2020-08-18 17:10:36 +03:00
|
|
|
|
2023-03-01 18:02:55 +03:00
|
|
|
import srsly
|
2023-06-26 12:41:03 +03:00
|
|
|
from numpy import zeros
|
|
|
|
from thinc.api import Config
|
2023-03-01 18:02:55 +03:00
|
|
|
|
2023-06-26 12:41:03 +03:00
|
|
|
from spacy import Errors, util
|
2022-09-08 11:38:07 +03:00
|
|
|
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
2023-06-26 12:41:03 +03:00
|
|
|
from spacy.util import SimpleFrozenList, ensure_path, load_model_from_config, registry
|
2021-06-28 12:29:29 +03:00
|
|
|
from spacy.vocab import Vocab
|
2019-09-28 19:05:00 +03:00
|
|
|
|
2019-09-29 18:34:56 +03:00
|
|
|
from ..util import make_tempdir
|
|
|
|
|
2019-04-25 00:52:34 +03:00
|
|
|
|
|
|
|
def test_serialize_kb_disk(en_vocab):
    """Round-trip the dummy KB through to_disk()/from_disk() and re-check it."""
    # sanity-check the freshly built KB before any IO happens
    original_kb = _get_dummy_kb(en_vocab)
    _check_kb(original_kb)

    # write the KB below a temporary directory and read it into a new instance
    with make_tempdir() as tmp_dir:
        target_dir = ensure_path(tmp_dir)
        if not target_dir.exists():
            target_dir.mkdir()
        kb_location = str(target_dir / "kb")
        original_kb.to_disk(kb_location)
        restored_kb = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
        restored_kb.from_disk(kb_location)

    # the reloaded KB has to pass the very same checks as the original
    _check_kb(restored_kb)
|
|
|
|
|
|
|
|
|
2019-04-29 18:37:29 +03:00
|
|
|
def _get_dummy_kb(vocab):
    """Build the small InMemoryLookupKB fixture: four entities, three aliases.

    vocab: the vocabulary handed to the InMemoryLookupKB constructor.
    RETURNS: the populated KB, created with entity_vector_length=3.
    """
    dummy_kb = InMemoryLookupKB(vocab, entity_vector_length=3)

    # (entity ID, frequency, entity vector) - added in this order
    entity_rows = (
        ("Q53", 33, [0, 5, 3]),
        ("Q17", 2, [7, 1, 0]),
        ("Q007", 7, [0, 0, 7]),
        ("Q44", 342, [4, 4, 4]),
    )
    for entity_id, frequency, vector in entity_rows:
        dummy_kb.add_entity(entity=entity_id, freq=frequency, entity_vector=vector)

    # (alias, candidate entity IDs, prior probability per entity)
    alias_rows = (
        ("double07", ["Q17", "Q007"], [0.1, 0.9]),
        ("guy", ["Q53", "Q007", "Q17", "Q44"], [0.3, 0.3, 0.2, 0.1]),
        ("random", ["Q007"], [1.0]),
    )
    for alias_text, entity_ids, priors in alias_rows:
        dummy_kb.add_alias(alias=alias_text, entities=entity_ids, probabilities=priors)

    return dummy_kb
|
|
|
|
|
|
|
|
|
2019-04-25 00:52:34 +03:00
|
|
|
def _check_kb(kb):
|
|
|
|
# check entities
|
|
|
|
assert kb.get_size_entities() == 4
|
2019-08-20 18:36:34 +03:00
|
|
|
for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
|
2019-04-25 00:52:34 +03:00
|
|
|
assert entity_string in kb.get_entity_strings()
|
2019-08-20 18:36:34 +03:00
|
|
|
for entity_string in ["", "Q0"]:
|
2019-04-25 00:52:34 +03:00
|
|
|
assert entity_string not in kb.get_entity_strings()
|
|
|
|
|
|
|
|
# check aliases
|
|
|
|
assert kb.get_size_aliases() == 3
|
2019-08-20 18:36:34 +03:00
|
|
|
for alias_string in ["double07", "guy", "random"]:
|
2019-04-25 00:52:34 +03:00
|
|
|
assert alias_string in kb.get_alias_strings()
|
2019-08-20 18:36:34 +03:00
|
|
|
for alias_string in ["nothingness", "", "randomnoise"]:
|
2019-04-25 00:52:34 +03:00
|
|
|
assert alias_string not in kb.get_alias_strings()
|
|
|
|
|
|
|
|
# check candidates & probabilities
|
2023-03-20 02:34:35 +03:00
|
|
|
candidates = sorted(
|
|
|
|
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
|
|
|
|
)
|
2019-04-25 00:52:34 +03:00
|
|
|
assert len(candidates) == 2
|
|
|
|
|
2023-03-20 02:34:35 +03:00
|
|
|
assert candidates[0].entity_id_ == "Q007"
|
2019-08-13 16:38:59 +03:00
|
|
|
assert 6.999 < candidates[0].entity_freq < 7.01
|
2019-06-05 19:29:18 +03:00
|
|
|
assert candidates[0].entity_vector == [0, 0, 7]
|
2023-03-20 02:34:35 +03:00
|
|
|
assert candidates[0].alias == "double07"
|
2019-04-29 14:58:07 +03:00
|
|
|
assert 0.899 < candidates[0].prior_prob < 0.901
|
2019-04-25 00:52:34 +03:00
|
|
|
|
2023-03-20 02:34:35 +03:00
|
|
|
assert candidates[1].entity_id_ == "Q17"
|
2019-08-13 16:38:59 +03:00
|
|
|
assert 1.99 < candidates[1].entity_freq < 2.01
|
2019-06-05 19:29:18 +03:00
|
|
|
assert candidates[1].entity_vector == [7, 1, 0]
|
2023-03-20 02:34:35 +03:00
|
|
|
assert candidates[1].alias == "double07"
|
2019-04-29 14:58:07 +03:00
|
|
|
assert 0.099 < candidates[1].prior_prob < 0.101
|
2020-08-18 17:10:36 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_serialize_subclassed_kb():
    """Check that IO of a custom KB works fine as part of an EL pipe.

    A subclass of InMemoryLookupKB carrying one extra attribute
    (`custom_field`) is registered under two `misc` registry names and wired
    into an `entity_linker` pipeline through the config below. The pipeline is
    initialized, written to a temporary directory and loaded back; the KB
    type, its `entity_vector_length` and its `custom_field` are asserted both
    before and after the round trip.
    """
    # `generate_empty_kb` and `kb_loader` refer to the two factories that are
    # registered further down in this test.
    config_string = """
    [nlp]
    lang = "en"
    pipeline = ["entity_linker"]

    [components]

    [components.entity_linker]
    factory = "entity_linker"

    [components.entity_linker.generate_empty_kb]
    @misc = "kb_test.CustomEmptyKB.v1"

    [initialize]

    [initialize.components]

    [initialize.components.entity_linker]

    [initialize.components.entity_linker.kb_loader]
    @misc = "kb_test.CustomKB.v1"
    entity_vector_length = 342
    custom_field = 666
    """

    class SubInMemoryLookupKB(InMemoryLookupKB):
        """InMemoryLookupKB with an additional `custom_field` attribute that is
        written to and read from disk alongside the regular KB data."""

        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
            """We override InMemoryLookupKB.to_disk() to ensure that
            self.custom_field is stored as well.

            path: target directory; created (including parents) if missing.
            exclude: names of serialization entries to skip.
            RAISES: ValueError (Errors.E928) if `path` exists but is not a
                directory.
            """
            path = ensure_path(path)
            if not path.exists():
                path.mkdir(parents=True)
            if not path.is_dir():
                raise ValueError(Errors.E928.format(loc=path))

            def serialize_custom_fields(file_path: Path) -> None:
                # the extra attribute is stored as a small JSON document
                srsly.write_json(file_path, {"custom_field": self.custom_field})

            serialize = {
                "contents": lambda p: self.write_contents(p),
                "strings.json": lambda p: self.vocab.strings.to_disk(p),
                "custom_fields": serialize_custom_fields,
            }
            util.to_disk(path, serialize, exclude)

        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
            """We override InMemoryLookupKB.from_disk() to ensure that
            self.custom_field is loaded as well.

            path: directory previously written by to_disk().
            exclude: names of serialization entries to skip.
            RAISES: ValueError (Errors.E929) if `path` does not exist, or
                ValueError (Errors.E928) if it is not a directory.
            """
            path = ensure_path(path)
            if not path.exists():
                raise ValueError(Errors.E929.format(loc=path))
            if not path.is_dir():
                raise ValueError(Errors.E928.format(loc=path))

            def deserialize_custom_fields(file_path: Path) -> None:
                # mirror of serialize_custom_fields() in to_disk()
                self.custom_field = srsly.read_json(file_path)["custom_field"]

            deserialize: Dict[str, Callable[[Any], Any]] = {
                "contents": lambda p: self.read_contents(p),
                "strings.json": lambda p: self.vocab.strings.from_disk(p),
                "custom_fields": deserialize_custom_fields,
            }
            util.from_disk(path, deserialize, exclude)

    @registry.misc("kb_test.CustomEmptyKB.v1")
    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
        """Return a factory that builds an empty custom KB (custom_field=0)."""

        def empty_kb_factory(
            vocab: Vocab, entity_vector_length: int
        ) -> SubInMemoryLookupKB:
            return SubInMemoryLookupKB(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=0,
            )

        return empty_kb_factory

    @registry.misc("kb_test.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
        """Return a factory that builds a custom KB holding one dummy entity."""

        def custom_kb_factory(vocab: Vocab) -> SubInMemoryLookupKB:
            kb = SubInMemoryLookupKB(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,
            )
            # a single all-zero entity so that the KB is not empty
            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
            return kb

        return custom_kb_factory

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    nlp.initialize()

    # After initialization, the linker holds the KB configured via `kb_loader`
    entity_linker = nlp.get_pipe("entity_linker")
    assert type(entity_linker.kb) is SubInMemoryLookupKB
    assert entity_linker.kb.entity_vector_length == 342
    assert entity_linker.kb.custom_field == 666

    # Make sure the custom KB is serialized correctly
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        entity_linker2 = nlp2.get_pipe("entity_linker")
        # After IO, the KB is still the custom subclass (not the standard
        # InMemoryLookupKB) and its custom field has been restored
        assert type(entity_linker2.kb) is SubInMemoryLookupKB
        assert entity_linker2.kb.entity_vector_length == 342
        assert entity_linker2.kb.custom_field == 666
|