spaCy/spacy/tests/serialize/test_serialize_kb.py

from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from thinc.api import Config
from ..util import make_tempdir
from numpy import zeros


def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.to_disk(str(file_path))
        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.from_disk(str(file_path))

    # final assertions
    _check_kb(kb2)
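

# Helper: build a small dummy KB with four entities (each with a 3-dimensional
# entity vector) and three aliases with prior probabilities.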
def _get_dummy_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
    kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
    kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
    kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])
    kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
    kb.add_alias(
        alias="guy",
        entities=["Q53", "Q007", "Q17", "Q44"],
        probabilities=[0.3, 0.3, 0.2, 0.1],
    )
    kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
    return kb
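

# Helper: assert that a KB (freshly built or deserialized) contains the expected
# entities, aliases, candidates and prior probabilities.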
def _check_kb(kb):
    # check entities
    assert kb.get_size_entities() == 4
    for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
        assert entity_string in kb.get_entity_strings()
    for entity_string in ["", "Q0"]:
        assert entity_string not in kb.get_entity_strings()

    # check aliases
    assert kb.get_size_aliases() == 3
    for alias_string in ["double07", "guy", "random"]:
        assert alias_string in kb.get_alias_strings()
    for alias_string in ["nothingness", "", "randomnoise"]:
        assert alias_string not in kb.get_alias_strings()

    # check candidates & probabilities
    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
    assert len(candidates) == 2

    assert candidates[0].entity_ == "Q007"
    assert 6.999 < candidates[0].entity_freq < 7.01
    assert candidates[0].entity_vector == [0, 0, 7]
    assert candidates[0].alias_ == "double07"
    assert 0.899 < candidates[0].prior_prob < 0.901

    assert candidates[1].entity_ == "Q17"
    assert 1.99 < candidates[1].entity_freq < 2.01
    assert candidates[1].entity_vector == [7, 1, 0]
    assert candidates[1].alias_ == "double07"
    assert 0.099 < candidates[1].prior_prob < 0.101
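

# A custom KnowledgeBase subclass can be plugged into the entity_linker via a
# registered "kb_loader" callback referenced from the [initialize] block of the
# config (see "spacy.CustomKB.v1" below).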
def test_serialize_subclassed_kb():
    """Check that IO of a custom KB works fine as part of an EL pipe."""

    config_string = """
    [nlp]
    lang = "en"
    pipeline = ["entity_linker"]

    [components]

    [components.entity_linker]
    factory = "entity_linker"

    [initialize]

    [initialize.components]

    [initialize.components.entity_linker]

    [initialize.components.entity_linker.kb_loader]
    @misc = "spacy.CustomKB.v1"
    entity_vector_length = 342
    custom_field = 666
    """
    class SubKnowledgeBase(KnowledgeBase):
        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field
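
    # Register the custom KB factory so the config can refer to it by name.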
    @registry.misc("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[[Vocab], KnowledgeBase]:
        def custom_kb_factory(vocab):
            kb = SubKnowledgeBase(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,
            )
            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
            return kb

        return custom_kb_factory
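
    # Assemble the pipeline from the config; nlp.initialize() invokes the
    # registered kb_loader to create and attach the custom KB.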
    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    nlp.initialize()

    entity_linker = nlp.get_pipe("entity_linker")
    assert type(entity_linker.kb) == SubKnowledgeBase
    assert entity_linker.kb.entity_vector_length == 342
    assert entity_linker.kb.custom_field == 666

    # Make sure the custom KB is serialized correctly
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        entity_linker2 = nlp2.get_pipe("entity_linker")
        # After IO, the KB is the standard one
        assert type(entity_linker2.kb) == KnowledgeBase
        assert entity_linker2.kb.entity_vector_length == 342
        assert not hasattr(entity_linker2.kb, "custom_field")