spaCy/spacy/tests/serialize/test_serialize_kb.py

from typing import Callable

from spacy import util
from spacy.lang.en import English
from spacy.util import ensure_path, registry
from spacy.kb import KnowledgeBase

from ..util import make_tempdir


def test_serialize_kb_disk(en_vocab):
    """Write the dummy KB to disk, read it into a new KB, and re-run the checks."""
    # the freshly built KB has to pass the checks before any IO happens
    original_kb = _get_dummy_kb(en_vocab)
    _check_kb(original_kb)

    # round trip through a temporary directory
    with make_tempdir() as tmp:
        tmp_path = ensure_path(tmp)
        if not tmp_path.exists():
            tmp_path.mkdir()
        kb_location = str(tmp_path / "kb")
        original_kb.to_disk(kb_location)
        restored_kb = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        restored_kb.from_disk(kb_location)

    # the reloaded KB is checked after the temp dir context has exited
    _check_kb(restored_kb)


def _get_dummy_kb(vocab):
    """Build a KB with four entities and three aliases on top of `vocab`.

    Entity vectors have length 3, matching `entity_vector_length`.
    """
    kb = KnowledgeBase(vocab, entity_vector_length=3)

    # (entity ID, frequency, entity vector)
    entity_rows = (
        ("Q53", 33, [0, 5, 3]),
        ("Q17", 2, [7, 1, 0]),
        ("Q007", 7, [0, 0, 7]),
        ("Q44", 342, [4, 4, 4]),
    )
    for entity_id, frequency, vector in entity_rows:
        kb.add_entity(entity=entity_id, freq=frequency, entity_vector=vector)

    # (alias, candidate entity IDs, prior probability per candidate)
    alias_rows = (
        ("double07", ["Q17", "Q007"], [0.1, 0.9]),
        ("guy", ["Q53", "Q007", "Q17", "Q44"], [0.3, 0.3, 0.2, 0.1]),
        ("random", ["Q007"], [1.0]),
    )
    for alias, entity_ids, priors in alias_rows:
        kb.add_alias(alias=alias, entities=entity_ids, probabilities=priors)

    return kb


def _check_kb(kb):
    """Assert that `kb` contains the expected entities, aliases and candidates.

    Checks the entity and alias counts, the presence/absence of specific
    strings, and the two candidates registered for the alias "double07"
    (entity ID, frequency, vector, alias and prior probability).
    """
    # entities: count, expected members, strings that must be absent
    assert kb.get_size_entities() == 4
    for expected_entity in ("Q53", "Q17", "Q007", "Q44"):
        assert expected_entity in kb.get_entity_strings()
    for absent_entity in ("", "Q0"):
        assert absent_entity not in kb.get_entity_strings()

    # aliases: count, expected members, strings that must be absent
    assert kb.get_size_aliases() == 3
    for expected_alias in ("double07", "guy", "random"):
        assert expected_alias in kb.get_alias_strings()
    for absent_alias in ("nothingness", "", "randomnoise"):
        assert absent_alias not in kb.get_alias_strings()

    # candidates for "double07", ordered by entity ID so indexing is stable
    ordered = list(kb.get_alias_candidates("double07"))
    ordered.sort(key=lambda cand: cand.entity_)
    assert len(ordered) == 2

    # (entity ID, open frequency bounds, entity vector, open prior-prob bounds)
    expectations = (
        ("Q007", (6.999, 7.01), [0, 0, 7], (0.899, 0.901)),
        ("Q17", (1.99, 2.01), [7, 1, 0], (0.099, 0.101)),
    )
    for cand, (entity_id, freq_bounds, vector, prob_bounds) in zip(
        ordered, expectations
    ):
        freq_low, freq_high = freq_bounds
        prob_low, prob_high = prob_bounds
        assert cand.entity_ == entity_id
        assert freq_low < cand.entity_freq < freq_high
        assert cand.entity_vector == vector
        assert cand.alias_ == "double07"
        assert prob_low < cand.prior_prob < prob_high


def test_serialize_subclassed_kb():
    """A KB subclass created through the registry must survive pipeline IO.

    The entity linker is configured with a registered `kb_loader`; the KB's
    type and fields are checked both right after `add_pipe` and again after
    the pipeline has been written to disk and loaded back.
    """

    class SubKnowledgeBase(KnowledgeBase):
        # KB variant that stores one extra value passed at construction time
        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

    @registry.misc.register("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[["Vocab"], KnowledgeBase]:
        # the registered function returns a loader that takes the vocab
        def build_from_vocab(vocab):
            return SubKnowledgeBase(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,
            )

        return build_from_vocab

    def assert_custom_kb(linker):
        # shared expectations for the original and the reloaded component
        assert type(linker.kb) is SubKnowledgeBase
        assert linker.kb.entity_vector_length == 342
        assert linker.kb.custom_field == 666

    kb_loader_cfg = {
        "@misc": "spacy.CustomKB.v1",
        "entity_vector_length": 342,
        "custom_field": 666,
    }
    nlp = English()
    entity_linker = nlp.add_pipe("entity_linker", config={"kb_loader": kb_loader_cfg})
    assert_custom_kb(entity_linker)

    # round trip the whole pipeline and inspect the reloaded entity linker
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded_nlp = util.load_model_from_path(tmp_dir)
        assert_custom_kb(reloaded_nlp.get_pipe("entity_linker"))
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`from typing import Callable`

			`from spacy import util`
			`from spacy.lang.en import English`
			`from spacy.util import ensure_path, registry`
Revert #4334 2019-09-29 18:32:12 +03:00			`from spacy.kb import KnowledgeBase`
Move tests out of package (#4334) * Move tests out of package * Fix typo 2019-09-28 19:05:00 +03:00
Fix test imports 2019-09-29 18:34:56 +03:00			`from ..util import make_tempdir`

unit test for KB serialization 2019-04-25 00:52:34 +03:00
			`def test_serialize_kb_disk(en_vocab):`
			`# baseline assertions`
deduce entity freq from WP corpus and serialize vocab in WP test 2019-04-29 18:37:29 +03:00			`kb1 = _get_dummy_kb(en_vocab)`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`_check_kb(kb1)`

			`# dumping to file & loading back in`
			`with make_tempdir() as d:`
			`dir_path = ensure_path(d)`
			`if not dir_path.exists():`
			`dir_path.mkdir()`
			`file_path = dir_path / "kb"`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`kb1.to_disk(str(file_path))`
			`kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)`
			`kb2.from_disk(str(file_path))`
unit test for KB serialization 2019-04-25 00:52:34 +03:00
			`# final assertions`
			`_check_kb(kb2)`


deduce entity freq from WP corpus and serialize vocab in WP test 2019-04-29 18:37:29 +03:00			`def _get_dummy_kb(vocab):`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`kb = KnowledgeBase(vocab, entity_vector_length=3)`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])`
			`kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])`
			`kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])`
			`kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])`

			`kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])`
			`kb.add_alias(`
			`alias="guy",`
			`entities=["Q53", "Q007", "Q17", "Q44"],`
			`probabilities=[0.3, 0.3, 0.2, 0.1],`
			`)`
			`kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])`
deduce entity freq from WP corpus and serialize vocab in WP test 2019-04-29 18:37:29 +03:00
			`return kb`


unit test for KB serialization 2019-04-25 00:52:34 +03:00			`def _check_kb(kb):`
			`# check entities`
			`assert kb.get_size_entities() == 4`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`for entity_string in ["Q53", "Q17", "Q007", "Q44"]:`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`assert entity_string in kb.get_entity_strings()`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`for entity_string in ["", "Q0"]:`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`assert entity_string not in kb.get_entity_strings()`

			`# check aliases`
			`assert kb.get_size_aliases() == 3`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`for alias_string in ["double07", "guy", "random"]:`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`assert alias_string in kb.get_alias_strings()`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`for alias_string in ["nothingness", "", "randomnoise"]:`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`assert alias_string not in kb.get_alias_strings()`

			`# check candidates & probabilities`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)`
unit test for KB serialization 2019-04-25 00:52:34 +03:00			`assert len(candidates) == 2`

Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert candidates[0].entity_ == "Q007"`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`assert 6.999 < candidates[0].entity_freq < 7.01`
entity vectors in the KB + serialization of them 2019-06-05 19:29:18 +03:00			`assert candidates[0].entity_vector == [0, 0, 7]`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert candidates[0].alias_ == "double07"`
simplify chains 2019-04-29 14:58:07 +03:00			`assert 0.899 < candidates[0].prior_prob < 0.901`
unit test for KB serialization 2019-04-25 00:52:34 +03:00
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert candidates[1].entity_ == "Q17"`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`assert 1.99 < candidates[1].entity_freq < 2.01`
entity vectors in the KB + serialization of them 2019-06-05 19:29:18 +03:00			`assert candidates[1].entity_vector == [7, 1, 0]`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert candidates[1].alias_ == "double07"`
simplify chains 2019-04-29 14:58:07 +03:00			`assert 0.099 < candidates[1].prior_prob < 0.101`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00

			`def test_serialize_subclassed_kb():`
			`"""Check that IO of a custom KB works fine as part of an EL pipe."""`

			`class SubKnowledgeBase(KnowledgeBase):`
			`def __init__(self, vocab, entity_vector_length, custom_field):`
			`super().__init__(vocab, entity_vector_length)`
			`self.custom_field = custom_field`

registry.assets -> registry.misc 2020-09-03 18:31:14 +03:00			`@registry.misc.register("spacy.CustomKB.v1")`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`def custom_kb(`
			`entity_vector_length: int, custom_field: int`
			`) -> Callable[["Vocab"], KnowledgeBase]:`
			`def custom_kb_factory(vocab):`
			`return SubKnowledgeBase(`
			`vocab=vocab,`
			`entity_vector_length=entity_vector_length,`
			`custom_field=custom_field,`
			`)`

			`return custom_kb_factory`

			`nlp = English()`
			`config = {`
			`"kb_loader": {`
registry.assets -> registry.misc 2020-09-03 18:31:14 +03:00			`"@misc": "spacy.CustomKB.v1",`
Define candidate generator in EL config (#5876) * candidate generator as separate part of EL config * update comment * ent instead of str as input for candidate generation * Span instead of str: correct type indication * fix types * unit test to create new candidate generator * fix replace_pipe argument passing * move error message, general cleanup * add vocab back to KB constructor * provide KB as callable from Vocab arg * rename to kb_loader, fix KB serialization as part of the EL pipe * fix typo * reformatting * cleanup * fix comment * fix wrongly duplicated code from merge conflict * rename dump to to_disk * from_disk instead of load_bulk * update test after recent removal of set_morphology in tagger * remove old doc 2020-08-18 17:10:36 +03:00			`"entity_vector_length": 342,`
			`"custom_field": 666,`
			`}`
			`}`
			`entity_linker = nlp.add_pipe("entity_linker", config=config)`
			`assert type(entity_linker.kb) == SubKnowledgeBase`
			`assert entity_linker.kb.entity_vector_length == 342`
			`assert entity_linker.kb.custom_field == 666`

			`# Make sure the custom KB is serialized correctly`
			`with make_tempdir() as tmp_dir:`
			`nlp.to_disk(tmp_dir)`
			`nlp2 = util.load_model_from_path(tmp_dir)`
			`entity_linker2 = nlp2.get_pipe("entity_linker")`
			`assert type(entity_linker2.kb) == SubKnowledgeBase`
			`assert entity_linker2.kb.entity_vector_length == 342`
			`assert entity_linker2.kb.custom_field == 666`