KB & NEL to/from bytes (#8113)

* unit test for pickling KB
* add pickling test for NEL
* KB to_bytes and from_bytes
* NEL to_bytes and from_bytes
* xfail pickle tests for now
* fix docs
* cleanup
This commit is contained in:

parent f6128c06b0
commit 202943bc8c
115	spacy/kb.pyx
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -93,6 +93,15 @@ cdef class KnowledgeBase:
         self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
 
+    def initialize_entities(self, int64_t nr_entities):
+        self._entry_index = PreshMap(nr_entities + 1)
+        self._entries = entry_vec(nr_entities + 1)
+        self._vectors_table = float_matrix(nr_entities + 1)
+
+    def initialize_aliases(self, int64_t nr_aliases):
+        self._alias_index = PreshMap(nr_aliases + 1)
+        self._aliases_table = alias_vec(nr_aliases + 1)
+
     @property
     def entity_vector_length(self):
         """RETURNS (uint64): length of the entity vectors"""
@@ -144,8 +153,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E140)
 
         nr_entities = len(set(entity_list))
-        self._entry_index = PreshMap(nr_entities+1)
-        self._entries = entry_vec(nr_entities+1)
+        self.initialize_entities(nr_entities)
 
         i = 0
         cdef KBEntryC entry
@@ -325,6 +333,102 @@ cdef class KnowledgeBase:
 
         return 0.0
 
+    def to_bytes(self, **kwargs):
+        """Serialize the current state to a binary string.
+        """
+        def serialize_header():
+            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            return srsly.json_dumps(header)
+
+        def serialize_entries():
+            i = 1
+            tuples = []
+            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+                entry = self._entries[entry_index]
+                assert entry.entity_hash == entry_hash
+                assert entry_index == i
+                tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
+                i = i + 1
+            return srsly.json_dumps(tuples)
+
+        def serialize_aliases():
+            i = 1
+            headers = []
+            indices_lists = []
+            probs_lists = []
+            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+                alias = self._aliases_table[alias_index]
+                assert alias_index == i
+                candidate_length = len(alias.entry_indices)
+                headers.append((alias_hash, candidate_length))
+                indices_lists.append(alias.entry_indices)
+                probs_lists.append(alias.probs)
+                i = i + 1
+            headers_dump = srsly.json_dumps(headers)
+            indices_dump = srsly.json_dumps(indices_lists)
+            probs_dump = srsly.json_dumps(probs_lists)
+            return srsly.json_dumps((headers_dump, indices_dump, probs_dump))
+
+        serializers = {
+            "header": serialize_header,
+            "entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
+            "entries": serialize_entries,
+            "aliases": serialize_aliases,
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load state from a binary string.
+        """
+        def deserialize_header(b):
+            header = srsly.json_loads(b)
+            nr_entities = header[0]
+            nr_aliases = header[1]
+            entity_vector_length = header[2]
+            self.initialize_entities(nr_entities)
+            self.initialize_aliases(nr_aliases)
+            self.entity_vector_length = entity_vector_length
+
+        def deserialize_vectors(b):
+            self._vectors_table = srsly.json_loads(b)
+
+        def deserialize_entries(b):
+            cdef KBEntryC entry
+            tuples = srsly.json_loads(b)
+            i = 1
+            for (entity_hash, freq, vector_index) in tuples:
+                entry.entity_hash = entity_hash
+                entry.freq = freq
+                entry.vector_index = vector_index
+                entry.feats_row = -1  # Features table currently not implemented
+                self._entries[i] = entry
+                self._entry_index[entity_hash] = i
+                i += 1
+
+        def deserialize_aliases(b):
+            cdef AliasC alias
+            i = 1
+            all_data = srsly.json_loads(b)
+            headers = srsly.json_loads(all_data[0])
+            indices = srsly.json_loads(all_data[1])
+            probs = srsly.json_loads(all_data[2])
+            for header, indices, probs in zip(headers, indices, probs):
+                alias_hash, candidate_length = header
+                alias.entry_indices = indices
+                alias.probs = probs
+                self._aliases_table[i] = alias
+                self._alias_index[alias_hash] = i
+                i += 1
+
+        setters = {
+            "header": deserialize_header,
+            "entity_vectors": deserialize_vectors,
+            "entries": deserialize_entries,
+            "aliases": deserialize_aliases,
+        }
+        util.from_bytes(bytes_data, setters, exclude)
+        return self
+
     def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
         path = ensure_path(path)
         if not path.exists():
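Both methods lean on spaCy's generic named-field helpers: every getter in the dict is called and the results are packed into one msgpack blob keyed by field name, and on load each named blob is routed to its setter unless the name is excluded. A simplified sketch of that behavior (the exact shape of `spacy/util.py` is assumed here, not quoted from this commit):

```python
import srsly

# Simplified sketch of spaCy's to_bytes/from_bytes helpers.
def to_bytes(getters, exclude):
    serialized = {}
    for key, getter in getters.items():
        if key not in exclude:
            serialized[key] = getter()  # e.g. the json_dumps results above
    return srsly.msgpack_dumps(serialized)

def from_bytes(bytes_data, setters, exclude):
    msg = srsly.msgpack_loads(bytes_data)
    for key, setter in setters.items():
        if key not in exclude and key in msg:
            setter(msg[key])
    return msg
```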
@@ -404,10 +508,8 @@ cdef class KnowledgeBase:
         cdef int64_t entity_vector_length
         reader.read_header(&nr_entities, &entity_vector_length)
 
+        self.initialize_entities(nr_entities)
         self.entity_vector_length = entity_vector_length
-        self._entry_index = PreshMap(nr_entities+1)
-        self._entries = entry_vec(nr_entities+1)
-        self._vectors_table = float_matrix(nr_entities+1)
 
         # STEP 1: load entity vectors
         cdef int i = 0
@@ -445,8 +547,7 @@ cdef class KnowledgeBase:
         # STEP 3: load aliases
         cdef int64_t nr_aliases
         reader.read_alias_length(&nr_aliases)
-        self._alias_index = PreshMap(nr_aliases+1)
-        self._aliases_table = alias_vec(nr_aliases+1)
+        self.initialize_aliases(nr_aliases)
 
         cdef int64_t nr_candidates
         cdef vector[int64_t] entry_indices
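Taken together, the `kb.pyx` changes give the KB a byte-level round trip that mirrors the existing `to_disk`/`from_disk` pair. A minimal usage sketch (the entity ID, frequency and vector are illustrative values taken from the tests below):

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

# Serialize to a bytestring and restore into a fresh KB over the same vocab.
kb_bytes = kb.to_bytes()
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb2.from_bytes(kb_bytes)
assert kb2.contains_alias("Russ Cochran")
```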
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -408,6 +408,48 @@ class EntityLinker(TrainablePipe):
         validate_examples(examples, "EntityLinker.score")
         return Scorer.score_links(examples, negative_labels=[self.NIL])
 
+    def to_bytes(self, *, exclude=tuple()):
+        """Serialize the pipe to a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://spacy.io/api/entitylinker#to_bytes
+        """
+        self._validate_serialization_attrs()
+        serialize = {}
+        if hasattr(self, "cfg") and self.cfg is not None:
+            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["kb"] = self.kb.to_bytes
+        serialize["model"] = self.model.to_bytes
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load the pipe from a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://spacy.io/api/entitylinker#from_bytes
+        """
+        self._validate_serialization_attrs()
+
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        if hasattr(self, "cfg") and self.cfg is not None:
+            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
+        deserialize["model"] = load_model
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
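With the pipe-level `to_bytes`/`from_bytes` in place, the `KnowledgeBase` also survives a whole-pipeline round trip, since `nlp.to_bytes()` delegates to each pipe. A condensed sketch of the flow the new tests exercise:

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

def create_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    return kb

nlp = English()
nlp.add_pipe("ner")
nlp.add_pipe("entity_linker", last=True).set_kb(create_kb)

# Restore into a second pipeline with the same components.
nlp2 = English()
nlp2.add_pipe("ner")
nlp2.add_pipe("entity_linker", last=True)
nlp2.from_bytes(nlp.to_bytes())
assert nlp2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran")
```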
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -2,7 +2,7 @@ from typing import Callable, Iterable
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import ENT_KB_ID
+from spacy.compat import pickle
 from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab
 
@@ -11,7 +11,7 @@ from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
-from spacy.tests.util import make_tempdir
+from spacy.tests.util import make_tempdir, make_tempfile
 from spacy.tokens import Span
 
 
@@ -290,6 +290,9 @@ def test_vocab_serialization(nlp):
         assert candidates[0].alias == adam_hash
         assert candidates[0].alias_ == "adam"
 
+        assert kb_new_vocab.get_vector("Q2") == [2]
+        assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
+
 
 def test_append_alias(nlp):
     """Test that we can append additional alias-entity pairs"""
@@ -546,6 +549,98 @@ def test_kb_serialization():
         assert "RandomWord" in nlp2.vocab.strings
 
 
+@pytest.mark.xfail(reason="Needs fixing")
+def test_kb_pickle():
+    # Test that the KB can be pickled
+    nlp = English()
+    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+    assert not kb_1.contains_alias("Russ Cochran")
+    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+    assert kb_1.contains_alias("Russ Cochran")
+    data = pickle.dumps(kb_1)
+    kb_2 = pickle.loads(data)
+    assert kb_2.contains_alias("Russ Cochran")
+
+
+@pytest.mark.xfail(reason="Needs fixing")
+def test_nel_pickle():
+    # Test that a pipeline with an EL component can be pickled
+    def create_kb(vocab):
+        kb = KnowledgeBase(vocab, entity_vector_length=3)
+        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+        return kb
+
+    nlp_1 = English()
+    nlp_1.add_pipe("ner")
+    entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True)
+    entity_linker_1.set_kb(create_kb)
+    assert nlp_1.pipe_names == ["ner", "entity_linker"]
+    assert entity_linker_1.kb.contains_alias("Russ Cochran")
+
+    data = pickle.dumps(nlp_1)
+    nlp_2 = pickle.loads(data)
+    assert nlp_2.pipe_names == ["ner", "entity_linker"]
+    entity_linker_2 = nlp_2.get_pipe("entity_linker")
+    assert entity_linker_2.kb.contains_alias("Russ Cochran")
+
+
+def test_kb_to_bytes():
+    # Test that the KB's to_bytes method works correctly
+    nlp = English()
+    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
+    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
+    kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2])
+    assert kb_1.contains_alias("Russ Cochran")
+    kb_bytes = kb_1.to_bytes()
+    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    assert not kb_2.contains_alias("Russ Cochran")
+    kb_2 = kb_2.from_bytes(kb_bytes)
+    # check that both KBs are exactly the same
+    assert kb_1.get_size_entities() == kb_2.get_size_entities()
+    assert kb_1.entity_vector_length == kb_2.entity_vector_length
+    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
+    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
+    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
+    assert kb_2.contains_alias("Russ Cochran")
+    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
+    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
+    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran"))
+    assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness"))
+
+
+def test_nel_to_bytes():
+    # Test that a pipeline with an EL component can be converted to bytes
+    def create_kb(vocab):
+        kb = KnowledgeBase(vocab, entity_vector_length=3)
+        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+        return kb
+
+    nlp_1 = English()
+    nlp_1.add_pipe("ner")
+    entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True)
+    entity_linker_1.set_kb(create_kb)
+    assert entity_linker_1.kb.contains_alias("Russ Cochran")
+    assert nlp_1.pipe_names == ["ner", "entity_linker"]
+
+    nlp_bytes = nlp_1.to_bytes()
+    nlp_2 = English()
+    nlp_2.add_pipe("ner")
+    nlp_2.add_pipe("entity_linker", last=True)
+    assert nlp_2.pipe_names == ["ner", "entity_linker"]
+    assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran")
+    nlp_2 = nlp_2.from_bytes(nlp_bytes)
+    kb_2 = nlp_2.get_pipe("entity_linker").kb
+    assert kb_2.contains_alias("Russ Cochran")
+    assert kb_2.get_vector("Q2146908") == [6, -4, 3]
+    assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
+
+
 def test_scorer_links():
     train_examples = []
     nlp = English()
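The two pickle tests are marked `xfail` for now: the commit only adds the byte-level protocol, and pickling a `cdef class` needs explicit support on top of it. One plausible way to wire that up later (a hypothetical sketch, not part of this commit) is a `__reduce__` that round-trips through `to_bytes`:

```python
# Hypothetical sketch only -- not what this commit implements.
# Pickle support could be layered on the new bytes protocol by
# rebuilding the KB from (vocab, vector length, serialized state).
def _rebuild_kb(vocab, entity_vector_length, kb_bytes):
    kb = KnowledgeBase(vocab, entity_vector_length=entity_vector_length)
    return kb.from_bytes(kb_bytes)

def __reduce__(self):
    return (_rebuild_kb, (self.vocab, self.entity_vector_length, self.to_bytes()))
```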
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -213,10 +213,10 @@ if there is no prediction.
 > kb_ids = entity_linker.predict([doc1, doc2])
 > ```
 
-| Name        | Description                                 |
-| ----------- | ------------------------------------------- |
-| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
-| **RETURNS** | `List[str]`                                 | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ |
+| Name        | Description                                                                 |
+| ----------- | --------------------------------------------------------------------------- |
+| `docs`      | The documents to predict. ~~Iterable[Doc]~~                                 |
+| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~  |
 
 ## EntityLinker.set_annotations {#set_annotations tag="method"}
 
@@ -341,6 +341,42 @@ Load the pipe from disk. Modifies the object in place and returns it.
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
 | **RETURNS**    | The modified `EntityLinker` object. ~~EntityLinker~~                                            |
 
+## EntityLinker.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker_bytes = entity_linker.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring, including the `KnowledgeBase`.
+
+| Name           | Description                                                                                  |
+| -------------- | -------------------------------------------------------------------------------------------- |
+| _keyword-only_ |                                                                                              |
+| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~  |
+| **RETURNS**    | The serialized form of the `EntityLinker` object. ~~bytes~~                                  |
+
+## EntityLinker.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> entity_linker_bytes = entity_linker.to_bytes()
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.from_bytes(entity_linker_bytes)
+> ```
+
+| Name           | Description                                                                                  |
+| -------------- | -------------------------------------------------------------------------------------------- |
+| `bytes_data`   | The data to load from. ~~bytes~~                                                             |
+| _keyword-only_ |                                                                                              |
+| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~  |
+| **RETURNS**    | The `EntityLinker` object. ~~EntityLinker~~                                                  |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
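As elsewhere in spaCy, names passed via `exclude` are skipped on both serialization and deserialization. For instance, the component can be serialized without its shared vocab (a usage sketch, assuming the strings are restored separately):

```python
# "vocab" is one of the named serialization fields registered above.
pipe_bytes = entity_linker.to_bytes(exclude=["vocab"])
entity_linker.from_bytes(pipe_bytes, exclude=["vocab"])
```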