Add vector deduplication (#10551)

* Add vector deduplication

* Add `Vocab.deduplicate_vectors()`
* Always run deduplication in `spacy init vectors`
* Clean up a few vector-related error messages and docs examples

* Always unique with numpy

* Fix types
Adriane Boyd 2022-03-30 08:54:23 +02:00 committed by GitHub
parent 9966e08f32
commit f98b41c390
6 changed files with 85 additions and 7 deletions
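
For readers skimming the diff, a minimal sketch of what the new method does. The toy vectors and words below are illustrative and not taken from the commit:

```python
import numpy
from spacy.vectors import Vectors
from spacy.vocab import Vocab

# Three rows, two of which are identical duplicates.
data = numpy.asarray([[1.0, 1.0], [2.0, 2.0], [1.0, 1.0]], dtype="f")
vocab = Vocab()
vocab.vectors = Vectors(strings=vocab.strings, data=data, keys=["cat", "dog", "kitty"])
assert vocab.vectors.shape[0] == 3

vocab.deduplicate_vectors()

# Only the unique rows remain, and words that shared a vector now share a row.
assert vocab.vectors.shape[0] == 2
assert (
    vocab.vectors.key2row[vocab.strings["cat"]]
    == vocab.vectors.key2row[vocab.strings["kitty"]]
)
```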

spacy/errors.py

@@ -528,7 +528,7 @@ class Errors(metaclass=ErrorsWithCodes):
    E858 = ("The {mode} vector table does not support this operation. "
            "{alternative}")
    E859 = ("The floret vector table cannot be modified.")
-   E860 = ("Can't truncate fasttext-bloom vectors.")
+   E860 = ("Can't truncate floret vectors.")
    E861 = ("No 'keys' should be provided when initializing floret vectors "
            "with 'minn' and 'maxn'.")
    E862 = ("'hash_count' must be between 1-4 for floret vectors.")

spacy/tests/vocab_vectors/test_vectors.py

@@ -455,6 +455,39 @@ def test_vectors_get_batch():
    assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words)))


def test_vectors_deduplicate():
    data = OPS.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
    v = Vectors(data=data, keys=["a1", "b1", "c1", "a2", "c2"])
    vocab = Vocab()
    vocab.vectors = v
    # duplicate vectors do not use the same keys
    assert (
        vocab.vectors.key2row[v.strings["a1"]] != vocab.vectors.key2row[v.strings["a2"]]
    )
    assert (
        vocab.vectors.key2row[v.strings["c1"]] != vocab.vectors.key2row[v.strings["c2"]]
    )
    vocab.deduplicate_vectors()
    # there are three unique vectors
    assert vocab.vectors.shape[0] == 3
    # the uniqued data is the same as the deduplicated data
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # duplicate vectors use the same keys now
    assert (
        vocab.vectors.key2row[v.strings["a1"]] == vocab.vectors.key2row[v.strings["a2"]]
    )
    assert (
        vocab.vectors.key2row[v.strings["c1"]] == vocab.vectors.key2row[v.strings["c2"]]
    )
    # deduplicating again makes no changes
    vocab_b = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert vocab_b == vocab.to_bytes()


@pytest.fixture()
def floret_vectors_hashvec_str():
    """The full hashvec table from floret with the settings:

spacy/training/initialize.py

@@ -213,6 +213,7 @@ def convert_vectors(
        for lex in nlp.vocab:
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)  # type: ignore[attr-defined]
        nlp.vocab.deduplicate_vectors()
    else:
        if vectors_loc:
            logger.info(f"Reading vectors from {vectors_loc}")
@@ -239,6 +240,7 @@ convert_vectors(
                nlp.vocab.vectors = Vectors(
                    strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
                )
        nlp.vocab.deduplicate_vectors()
    if name is None:
        # TODO: Is this correct? Does this matter?
        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
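
Not part of the diff, but a quick sanity check of the `spacy init vectors` change: load the pipeline it produced (the path below is a placeholder) and confirm the vector table no longer contains duplicate rows.

```python
import numpy
import spacy
from thinc.api import get_current_ops

nlp = spacy.load("./output_dir")  # placeholder: output directory of `spacy init vectors`
ops = get_current_ops()
data = ops.to_numpy(nlp.vocab.vectors.data)
# After this commit, every row in the table should be unique.
assert numpy.unique(data, axis=0).shape[0] == data.shape[0]
```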

spacy/vocab.pyi

@@ -46,6 +46,7 @@ class Vocab:
    def reset_vectors(
        self, *, width: Optional[int] = ..., shape: Optional[int] = ...
    ) -> None: ...
    def deduplicate_vectors(self) -> None: ...
    def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
    def get_vector(
        self,

spacy/vocab.pyx

@@ -1,6 +1,7 @@
# cython: profile=True
from libc.string cimport memcpy
import numpy
import srsly
from thinc.api import get_array_module, get_current_ops
import functools
@@ -297,6 +298,33 @@ cdef class Vocab:
        width = width if width is not None else self.vectors.shape[1]
        self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))

    def deduplicate_vectors(self):
        if self.vectors.mode != VectorsMode.default:
            raise ValueError(Errors.E858.format(
                mode=self.vectors.mode,
                alternative=""
            ))
        ops = get_current_ops()
        xp = get_array_module(self.vectors.data)
        filled = xp.asarray(
            sorted(list({row for row in self.vectors.key2row.values()}))
        )
        # deduplicate data and remap keys
        data = numpy.unique(ops.to_numpy(self.vectors.data[filled]), axis=0)
        data = ops.asarray(data)
        if data.shape == self.vectors.data.shape:
            # nothing to deduplicate
            return
        row_by_bytes = {row.tobytes(): i for i, row in enumerate(data)}
        key2row = {
            key: row_by_bytes[self.vectors.data[row].tobytes()]
            for key, row in self.vectors.key2row.items()
        }
        # replace vectors with deduplicated version
        self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
        for key, row in key2row.items():
            self.vectors.add(key, row=row)

    def prune_vectors(self, nr_row, batch_size=1024):
        """Reduce the current vector table to `nr_row` unique entries. Words
        mapped to the discarded vectors will be remapped to the closest vector
@@ -325,7 +353,10 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#prune_vectors
        """
        if self.vectors.mode != VectorsMode.default:
-           raise ValueError(Errors.E866)
+           raise ValueError(Errors.E858.format(
+               mode=self.vectors.mode,
+               alternative=""
+           ))
        ops = get_current_ops()
        xp = get_array_module(self.vectors.data)
        # Make sure all vectors are in the vocab
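
The core of `deduplicate_vectors()` above is straightforward to reproduce outside spaCy: take the unique rows of the table with `numpy.unique`, then remap every key by matching its old row's raw bytes against the deduplicated array. A standalone sketch of that idea with toy data (plain numpy, no spaCy types):

```python
import numpy

# Toy vector table: rows 0 and 3 are identical, as are rows 2 and 4.
data = numpy.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
key2row = {"a1": 0, "b1": 1, "c1": 2, "a2": 3, "c2": 4}

# Deduplicate the rows; numpy.unique also sorts them, which is fine because
# the keys are remapped afterwards.
unique = numpy.unique(data, axis=0)

# Map each unique row's bytes to its new index, then remap the keys.
row_by_bytes = {row.tobytes(): i for i, row in enumerate(unique)}
new_key2row = {key: row_by_bytes[data[row].tobytes()] for key, row in key2row.items()}

assert unique.shape[0] == 3
assert new_key2row["a1"] == new_key2row["a2"]
assert new_key2row["c1"] == new_key2row["c2"]
```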

website/docs/api/vocab.md

@@ -156,7 +156,7 @@ cosines are calculated in minibatches to reduce memory usage.
>
> ```python
> nlp.vocab.prune_vectors(10000)
-> assert len(nlp.vocab.vectors) <= 1000
+> assert len(nlp.vocab.vectors) <= 10000
> ```

| Name | Description |
@@ -165,6 +165,17 @@ cosines are calculated in minibatches to reduce memory usage.
| `batch_size` | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. ~~int~~ |
| **RETURNS** | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. ~~Dict[str, Tuple[str, float]]~~ |

## Vocab.deduplicate_vectors {#deduplicate_vectors tag="method" new="3.3"}

> #### Example
>
> ```python
> nlp.vocab.deduplicate_vectors()
> ```

Remove any duplicate rows from the current vector table, maintaining the
mappings for all words in the vectors.

## Vocab.get_vector {#get_vector tag="method" new="2"}

Retrieve a vector for a word in the vocabulary. Words can be looked up by string
@@ -178,10 +189,10 @@ or hash value. If the current vectors do not contain an entry for the word, a
> nlp.vocab.get_vector("apple")
> ```

-| Name | Description |
-| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
-| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+| Name | Description |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
+| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |

## Vocab.set_vector {#set_vector tag="method" new="2"}