Make a pre-check to speed up alignment cache (#6139)

* Dirty trick to fast-track alignment cache
* Improve alignment cache check
* Fix header
* Fix align cache
* Fix align logic
2025-05-28 09:43:17 +03:00 · 2020-09-24 18:13:39 +02:00 · 2020-09-24 18:13:39 +02:00 · 2abb4ba9db
commit 2abb4ba9db
parent 26e28ed413
2 changed files with 32 additions and 7 deletions
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t


 cdef class Example:
@@ -7,3 +8,5 @@ cdef class Example:
    cdef readonly object _cached_alignment
    cdef readonly object _cached_words_x
    cdef readonly object _cached_words_y
+    cdef readonly uint64_t _x_sig
+    cdef readonly uint64_t _y_sig
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
+from murmurhash.mrmr cimport hash64

 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@@ -97,15 +98,36 @@ cdef class Example:

    @property
    def alignment(self):
-        words_x = [token.text for token in self.x]
-        words_y = [token.text for token in self.y]
-        if self._cached_alignment is None or \
-                words_x != self._cached_words_x or \
-                words_y != self._cached_words_y:
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+        if self._cached_alignment is None:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            self._x_sig = x_sig
+            self._y_sig = y_sig
            self._cached_words_x = words_x
            self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            return self._cached_alignment
+        elif self._x_sig == x_sig and self._y_sig == y_sig:
+            # If we have a cached alignment, check whether the cache is invalid
+            # due to retokenization. To make this check fast in loops, we first
+            # check a hash of the TokenC arrays.
+            return self._cached_alignment
+        else:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            if words_x == self._cached_words_x and words_y == self._cached_words_y:
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
+            else:
+                self._cached_alignment = Alignment.from_strings(words_x, words_y)
+                self._cached_words_x = words_x
+                self._cached_words_y = words_y
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment

    def get_aligned(self, field, as_string=False):
        """Return an aligned array for a token attribute."""