From 2abb4ba9db0d0ec074a7336be8a7395da78eaaa4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 18:13:39 +0200
Subject: [PATCH] Make a pre-check to speed up alignment cache (#6139)

* Dirty trick to fast-track alignment cache

* Improve alignment cache check

* Fix header

* Fix align cache

* Fix align logic
---
 spacy/training/example.pxd |  3 +++
 spacy/training/example.pyx | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd
index e06e36287..49e239757 100644
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t
 
 
 cdef class Example:
@@ -7,3 +8,5 @@ cdef class Example:
     cdef readonly object _cached_alignment
     cdef readonly object _cached_words_x
     cdef readonly object _cached_words_y
+    cdef readonly uint64_t _x_sig
+    cdef readonly uint64_t _y_sig
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1e7bea5df..6a9815c44 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
+from murmurhash.mrmr cimport hash64
 
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@@ -97,15 +98,36 @@ cdef class Example:
 
     @property
     def alignment(self):
-        words_x = [token.text for token in self.x]
-        words_y = [token.text for token in self.y]
-        if self._cached_alignment is None or \
-                words_x != self._cached_words_x or \
-                words_y != self._cached_words_y:
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+        if self._cached_alignment is None:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            self._x_sig = x_sig
+            self._y_sig = y_sig
             self._cached_words_x = words_x
             self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            return self._cached_alignment
+        elif self._x_sig == x_sig and self._y_sig == y_sig:
+            # Fast path for repeated access in loops: the hashes of both TokenC
+            # arrays match the cached signatures, so we treat the cache as still
+            # valid (no retokenization) and skip comparing the word lists.
+            return self._cached_alignment
+        else:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            if words_x == self._cached_words_x and words_y == self._cached_words_y:
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
+            else:
+                self._cached_alignment = Alignment.from_strings(words_x, words_y)
+                self._cached_words_x = words_x
+                self._cached_words_y = words_y
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
 
     def get_aligned(self, field, as_string=False):
         """Return an aligned array for a token attribute."""