Make a pre-check to speed up alignment cache (#6139)

* Dirty trick to fast-track alignment cache

* Improve alignment cache check

* Fix header

* Fix align cache

* Fix align logic
This commit is contained in:
Matthew Honnibal 2020-09-24 18:13:39 +02:00 committed by GitHub
parent 26e28ed413
commit 2abb4ba9db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 32 additions and 7 deletions

View File

@ -1,4 +1,5 @@
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from libc.stdint cimport uint64_t
cdef class Example: cdef class Example:
@ -7,3 +8,5 @@ cdef class Example:
cdef readonly object _cached_alignment cdef readonly object _cached_alignment
cdef readonly object _cached_words_x cdef readonly object _cached_words_x
cdef readonly object _cached_words_y cdef readonly object _cached_words_y
cdef readonly uint64_t _x_sig
cdef readonly uint64_t _y_sig

View File

@ -1,6 +1,7 @@
from collections import Iterable as IterableInstance from collections import Iterable as IterableInstance
import warnings import warnings
import numpy import numpy
from murmurhash.mrmr cimport hash64
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..tokens.span cimport Span from ..tokens.span cimport Span
@ -97,15 +98,36 @@ cdef class Example:
@property @property
def alignment(self): def alignment(self):
words_x = [token.text for token in self.x] x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
words_y = [token.text for token in self.y] y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
if self._cached_alignment is None or \ if self._cached_alignment is None:
words_x != self._cached_words_x or \ words_x = [token.text for token in self.x]
words_y != self._cached_words_y: words_y = [token.text for token in self.y]
self._cached_alignment = Alignment.from_strings(words_x, words_y) self._x_sig = x_sig
self._y_sig = y_sig
self._cached_words_x = words_x self._cached_words_x = words_x
self._cached_words_y = words_y self._cached_words_y = words_y
return self._cached_alignment self._cached_alignment = Alignment.from_strings(words_x, words_y)
return self._cached_alignment
elif self._x_sig == x_sig and self._y_sig == y_sig:
# If we have a cached alignment, check whether the cache is invalid
# due to retokenization. To make this check fast in loops, we first
# check a hash of the TokenC arrays.
return self._cached_alignment
else:
words_x = [token.text for token in self.x]
words_y = [token.text for token in self.y]
if words_x == self._cached_words_x and words_y == self._cached_words_y:
self._x_sig = x_sig
self._y_sig = y_sig
return self._cached_alignment
else:
self._cached_alignment = Alignment.from_strings(words_x, words_y)
self._cached_words_x = words_x
self._cached_words_y = words_y
self._x_sig = x_sig
self._y_sig = y_sig
return self._cached_alignment
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""