mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Make a pre-check to speed up alignment cache (#6139)
* Dirty trick to fast-track alignment cache
* Improve alignment cache check
* Fix header
* Fix align cache
* Fix align logic
This commit is contained in:
parent
26e28ed413
commit
2abb4ba9db
|
@ -1,4 +1,5 @@
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
|
@ -7,3 +8,5 @@ cdef class Example:
|
||||||
cdef readonly object _cached_alignment
|
cdef readonly object _cached_alignment
|
||||||
cdef readonly object _cached_words_x
|
cdef readonly object _cached_words_x
|
||||||
cdef readonly object _cached_words_y
|
cdef readonly object _cached_words_y
|
||||||
|
cdef readonly uint64_t _x_sig
|
||||||
|
cdef readonly uint64_t _y_sig
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from collections import Iterable as IterableInstance
|
from collections import Iterable as IterableInstance
|
||||||
import warnings
|
import warnings
|
||||||
import numpy
|
import numpy
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
|
@ -97,15 +98,36 @@ cdef class Example:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def alignment(self):
|
def alignment(self):
|
||||||
words_x = [token.text for token in self.x]
|
x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
|
||||||
words_y = [token.text for token in self.y]
|
y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
|
||||||
if self._cached_alignment is None or \
|
if self._cached_alignment is None:
|
||||||
words_x != self._cached_words_x or \
|
words_x = [token.text for token in self.x]
|
||||||
words_y != self._cached_words_y:
|
words_y = [token.text for token in self.y]
|
||||||
self._cached_alignment = Alignment.from_strings(words_x, words_y)
|
self._x_sig = x_sig
|
||||||
|
self._y_sig = y_sig
|
||||||
self._cached_words_x = words_x
|
self._cached_words_x = words_x
|
||||||
self._cached_words_y = words_y
|
self._cached_words_y = words_y
|
||||||
return self._cached_alignment
|
self._cached_alignment = Alignment.from_strings(words_x, words_y)
|
||||||
|
return self._cached_alignment
|
||||||
|
elif self._x_sig == x_sig and self._y_sig == y_sig:
|
||||||
|
# If we have a cached alignment, check whether the cache is invalid
|
||||||
|
# due to retokenization. To make this check fast in loops, we first
|
||||||
|
# check a hash of the TokenC arrays.
|
||||||
|
return self._cached_alignment
|
||||||
|
else:
|
||||||
|
words_x = [token.text for token in self.x]
|
||||||
|
words_y = [token.text for token in self.y]
|
||||||
|
if words_x == self._cached_words_x and words_y == self._cached_words_y:
|
||||||
|
self._x_sig = x_sig
|
||||||
|
self._y_sig = y_sig
|
||||||
|
return self._cached_alignment
|
||||||
|
else:
|
||||||
|
self._cached_alignment = Alignment.from_strings(words_x, words_y)
|
||||||
|
self._cached_words_x = words_x
|
||||||
|
self._cached_words_y = words_y
|
||||||
|
self._x_sig = x_sig
|
||||||
|
self._y_sig = y_sig
|
||||||
|
return self._cached_alignment
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user