Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`
This commit is contained in:
Madeesh Kannan 2022-05-24 18:34:23 +02:00 committed by GitHub
parent c251ea650c
commit e7e179f277
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 89 additions and 22 deletions

View File

@ -1,33 +1,39 @@
from typing import List from typing import List
from ..errors import Errors from ..errors import Errors
import numpy import numpy
from libc.stdint cimport int32_t
cdef class AlignmentArray: cdef class AlignmentArray:
"""AlignmentArray is similar to Thinc's Ragged with two simplfications: """AlignmentArray is similar to Thinc's Ragged with two simplfications:
indexing returns numpy arrays and this type can only be used for CPU arrays. indexing returns numpy arrays and this type can only be used for CPU arrays.
However, these changes make AlginmentArray more efficient for indexing in a However, these changes make AlignmentArray more efficient for indexing in a
tight loop.""" tight loop."""
__slots__ = [] __slots__ = []
def __init__(self, alignment: List[List[int]]): def __init__(self, alignment: List[List[int]]):
self._lengths = None
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
cdef int data_len = 0 cdef int data_len = 0
cdef int outer_len cdef int outer_len
cdef int idx cdef int idx
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
for idx, outer in enumerate(alignment): for idx, outer in enumerate(alignment):
outer_len = len(outer) outer_len = len(outer)
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
data_len += outer_len data_len += outer_len
self._data = numpy.empty(data_len, dtype="i") self._lengths = None
self._data = numpy.empty(data_len, dtype="int32")
idx = 0 idx = 0
cdef int32_t* data_ptr = <int32_t*>self._data.data
for outer in alignment: for outer in alignment:
for inner in outer: for inner in outer:
self._data[idx] = inner data_ptr[idx] = inner
idx += 1 idx += 1
def __getitem__(self, idx): def __getitem__(self, idx):

View File

@ -12,7 +12,7 @@ from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..tokens.token cimport MISSING_DEP from ..tokens.token cimport MISSING_DEP
from ..util import logger, to_ternary_int from ..util import logger, to_ternary_int, all_equal
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -150,27 +150,81 @@ cdef class Example:
self._y_sig = y_sig self._y_sig = y_sig
return self._cached_alignment return self._cached_alignment
def _get_aligned_vectorized(self, align, gold_values):
# Fast path for Doc attributes/fields that are predominantly a single value,
# i.e., TAG, POS, MORPH.
x2y_single_toks = []
x2y_single_toks_i = []
x2y_multiple_toks = []
x2y_multiple_toks_i = []
# Gather indices of gold tokens aligned to the candidate tokens into two buckets.
# Bucket 1: All tokens that have a one-to-one alignment.
# Bucket 2: All tokens that have a one-to-many alignment.
for idx, token in enumerate(self.predicted):
aligned_gold_i = align[token.i]
aligned_gold_len = len(aligned_gold_i)
if aligned_gold_len == 1:
x2y_single_toks.append(aligned_gold_i.item())
x2y_single_toks_i.append(idx)
elif aligned_gold_len > 1:
x2y_multiple_toks.append(aligned_gold_i)
x2y_multiple_toks_i.append(idx)
# Map elements of the first bucket directly to the output array.
output = numpy.full(len(self.predicted), None)
output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze()
# Collapse many-to-one alignments into one-to-one alignments if they
# share the same value. Map to None in all other cases.
for i in range(len(x2y_multiple_toks)):
aligned_gold_values = gold_values[x2y_multiple_toks[i]]
# If all aligned tokens have the same value, use it.
if all_equal(aligned_gold_values):
x2y_multiple_toks[i] = aligned_gold_values[0].item()
else:
x2y_multiple_toks[i] = None
output[x2y_multiple_toks_i] = x2y_multiple_toks
return output.tolist()
def _get_aligned_non_vectorized(self, align, gold_values):
# Slower path for fields that return multiple values (resulting
# in ragged arrays that cannot be vectorized trivially).
output = [None] * len(self.predicted)
for token in self.predicted:
aligned_gold_i = align[token.i]
values = gold_values[aligned_gold_i].ravel()
if len(values) == 1:
output[token.i] = values.item()
elif all_equal(values):
# If all aligned tokens have the same value, use it.
output[token.i] = values[0].item()
return output
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
align = self.alignment.x2y align = self.alignment.x2y
gold_values = self.reference.to_array([field])
if len(gold_values.shape) == 1:
output = self._get_aligned_vectorized(align, gold_values)
else:
output = self._get_aligned_non_vectorized(align, gold_values)
vocab = self.reference.vocab vocab = self.reference.vocab
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for token in self.predicted:
values = gold_values[align[token.i]]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]: if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output] output = [vocab.strings[o] if o is not None else o for o in output]
return output return output
def get_aligned_parse(self, projectivize=True): def get_aligned_parse(self, projectivize=True):

View File

@ -1679,3 +1679,10 @@ def packages_distributions() -> Dict[str, List[str]]:
for pkg in (dist.read_text("top_level.txt") or "").split(): for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"]) pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist) return dict(pkg_to_dist)
def all_equal(iterable):
"""Return True if all the elements are equal to each other
(or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)