Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`
This commit is contained in:
Madeesh Kannan 2022-05-24 18:34:23 +02:00 committed by GitHub
parent c251ea650c
commit e7e179f277
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 89 additions and 22 deletions

View File

@ -1,33 +1,39 @@
from typing import List
from ..errors import Errors
import numpy
from libc.stdint cimport int32_t
cdef class AlignmentArray:
"""AlignmentArray is similar to Thinc's Ragged with two simplfications:
indexing returns numpy arrays and this type can only be used for CPU arrays.
However, these changes make AlginmentArray more efficient for indexing in a
However, these changes make AlignmentArray more efficient for indexing in a
tight loop."""
__slots__ = []
def __init__(self, alignment: List[List[int]]):
self._lengths = None
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
cdef int data_len = 0
cdef int outer_len
cdef int idx
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
for idx, outer in enumerate(alignment):
outer_len = len(outer)
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len
starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
data_len += outer_len
self._data = numpy.empty(data_len, dtype="i")
self._lengths = None
self._data = numpy.empty(data_len, dtype="int32")
idx = 0
cdef int32_t* data_ptr = <int32_t*>self._data.data
for outer in alignment:
for inner in outer:
self._data[idx] = inner
data_ptr[idx] = inner
idx += 1
def __getitem__(self, idx):

View File

@ -12,7 +12,7 @@ from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj
from ..tokens.token cimport MISSING_DEP
from ..util import logger, to_ternary_int
from ..util import logger, to_ternary_int, all_equal
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -150,27 +150,81 @@ cdef class Example:
self._y_sig = y_sig
return self._cached_alignment
def _get_aligned_vectorized(self, align, gold_values):
# Fast path for Doc attributes/fields that are predominantly a single value,
# i.e., TAG, POS, MORPH.
x2y_single_toks = []
x2y_single_toks_i = []
x2y_multiple_toks = []
x2y_multiple_toks_i = []
# Gather indices of gold tokens aligned to the candidate tokens into two buckets.
# Bucket 1: All tokens that have a one-to-one alignment.
# Bucket 2: All tokens that have a one-to-many alignment.
for idx, token in enumerate(self.predicted):
aligned_gold_i = align[token.i]
aligned_gold_len = len(aligned_gold_i)
if aligned_gold_len == 1:
x2y_single_toks.append(aligned_gold_i.item())
x2y_single_toks_i.append(idx)
elif aligned_gold_len > 1:
x2y_multiple_toks.append(aligned_gold_i)
x2y_multiple_toks_i.append(idx)
# Map elements of the first bucket directly to the output array.
output = numpy.full(len(self.predicted), None)
output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze()
# Collapse many-to-one alignments into one-to-one alignments if they
# share the same value. Map to None in all other cases.
for i in range(len(x2y_multiple_toks)):
aligned_gold_values = gold_values[x2y_multiple_toks[i]]
# If all aligned tokens have the same value, use it.
if all_equal(aligned_gold_values):
x2y_multiple_toks[i] = aligned_gold_values[0].item()
else:
x2y_multiple_toks[i] = None
output[x2y_multiple_toks_i] = x2y_multiple_toks
return output.tolist()
def _get_aligned_non_vectorized(self, align, gold_values):
# Slower path for fields that return multiple values (resulting
# in ragged arrays that cannot be vectorized trivially).
output = [None] * len(self.predicted)
for token in self.predicted:
aligned_gold_i = align[token.i]
values = gold_values[aligned_gold_i].ravel()
if len(values) == 1:
output[token.i] = values.item()
elif all_equal(values):
# If all aligned tokens have the same value, use it.
output[token.i] = values[0].item()
return output
def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute."""
align = self.alignment.x2y
gold_values = self.reference.to_array([field])
if len(gold_values.shape) == 1:
output = self._get_aligned_vectorized(align, gold_values)
else:
output = self._get_aligned_non_vectorized(align, gold_values)
vocab = self.reference.vocab
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for token in self.predicted:
values = gold_values[align[token.i]]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output]
return output
def get_aligned_parse(self, projectivize=True):

View File

@ -1679,3 +1679,10 @@ def packages_distributions() -> Dict[str, List[str]]:
for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist)
def all_equal(iterable):
"""Return True if all the elements are equal to each other
(or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)