Backport parser/alignment optimizations from feature/refactor-parser (#10952)

This commit is contained in:
Madeesh Kannan 2022-06-24 13:39:52 +02:00 committed by GitHub
parent 9738b69c0e
commit 8f1ba4de58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 123 additions and 33 deletions

View File

@ -1,33 +1,39 @@
from typing import List from typing import List
from ..errors import Errors from ..errors import Errors
import numpy import numpy
from libc.stdint cimport int32_t
cdef class AlignmentArray: cdef class AlignmentArray:
"""AlignmentArray is similar to Thinc's Ragged with two simplfications: """AlignmentArray is similar to Thinc's Ragged with two simplfications:
indexing returns numpy arrays and this type can only be used for CPU arrays. indexing returns numpy arrays and this type can only be used for CPU arrays.
However, these changes make AlginmentArray more efficient for indexing in a However, these changes make AlignmentArray more efficient for indexing in a
tight loop.""" tight loop."""
__slots__ = [] __slots__ = []
def __init__(self, alignment: List[List[int]]): def __init__(self, alignment: List[List[int]]):
self._lengths = None
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
cdef int data_len = 0 cdef int data_len = 0
cdef int outer_len cdef int outer_len
cdef int idx cdef int idx
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
for idx, outer in enumerate(alignment): for idx, outer in enumerate(alignment):
outer_len = len(outer) outer_len = len(outer)
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
data_len += outer_len data_len += outer_len
self._data = numpy.empty(data_len, dtype="i") self._lengths = None
self._data = numpy.empty(data_len, dtype="int32")
idx = 0 idx = 0
cdef int32_t* data_ptr = <int32_t*>self._data.data
for outer in alignment: for outer in alignment:
for inner in outer: for inner in outer:
self._data[idx] = inner data_ptr[idx] = inner
idx += 1 idx += 1
def __getitem__(self, idx): def __getitem__(self, idx):

View File

@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..tokens.token cimport MISSING_DEP from ..tokens.token cimport MISSING_DEP
from ..util import logger, to_ternary_int from ..util import logger, to_ternary_int, all_equal
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -151,50 +151,127 @@ cdef class Example:
self._y_sig = y_sig self._y_sig = y_sig
return self._cached_alignment return self._cached_alignment
def _get_aligned_vectorized(self, align, gold_values):
# Fast path for Doc attributes/fields that are predominantly a single value,
# i.e., TAG, POS, MORPH.
x2y_single_toks = []
x2y_single_toks_i = []
x2y_multiple_toks = []
x2y_multiple_toks_i = []
# Gather indices of gold tokens aligned to the candidate tokens into two buckets.
# Bucket 1: All tokens that have a one-to-one alignment.
# Bucket 2: All tokens that have a one-to-many alignment.
for idx, token in enumerate(self.predicted):
aligned_gold_i = align[token.i]
aligned_gold_len = len(aligned_gold_i)
if aligned_gold_len == 1:
x2y_single_toks.append(aligned_gold_i.item())
x2y_single_toks_i.append(idx)
elif aligned_gold_len > 1:
x2y_multiple_toks.append(aligned_gold_i)
x2y_multiple_toks_i.append(idx)
# Map elements of the first bucket directly to the output array.
output = numpy.full(len(self.predicted), None)
output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze()
# Collapse many-to-one alignments into one-to-one alignments if they
# share the same value. Map to None in all other cases.
for i in range(len(x2y_multiple_toks)):
aligned_gold_values = gold_values[x2y_multiple_toks[i]]
# If all aligned tokens have the same value, use it.
if all_equal(aligned_gold_values):
x2y_multiple_toks[i] = aligned_gold_values[0].item()
else:
x2y_multiple_toks[i] = None
output[x2y_multiple_toks_i] = x2y_multiple_toks
return output.tolist()
def _get_aligned_non_vectorized(self, align, gold_values):
# Slower path for fields that return multiple values (resulting
# in ragged arrays that cannot be vectorized trivially).
output = [None] * len(self.predicted)
for token in self.predicted:
aligned_gold_i = align[token.i]
values = gold_values[aligned_gold_i].ravel()
if len(values) == 1:
output[token.i] = values.item()
elif all_equal(values):
# If all aligned tokens have the same value, use it.
output[token.i] = values[0].item()
return output
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
align = self.alignment.x2y align = self.alignment.x2y
gold_values = self.reference.to_array([field])
if len(gold_values.shape) == 1:
output = self._get_aligned_vectorized(align, gold_values)
else:
output = self._get_aligned_non_vectorized(align, gold_values)
vocab = self.reference.vocab vocab = self.reference.vocab
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for token in self.predicted:
values = gold_values[align[token.i]]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]: if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output] output = [vocab.strings[o] if o is not None else o for o in output]
return output return output
def get_aligned_parse(self, projectivize=True): def get_aligned_parse(self, projectivize=True):
cand_to_gold = self.alignment.x2y cand_to_gold = self.alignment.x2y
gold_to_cand = self.alignment.y2x gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y]
heads = [token.head.i for token in self.y] heads = [token.head.i for token in self.y]
deps = [token.dep_ for token in self.y] deps = [token.dep_ for token in self.y]
if projectivize: if projectivize:
proj_heads, proj_deps = nonproj.projectivize(heads, deps) proj_heads, proj_deps = nonproj.projectivize(heads, deps)
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y]
# ensure that missing data remains missing # ensure that missing data remains missing
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)] heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)] deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
for cand_i in range(self.x.length):
if cand_to_gold.lengths[cand_i] == 1: # Select all candidate tokens that are aligned to a single gold token.
gold_i = cand_to_gold[cand_i][0] c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0]
if gold_to_cand.lengths[heads[gold_i]] == 1:
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0]) # Fetch all aligned gold token incides.
aligned_deps[cand_i] = deps[gold_i] if c2g_single_toks.shape == cand_to_gold.lengths.shape:
return aligned_heads, aligned_deps # This the most likely case.
gold_i = cand_to_gold[:].squeeze()
else:
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze()
# Fetch indices of all gold heads for the aligned gold tokens.
heads = numpy.asarray(heads, dtype='i')
gold_head_i = heads[gold_i]
# Select all gold tokens that are heads of the previously selected
# gold tokens (and are aligned to a single candidate token).
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze()
# Update head/dep alignments with the above.
aligned_heads = numpy.full((self.x.length), None)
aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i
deps = numpy.asarray(deps)
aligned_deps = numpy.full((self.x.length), None)
aligned_deps[c2g_single_toks] = deps[gold_i]
return aligned_heads.tolist(), aligned_deps.tolist()
def get_aligned_sent_starts(self): def get_aligned_sent_starts(self):
"""Get list of SENT_START attributes aligned to the predicted tokenization. """Get list of SENT_START attributes aligned to the predicted tokenization.

View File

@ -1716,3 +1716,10 @@ def packages_distributions() -> Dict[str, List[str]]:
for pkg in (dist.read_text("top_level.txt") or "").split(): for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"]) pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist) return dict(pkg_to_dist)
def all_equal(iterable):
"""Return True if all the elements are equal to each other
(or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)