diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx index b58f08786..01e9d9bf8 100644 --- a/spacy/training/alignment_array.pyx +++ b/spacy/training/alignment_array.pyx @@ -1,33 +1,39 @@ from typing import List from ..errors import Errors import numpy +from libc.stdint cimport int32_t cdef class AlignmentArray: """AlignmentArray is similar to Thinc's Ragged with two simplfications: indexing returns numpy arrays and this type can only be used for CPU arrays. - However, these changes make AlginmentArray more efficient for indexing in a + However, these changes make AlignmentArray more efficient for indexing in a tight loop.""" __slots__ = [] def __init__(self, alignment: List[List[int]]): - self._lengths = None - self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i") - cdef int data_len = 0 cdef int outer_len cdef int idx + + self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32') + cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data + for idx, outer in enumerate(alignment): outer_len = len(outer) - self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len + starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len data_len += outer_len - self._data = numpy.empty(data_len, dtype="i") + self._lengths = None + self._data = numpy.empty(data_len, dtype="int32") + idx = 0 + cdef int32_t* data_ptr = <int32_t*>self._data.data + for outer in alignment: for inner in outer: - self._data[idx] = inner + data_ptr[idx] = inner idx += 1 def __getitem__(self, idx): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index ee613f5ac..f2dd0cb8a 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -12,7 +12,7 @@ from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj from ..tokens.token cimport MISSING_DEP -from ..util import logger, to_ternary_int +from ..util import logger, to_ternary_int, all_equal cpdef Doc 
annotations_to_doc(vocab, tok_annot, doc_annot): @@ -150,27 +150,81 @@ cdef class Example: self._y_sig = y_sig return self._cached_alignment + + def _get_aligned_vectorized(self, align, gold_values): + # Fast path for Doc attributes/fields that are predominantly a single value, + # i.e., TAG, POS, MORPH. + x2y_single_toks = [] + x2y_single_toks_i = [] + + x2y_multiple_toks = [] + x2y_multiple_toks_i = [] + + # Gather indices of gold tokens aligned to the candidate tokens into two buckets. + # Bucket 1: All tokens that have a one-to-one alignment. + # Bucket 2: All tokens that have a one-to-many alignment. + for idx, token in enumerate(self.predicted): + aligned_gold_i = align[token.i] + aligned_gold_len = len(aligned_gold_i) + + if aligned_gold_len == 1: + x2y_single_toks.append(aligned_gold_i.item()) + x2y_single_toks_i.append(idx) + elif aligned_gold_len > 1: + x2y_multiple_toks.append(aligned_gold_i) + x2y_multiple_toks_i.append(idx) + + # Map elements of the first bucket directly to the output array. + output = numpy.full(len(self.predicted), None) + output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze() + + # Collapse many-to-one alignments into one-to-one alignments if they + # share the same value. Map to None in all other cases. + for i in range(len(x2y_multiple_toks)): + aligned_gold_values = gold_values[x2y_multiple_toks[i]] + + # If all aligned tokens have the same value, use it. + if all_equal(aligned_gold_values): + x2y_multiple_toks[i] = aligned_gold_values[0].item() + else: + x2y_multiple_toks[i] = None + + output[x2y_multiple_toks_i] = x2y_multiple_toks + + return output.tolist() + + + def _get_aligned_non_vectorized(self, align, gold_values): + # Slower path for fields that return multiple values (resulting + # in ragged arrays that cannot be vectorized trivially). 
+ output = [None] * len(self.predicted) + + for token in self.predicted: + aligned_gold_i = align[token.i] + values = gold_values[aligned_gold_i].ravel() + if len(values) == 1: + output[token.i] = values.item() + elif all_equal(values): + # If all aligned tokens have the same value, use it. + output[token.i] = values[0].item() + + return output + + def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" align = self.alignment.x2y + gold_values = self.reference.to_array([field]) + + if len(gold_values.shape) == 1: + output = self._get_aligned_vectorized(align, gold_values) + else: + output = self._get_aligned_non_vectorized(align, gold_values) vocab = self.reference.vocab - gold_values = self.reference.to_array([field]) - output = [None] * len(self.predicted) - for token in self.predicted: - values = gold_values[align[token.i]] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. - output[token.i] = values[0] - else: - output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] + return output def get_aligned_parse(self, projectivize=True): diff --git a/spacy/util.py b/spacy/util.py index 66e257dd8..184ae2b30 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1679,3 +1679,10 @@ def packages_distributions() -> Dict[str, List[str]]: for pkg in (dist.read_text("top_level.txt") or "").split(): pkg_to_dist[pkg].append(dist.metadata["Name"]) return dict(pkg_to_dist) + + +def all_equal(iterable): + """Return True if all the elements are equal to each other + (or if the input is an empty sequence), False otherwise.""" + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False)