diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index f50f91f21..7c4bb0184 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -204,6 +204,32 @@ cdef class ArcEagerGold: def update(self, StateClass stcls): update_gold_state(&self.c, stcls.c) +def _get_aligned_sent_starts(example): + """Get list of SENT_START attributes aligned to the predicted tokenization. + If the reference has not sentence starts, return a list of None values. + + This function is slightly different from the one on Example, because we also + check whether the reference sentences align across multiple sentences, + and return missing values if they do. This prevents a problem where you have + the start of a sentence merged onto a token that belongs to two sentences. + """ + if example.y.has_annotation("SENT_START"): + align = example.alignment.y2x + sent_starts = [False] * len(example.x) + seen_words = set() + for y_sent in example.y.sents: + x_indices = list(align[y_sent.start : y_sent.end].dataXd) + if any(x_idx in seen_words for x_idx in x_indices): + # If there are any tokens in X that align across two sentences, + # regard the sentence annotations as missing, as we can't + # reliably use them. + return [None] * len(example.x) + seen_words.update(x_indices) + sent_starts[x_indices[0]] = True + return sent_starts + else: + return [None] * len(example.x) + cdef int check_state_gold(char state_bits, char flag) nogil: cdef char one = 1 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index fe4ee6ff4..bbe59e9f4 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -200,10 +200,6 @@ cdef class Example: def get_aligned_sent_starts(self): """Get list of SENT_START attributes aligned to the predicted tokenization. If the reference has not sentence starts, return a list of None values. - - The aligned sentence starts use the get_aligned_spans method, rather - than aligning the list of tags, so that it handles cases where a mistaken - tokenization starts the sentence. """ if self.y.has_annotation("SENT_START"): align = self.alignment.y2x