implement split_sent with aligned SENT_START attribute

2025-11-01 16:37:45 +03:00 · 2020-06-18 19:41:53 +02:00 · 2020-06-18 19:41:53 +02:00 · 1951921230
commit 1951921230
parent d1d6f16776
1 changed files with 12 additions and 15 deletions
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -117,7 +117,7 @@ cdef class Example:
                        i = j2i_multi[j]
                        if output[i] is None:
                            output[i] = gold_values[j]
-        if as_string and field not in ["ENT_IOB"]:
+        if as_string and field not in ["ENT_IOB", "SENT_START"]:
            output = [vocab.strings[o] if o is not None else o for o in output]
        return output
@@ -146,22 +146,19 @@ cdef class Example:
        sent_starts and return a list of the new Examples"""
        if not self.reference.is_sentenced:
            return [self]
-        # TODO: Do this for misaligned somehow?
+
-        predicted_words = [t.text for t in self.predicted]
+        sent_starts = self.get_aligned("SENT_START")
-        reference_words = [t.text for t in self.reference]
+        sent_starts.append(1)   # appending virtual start of a next sentence to facilitate search
-        if predicted_words != reference_words:
+
-            raise NotImplementedError("TODO: Implement this")
-        # Implement the easy case.
        output = []
-        cls = self.__class__
+        pred_start = 0
        for sent in self.reference.sents:
-            # I guess for misaligned we just need to use the gold_to_cand?
+            new_ref = sent.as_doc()
-            output.append(
+            pred_end = sent_starts.index(1, pred_start+1)  # find where the next sentence starts
-                cls(
+            new_pred = self.predicted[pred_start : pred_end].as_doc()
-                    self.predicted[sent.start : sent.end + 1].as_doc(),
+            output.append(Example(new_pred, new_ref))
-                    sent.as_doc()
+            pred_start = pred_end
-                )
+
-            )
        return output
    property text: