* Allow gold parse to cut non-projective arcs

2026-03-06 21:01:34 +03:00 · 2015-05-31 01:11:56 +02:00 · 2015-05-31 01:11:56 +02:00 · 87d6551d19
commit 87d6551d19
parent d512d20d81
1 changed files with 37 additions and 11 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -163,7 +163,7 @@ def _consume_ent(tags):


 cdef class GoldParse:
-    def __init__(self, tokens, annot_tuples, brackets=tuple()):
+    def __init__(self, tokens, annot_tuples, brackets=tuple(), make_projective=False):
        self.mem = Pool()
        self.loss = 0
        self.length = len(tokens)
@ -196,6 +196,24 @@ cdef class GoldParse:
                self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
                self.labels[i] = annot_tuples[4][gold_i]
                self.ner[i] = annot_tuples[5][gold_i]
+       
+        # If we have any non-projective arcs, i.e. crossing brackets, consider
+        # the heads for those words missing in the gold-standard.
+        # This way, we can train from these sentences
+        cdef int w1, w2, h1, h2
+        if make_projective:
+            heads = list(self.heads)
+            for w1 in range(self.length):
+                if heads[w1] is not None:
+                    h1 = heads[w1]
+                    for w2 in range(w1+1, self.length):
+                        if heads[w2] is not None:
+                            h2 = heads[w2]
+                            if _arcs_cross(w1, h1, w2, h2):
+                                self.heads[w1] = None
+                                self.labels[w1] = ''
+                                self.heads[w2] = None
+                                self.labels[w2] = ''

        self.brackets = {}
        for (gold_start, gold_end, label_str) in brackets:
@ -210,16 +228,24 @@ cdef class GoldParse:

    @property
    def is_projective(self):
-        heads = [head for (id_, word, tag, head, dep, ner) in self.orig_annot]
-        deps = sorted([sorted(arc) for arc in enumerate(heads)])
-        for w1, h1 in deps:
-            for w2, h2 in deps:
-                if w1 < w2 < h1 < h2:
-                    return False
-                elif w1 < w2 == h2 < h1:
-                    return False
-        else:
-            return True
+        heads = list(self.heads)
+        for w1 in range(self.length):
+            if heads[w1] is not None:
+                h1 = heads[w1]
+                for w2 in range(self.length):
+                    if heads[w2] is not None and _arcs_cross(w1, h1, w2, heads[w2]):
+                        return False
+        return True
+
+
+cdef int _arcs_cross(int w1, int h1, int w2, int h2) except -1:
+    if w1 > h1:
+        w1, h1 = h1, w1
+    if w2 > h2:
+        w2, h2 = h2, w2
+    if w1 > w2:
+        w1, h1, w2, h2 = w2, h2, w1, h1
+    return w1 < w2 < h1 < h2 or w1 < w2 == h2 < h1


 def is_punct_label(label):