Fix arc_eager oracle

2025-07-05 20:33:10 +03:00 · 2020-06-23 22:58:12 +02:00 · 2020-06-23 22:58:12 +02:00 · 420a986d15
commit 420a986d15
parent a68d0e63f0
1 changed file with 51 additions and 43 deletions
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -200,7 +200,6 @@ cdef class ArcEagerGold:
        sent_starts = example.get_aligned("SENT_START")
        assert len(heads) == len(labels) == len(sent_starts)
        self.c = create_gold_state(self.mem, stcls, heads, labels, sent_starts)
-        self.update(stcls)

    def update(self, StateClass stcls):
        update_gold_state(&self.c, stcls)
@@ -577,17 +576,12 @@ cdef class ArcEager(TransitionSystem):
    def is_gold_parse(self, StateClass state, gold):
        raise NotImplementedError

-    def has_gold(self, gold, start=0, end=None):
-        raise NotImplementedError
-
-    def preprocess_gold(self, example):
-        raise NotImplementedError
-
    def init_gold_batch(self, examples):
+        examples = [eg for eg in examples if self.has_gold(eg)]
        states = self.init_batch([eg.predicted for eg in examples])
        keeps = [i for i, s in enumerate(states) if not s.is_final()]
-        states = [states[i] for i in keeps]
        golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps]
+        states = [states[i] for i in keeps]
        for gold in golds:
            self._replace_unseen_labels(gold)
        n_steps = sum([len(s.queue) * 4 for s in states])
@@ -690,6 +684,9 @@ cdef class ArcEager(TransitionSystem):
        doc.is_parsed = True
        set_children_from_heads(doc.c, doc.length)

+    def has_gold(self, Example eg):
+        return eg.y.is_parsed
+
    cdef int set_valid(self, int* output, const StateC* st) nogil:
        cdef bint[N_MOVES] is_valid
        is_valid[SHIFT] = Shift.is_valid(st, 0)
@@ -736,21 +733,29 @@ cdef class ArcEager(TransitionSystem):
            raise ValueError

    def get_oracle_sequence(self, Example example):
+        cdef StateClass state
+        cdef ArcEagerGold gold
+        states, golds, n_steps = self.init_gold_batch([example])
+        if not golds:
+            return []
+
        cdef Pool mem = Pool()
        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
        assert self.n_moves > 0
        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))

-        cdef StateClass state
-        cdef ArcEagerGold gold
-        states, golds, n_steps = self.init_gold_batch([example])
        state = states[0]
        gold = golds[0]
        history = []
        debug_log = []
+        failed = False
        while not state.is_final():
+            try:
                self.set_costs(is_valid, costs, state, gold)
+            except ValueError:
+                failed = True
+                break
            for i in range(self.n_moves):
                if is_valid[i] and costs[i] <= 0:
                    action = self.c[i]
@@ -766,6 +771,9 @@ cdef class ArcEager(TransitionSystem):
                    action.do(state.c, action.label)
                    break
            else:
+                failed = False
+                break
+        if failed:
            print("Actions")
            for i in range(self.n_moves):
                print(self.get_class_name(i))