# cython: profile=True, cdivision=True, infer_types=True from cymem.cymem cimport Address, Pool from libc.stdint cimport int32_t from libcpp.vector cimport vector from collections import Counter, defaultdict from ...strings cimport hash_string from ...structs cimport TokenC from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.token cimport MISSING_DEP from ...typedefs cimport attr_t, hash_t from ...training import split_bilu_label from ...training.example cimport Example from ._state cimport ArcC, StateC from .stateclass cimport StateClass from ...errors import Errors from thinc.extra.search cimport Beam cdef weight_t MIN_SCORE = -90000 cdef attr_t SUBTOK_LABEL = hash_string('subtok') DEF NON_MONOTONIC = True cdef enum: SHIFT REDUCE LEFT RIGHT BREAK N_MOVES MOVE_NAMES = [None] * N_MOVES MOVE_NAMES[SHIFT] = 'S' MOVE_NAMES[REDUCE] = 'D' MOVE_NAMES[LEFT] = 'L' MOVE_NAMES[RIGHT] = 'R' MOVE_NAMES[BREAK] = 'B' cdef enum: HEAD_IN_STACK = 0 HEAD_IN_BUFFER HEAD_UNKNOWN IS_SENT_START SENT_START_UNKNOWN cdef struct GoldParseStateC: char* state_bits int32_t* n_kids_in_buffer int32_t* n_kids_in_stack int32_t* heads attr_t* labels int32_t** kids int32_t* n_kids int32_t length int32_t stride weight_t push_cost weight_t pop_cost cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state, heads, labels, sent_starts) except *: cdef GoldParseStateC gs gs.length = len(heads) gs.stride = 1 assert gs.length > 0 gs.labels = mem.alloc(gs.length, sizeof(gs.labels[0])) gs.heads = mem.alloc(gs.length, sizeof(gs.heads[0])) gs.n_kids = mem.alloc(gs.length, sizeof(gs.n_kids[0])) gs.state_bits = mem.alloc(gs.length, sizeof(gs.state_bits[0])) gs.n_kids_in_buffer = mem.alloc(gs.length, sizeof(gs.n_kids_in_buffer[0])) gs.n_kids_in_stack = mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) for i, is_sent_start in enumerate(sent_starts): if is_sent_start == True: gs.state_bits[i] = set_state_flag( gs.state_bits[i], IS_SENT_START, 1 ) gs.state_bits[i] = set_state_flag( gs.state_bits[i], SENT_START_UNKNOWN, 0 ) elif is_sent_start is None: gs.state_bits[i] = set_state_flag( gs.state_bits[i], SENT_START_UNKNOWN, 1 ) gs.state_bits[i] = set_state_flag( gs.state_bits[i], IS_SENT_START, 0 ) else: gs.state_bits[i] = set_state_flag( gs.state_bits[i], SENT_START_UNKNOWN, 0 ) gs.state_bits[i] = set_state_flag( gs.state_bits[i], IS_SENT_START, 0 ) for i, (head, label) in enumerate(zip(heads, labels)): if head is not None: gs.heads[i] = head gs.labels[i] = label if i != head: gs.n_kids[head] += 1 gs.state_bits[i] = set_state_flag( gs.state_bits[i], HEAD_UNKNOWN, 0 ) else: gs.state_bits[i] = set_state_flag( gs.state_bits[i], HEAD_UNKNOWN, 1 ) # Make an array of pointers, pointing into the gs_kids_flat array. assert gs.length > 0 gs.kids = mem.alloc(gs.length, sizeof(int32_t*)) for i in range(gs.length): if gs.n_kids[i] != 0: gs.kids[i] = mem.alloc(gs.n_kids[i], sizeof(int32_t)) # This is a temporary buffer js_addr = Address(gs.length, sizeof(int32_t)) js = js_addr.ptr for i in range(gs.length): if not is_head_unknown(&gs, i): head = gs.heads[i] if head != i: gs.kids[head][js[head]] = i js[head] += 1 gs.push_cost = push_cost(state, &gs) gs.pop_cost = pop_cost(state, &gs) return gs cdef void update_gold_state(GoldParseStateC* gs, const StateC* s) nogil: for i in range(gs.length): gs.state_bits[i] = set_state_flag( gs.state_bits[i], HEAD_IN_BUFFER, 0 ) gs.state_bits[i] = set_state_flag( gs.state_bits[i], HEAD_IN_STACK, 0 ) gs.n_kids_in_stack[i] = 0 gs.n_kids_in_buffer[i] = 0 for i in range(s.stack_depth()): s_i = s.S(i) if not is_head_unknown(gs, s_i) and gs.heads[s_i] != s_i: gs.n_kids_in_stack[gs.heads[s_i]] += 1 for kid in gs.kids[s_i][:gs.n_kids[s_i]]: gs.state_bits[kid] = set_state_flag( gs.state_bits[kid], HEAD_IN_STACK, 1 ) for i in range(s.buffer_length()): b_i = s.B(i) if s.is_sent_start(b_i): break if not is_head_unknown(gs, b_i) and gs.heads[b_i] != b_i: gs.n_kids_in_buffer[gs.heads[b_i]] += 1 for kid in gs.kids[b_i][:gs.n_kids[b_i]]: gs.state_bits[kid] = set_state_flag( gs.state_bits[kid], HEAD_IN_BUFFER, 1 ) gs.push_cost = push_cost(s, gs) gs.pop_cost = pop_cost(s, gs) cdef class ArcEagerGold: cdef GoldParseStateC c cdef Pool mem def __init__(self, ArcEager moves, StateClass stcls, Example example): self.mem = Pool() heads, labels = example.get_aligned_parse(projectivize=True) labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels] sent_starts = _get_aligned_sent_starts(example) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) def update(self, StateClass stcls): update_gold_state(&self.c, stcls.c) def _get_aligned_sent_starts(example): """Get list of SENT_START attributes aligned to the predicted tokenization. If the reference has not sentence starts, return a list of None values. This function is slightly different from the one on Example, because we also check whether the reference sentences align across multiple sentences, and return missing values if they do. This prevents a problem where you have the start of a sentence merged onto a token that belongs to two sentences. """ if example.y.has_annotation("SENT_START"): align = example.alignment.y2x sent_starts = [False] * len(example.x) seen_words = set() for y_sent in example.y.sents: x_indices = list(align[y_sent.start : y_sent.end]) if any(x_idx in seen_words for x_idx in x_indices): # If there are any tokens in X that align across two sentences, # regard the sentence annotations as missing, as we can't # reliably use them. return [None] * len(example.x) seen_words.update(x_indices) sent_starts[x_indices[0]] = True return sent_starts else: return [None] * len(example.x) cdef int check_state_gold(char state_bits, char flag) nogil: cdef char one = 1 return 1 if (state_bits & (one << flag)) else 0 cdef int set_state_flag(char state_bits, char flag, int value) nogil: cdef char one = 1 if value: return state_bits | (one << flag) else: return state_bits & ~(one << flag) cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil: return check_state_gold(gold.state_bits[i], HEAD_IN_STACK) cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil: return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER) cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil: return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN) cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil: return check_state_gold(gold.state_bits[i], IS_SENT_START) cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil: return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN) # Helper functions for the arc-eager oracle cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) nogil: cdef weight_t cost = 0 b0 = state.B(0) if b0 < 0: return 9000 if is_head_in_stack(gold, b0): cost += 1 cost += gold.n_kids_in_stack[b0] if Break.is_valid(state, 0) and is_sent_start(gold, state.B(1)): cost += 1 return cost cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) nogil: cdef weight_t cost = 0 s0 = state.S(0) if s0 < 0: return 9000 if is_head_in_buffer(gold, s0): cost += 1 cost += gold.n_kids_in_buffer[s0] return cost cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil: if is_head_unknown(gold, child): return True elif gold.heads[child] == head: return True else: return False cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) nogil: if is_head_unknown(gold, child): return True elif label == 0: return True elif gold.labels[child] == label: return True else: return False cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil: return gold.heads[word] == word or is_head_unknown(gold, word) cdef class Shift: """Move the first word of the buffer onto the stack and mark it as "shifted" Validity: * If stack is empty * At least two words in sentence * Word has not been shifted before Cost: push_cost Action: * Mark B[0] as 'shifted' * Push stack * Advance buffer """ @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: if st.stack_depth() == 0: return 1 elif st.buffer_length() < 2: return 0 elif st.is_sent_start(st.B(0)): return 0 elif st.is_unshiftable(st.B(0)): return 0 else: return 1 @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.push() @staticmethod cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil: gold = _gold return gold.push_cost cdef class Reduce: """ Pop from the stack. If it has no head and the stack isn't empty, place it back on the buffer. Validity: * Stack not empty * Buffer nt empty * Stack depth 1 and cannot sent start l_edge(st.B(0)) Cost: * If B[0] is the start of a sentence, cost is 0 * Arcs between stack and buffer * If arc has no head, we're saving arcs between S[0] and S[1:], so decrement cost by those arcs. """ @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: if st.stack_depth() == 0: return False elif st.buffer_length() == 0: return True elif st.stack_depth() == 1 and st.cannot_sent_start(st.l_edge(st.B(0))): return False else: return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: if st.has_head(st.S(0)) or st.stack_depth() == 1: st.pop() else: st.unshift() @staticmethod cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil: gold = _gold if state.is_sent_start(state.B(0)): return 0 s0 = state.S(0) cost = gold.pop_cost if not state.has_head(s0): # Decrement cost for the arcs we save, as we'll be putting this # back to the buffer if is_head_in_stack(gold, s0): cost -= 1 cost -= gold.n_kids_in_stack[s0] return cost cdef class LeftArc: """Add an arc between B[0] and S[0], replacing the previous head of S[0] if one is set. Pop S[0] from the stack. Validity: * len(S) >= 1 * len(B) >= 1 * not is_sent_start(B[0]) Cost: pop_cost - Arc(B[0], S[0], label) + (Arc(S[1], S[0]) if H(S[0]) else Arcs(S, S[0])) """ @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: if st.stack_depth() == 0: return 0 elif st.buffer_length() == 0: return 0 elif st.is_sent_start(st.B(0)): return 0 elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1): return 0 else: return 1 @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.add_arc(st.B(0), st.S(0), label) # If we change the stack, it's okay to remove the shifted mark, as # we can't get in an infinite loop this way. st.set_reshiftable(st.B(0)) st.pop() @staticmethod cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil: gold = _gold cdef weight_t cost = gold.pop_cost s0 = state.S(0) s1 = state.S(1) b0 = state.B(0) if state.has_head(s0): # Increment cost if we're clobbering a correct arc cost += gold.heads[s0] == s1 else: # If there's no head, we're losing arcs between S0 and S[1:]. cost += is_head_in_stack(gold, s0) cost += gold.n_kids_in_stack[s0] if b0 != -1 and s0 != -1 and gold.heads[s0] == b0: cost -= 1 cost += not label_is_gold(gold, s0, label) return cost cdef class RightArc: """ Add an arc from S[0] to B[0]. Push B[0]. Validity: * len(S) >= 1 * len(B) >= 1 * not is_sent_start(B[0]) Cost: push_cost + (not shifted[b0] and Arc(B[1:], B[0])) - Arc(S[0], B[0], label) """ @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: if st.stack_depth() == 0: return 0 elif st.buffer_length() == 0: return 0 elif st.is_sent_start(st.B(0)): return 0 elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1): # If there's (perhaps partial) parse pre-set, don't allow cycle. return 0 else: return 1 @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.add_arc(st.S(0), st.B(0), label) st.push() @staticmethod cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil: gold = _gold cost = gold.push_cost s0 = state.S(0) b0 = state.B(0) if s0 != -1 and b0 != -1 and gold.heads[b0] == s0: cost -= 1 cost += not label_is_gold(gold, b0, label) elif is_head_in_buffer(gold, b0) and not state.is_unshiftable(b0): cost += 1 return cost cdef class Break: """Mark the second word of the buffer as the start of a sentence. Validity: * len(buffer) >= 2 * B[1] == B[0] + 1 * not is_sent_start(B[1]) * not cannot_sent_start(B[1]) Action: * mark_sent_start(B[1]) Cost: * not is_sent_start(B[1]) * Arcs between B[0] and B[1:] * Arcs between S and B[1] """ @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef int i if st.buffer_length() < 2: return False elif st.B(1) != st.B(0) + 1: return False elif st.is_sent_start(st.B(1)): return False elif st.cannot_sent_start(st.B(1)): return False else: return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.set_sent_start(st.B(1), 1) @staticmethod cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil: gold = _gold cdef int b0 = state.B(0) cdef int cost = 0 cdef int si for i in range(state.stack_depth()): si = state.S(i) if is_head_in_buffer(gold, si): cost += 1 cost += gold.n_kids_in_buffer[si] # We need to score into B[1:], so subtract deps that are at b0 if gold.heads[b0] == si: cost -= 1 if gold.heads[si] == b0: cost -= 1 if not is_sent_start(gold, state.B(1)) \ and not is_sent_start_unknown(gold, state.B(1)): cost += 1 return cost cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: st = new StateC(tokens, length) return st cdef int _del_state(Pool mem, void* state, void* x) except -1: cdef StateC* st = state del st cdef class ArcEager(TransitionSystem): def __init__(self, *args, **kwargs): TransitionSystem.__init__(self, *args, **kwargs) self.init_beam_state = _init_state self.del_beam_state = _del_state @classmethod def get_actions(cls, **kwargs): min_freq = kwargs.get('min_freq', None) actions = defaultdict(lambda: Counter()) actions[SHIFT][''] = 1 actions[REDUCE][''] = 1 for label in kwargs.get('left_labels', []): actions[LEFT][label] = 1 actions[SHIFT][label] = 1 for label in kwargs.get('right_labels', []): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('examples', []): # use heads and labels from the reference parse (without regard to # misalignments between the predicted and reference) example_gold_preproc = Example(example.reference, example.reference) heads, labels = example_gold_preproc.get_aligned_parse(projectivize=True) for child, (head, label) in enumerate(zip(heads, labels)): if head is None or label is None: continue if label.upper() == 'ROOT' : label = 'ROOT' if head == child: actions[BREAK][label] += 1 if head < child: actions[RIGHT][label] += 1 actions[REDUCE][''] += 1 elif head > child: actions[LEFT][label] += 1 actions[SHIFT][''] += 1 if min_freq is not None: for action, label_freqs in actions.items(): for label, freq in label_freqs.copy().items(): if freq < min_freq: label_freqs.pop(label) # Ensure these actions are present actions[BREAK].setdefault('ROOT', 0) if kwargs.get("learn_tokens") is True: actions[RIGHT].setdefault('subtok', 0) actions[LEFT].setdefault('subtok', 0) # Used for backoff actions[RIGHT].setdefault('dep', 0) actions[LEFT].setdefault('dep', 0) return actions @property def builtin_labels(self): return ["ROOT", "dep"] @property def action_types(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) def get_doc_labels(self, doc): """Get the labels required for a given Doc.""" labels = set(self.builtin_labels) for token in doc: if token.dep_: labels.add(token.dep_) return labels def transition(self, StateClass state, action): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) return state def is_gold_parse(self, StateClass state, ArcEagerGold gold): for i in range(state.c.length): token = state.c.safe_get(i) if not arc_is_gold(&gold.c, i, i+token.head): return False elif not label_is_gold(&gold.c, i, token.dep): return False return True def init_gold(self, StateClass state, Example example): gold = ArcEagerGold(self, state, example) self._replace_unseen_labels(gold) return gold def init_gold_batch(self, examples): # TODO: Projectivity? all_states = self.init_batch([eg.predicted for eg in examples]) golds = [] states = [] for state, eg in zip(all_states, examples): if self.has_gold(eg) and not state.is_final(): golds.append(self.init_gold(state, eg)) states.append(state) n_steps = sum([len(s.queue) for s in states]) return states, golds, n_steps def _replace_unseen_labels(self, ArcEagerGold gold): backoff_label = self.strings["dep"] root_label = self.strings["ROOT"] left_labels = self.labels[LEFT] right_labels = self.labels[RIGHT] break_labels = self.labels[BREAK] for i in range(gold.c.length): if not is_head_unknown(&gold.c, i): head = gold.c.heads[i] label = self.strings[gold.c.labels[i]] if head > i and label not in left_labels: gold.c.labels[i] = backoff_label elif head < i and label not in right_labels: gold.c.labels[i] = backoff_label elif head == i and label not in break_labels: gold.c.labels[i] = root_label return gold cdef Transition lookup_transition(self, object name_or_id) except *: if isinstance(name_or_id, int): return self.c[name_or_id] name = name_or_id if '-' in name: move_str, label_str = split_bilu_label(name) label = self.strings[label_str] else: move_str = name label = 0 move = MOVE_NAMES.index(move_str) for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] raise KeyError(f"Unknown transition: {name}") def move_name(self, int move, attr_t label): label_str = self.strings[label] if label_str: return MOVE_NAMES[move] + '-' + label_str else: return MOVE_NAMES[move] def class_name(self, int i): return self.move_name(self.c[i].move, self.c[i].label) cdef Transition init_transition(self, int clas, int move, attr_t label) except *: # TODO: Apparent Cython bug here when we try to use the Transition() # constructor with the function pointers cdef Transition t t.score = 0 t.clas = clas t.move = move t.label = label if move == SHIFT: t.is_valid = Shift.is_valid t.do = Shift.transition t.get_cost = Shift.cost elif move == REDUCE: t.is_valid = Reduce.is_valid t.do = Reduce.transition t.get_cost = Reduce.cost elif move == LEFT: t.is_valid = LeftArc.is_valid t.do = LeftArc.transition t.get_cost = LeftArc.cost elif move == RIGHT: t.is_valid = RightArc.is_valid t.do = RightArc.transition t.get_cost = RightArc.cost elif move == BREAK: t.is_valid = Break.is_valid t.do = Break.transition t.get_cost = Break.cost else: raise ValueError(Errors.E019.format(action=move, src='arc_eager')) return t def set_annotations(self, StateClass state, Doc doc): for arc in state.arcs: doc.c[arc["child"]].head = arc["head"] - arc["child"] doc.c[arc["child"]].dep = arc["label"] for i in range(doc.length): if doc.c[i].head == 0: doc.c[i].dep = self.root_label set_children_from_heads(doc.c, 0, doc.length) def get_beam_parses(self, Beam beam): parses = [] probs = beam.probs for i in range(beam.size): state = beam.at(i) if state.is_final(): prob = probs[i] parse = [] arcs = self.get_arcs(state) if arcs: for arc in arcs: dep = arc["label"] label = self.strings[dep] parse.append((arc["head"], arc["child"], label)) parses.append((prob, parse)) return parses cdef get_arcs(self, StateC* state): cdef vector[ArcC] arcs state.get_arcs(&arcs) return list(arcs) def has_gold(self, Example eg, start=0, end=None): for word in eg.y[start:end]: if word.dep != 0: return True else: return False cdef int set_valid(self, int* output, const StateC* st) nogil: cdef int[N_MOVES] is_valid is_valid[SHIFT] = Shift.is_valid(st, 0) is_valid[REDUCE] = Reduce.is_valid(st, 0) is_valid[LEFT] = LeftArc.is_valid(st, 0) is_valid[RIGHT] = RightArc.is_valid(st, 0) is_valid[BREAK] = Break.is_valid(st, 0) cdef int i for i in range(self.n_moves): if self.c[i].label == SUBTOK_LABEL: output[i] = self.c[i].is_valid(st, self.c[i].label) else: output[i] = is_valid[self.c[i].move] def get_cost(self, StateClass stcls, gold, int i): if not isinstance(gold, ArcEagerGold): raise TypeError(Errors.E909.format(name="ArcEagerGold")) cdef ArcEagerGold gold_ = gold gold_state = gold_.c n_gold = 0 if self.c[i].is_valid(stcls.c, self.c[i].label): cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label) else: cost = 9000 return cost cdef int set_costs(self, int* is_valid, weight_t* costs, const StateC* state, gold) except -1: if not isinstance(gold, ArcEagerGold): raise TypeError(Errors.E909.format(name="ArcEagerGold")) cdef ArcEagerGold gold_ = gold gold_state = gold_.c update_gold_state(&gold_state, state) self.set_valid(is_valid, state) cdef int n_gold = 0 for i in range(self.n_moves): if is_valid[i]: costs[i] = self.c[i].get_cost(state, &gold_state, self.c[i].label) if costs[i] <= 0: n_gold += 1 else: costs[i] = 9000 if n_gold < 1: for i in range(self.n_moves): print(self.get_class_name(i), is_valid[i], costs[i]) print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1))) raise ValueError(Errors.E1031) def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None): cdef int i cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 costs = mem.alloc(self.n_moves, sizeof(float)) is_valid = mem.alloc(self.n_moves, sizeof(int)) history = [] debug_log = [] failed = False while not state.is_final(): try: self.set_costs(is_valid, costs, state.c, gold) except ValueError: failed = True break min_cost = min(costs[i] for i in range(self.n_moves)) for i in range(self.n_moves): if is_valid[i] and costs[i] <= min_cost: action = self.c[i] history.append(i) s0 = state.S(0) b0 = state.B(0) if _debug: example = _debug debug_log.append(" ".join(( self.get_class_name(i), state.print_state() ))) action.do(state.c, action.label) break else: failed = False break if failed and _debug not in (False, None): example = _debug print("Actions") for i in range(self.n_moves): print(self.get_class_name(i)) print("Gold") for token in example.y: print(token.i, token.text, token.dep_, token.head.text) aligned_heads, aligned_labels = example.get_aligned_parse() print("Aligned heads") for i, head in enumerate(aligned_heads): print(example.x[i], example.x[head] if head is not None else "__") print("Aligned sent starts") print(example.get_aligned_sent_starts()) print("Predicted tokens") print([(w.i, w.text) for w in example.x]) s0 = state.S(0) b0 = state.B(0) debug_log.append(" ".join(( "?", "S0=", (example.x[s0].text if s0 >= 0 else "-"), "B0=", (example.x[b0].text if b0 >= 0 else "-"), "S0 head?", str(state.has_head(state.S(0))), ))) s0 = state.S(0) b0 = state.B(0) print("\n".join(debug_log)) print("Arc is gold B0, S0?", arc_is_gold(&gold.c, b0, s0)) print("Arc is gold S0, B0?", arc_is_gold(&gold.c, s0, b0)) print("is_head_unknown(s0)", is_head_unknown(&gold.c, s0)) print("is_head_unknown(b0)", is_head_unknown(&gold.c, b0)) print("b0", b0, "gold.heads[s0]", gold.c.heads[s0]) print("Stack", [example.x[i] for i in state.stack]) print("Buffer", [example.x[i] for i in state.queue]) raise ValueError(Errors.E024) return history