mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
0f01f46e02
* Replace all basestring references with unicode `basestring` was a compatability type introduced by Cython to make dealing with utf-8 strings in Python2 easier. In Python3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which shoudl also be equivalent. All tests pass locally. * Replace all references to unicode type with str Since we only support python3 this is simpler. * Remove all references to unicode type This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. In order to make this work importing `unicode_literals` had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). When `unicode` is used as a type in comments it was also edited to be `str`. Additionally `coding: utf8` headers were removed from a few files.
898 lines
30 KiB
Cython
898 lines
30 KiB
Cython
# cython: profile=True, cdivision=True, infer_types=True
|
|
from cymem.cymem cimport Pool, Address
|
|
from libc.stdint cimport int32_t
|
|
from libcpp.vector cimport vector
|
|
|
|
from collections import defaultdict, Counter
|
|
|
|
from ...typedefs cimport hash_t, attr_t
|
|
from ...strings cimport hash_string
|
|
from ...structs cimport TokenC
|
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
|
from ...tokens.token cimport MISSING_DEP
|
|
from ...training.example cimport Example
|
|
from .stateclass cimport StateClass
|
|
from ._state cimport StateC, ArcC
|
|
from ...errors import Errors
|
|
from thinc.extra.search cimport Beam
|
|
|
|
cdef weight_t MIN_SCORE = -90000
|
|
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
|
|
|
|
DEF NON_MONOTONIC = True
|
|
|
|
cdef enum:
|
|
SHIFT
|
|
REDUCE
|
|
LEFT
|
|
RIGHT
|
|
|
|
BREAK
|
|
|
|
N_MOVES
|
|
|
|
|
|
MOVE_NAMES = [None] * N_MOVES
|
|
MOVE_NAMES[SHIFT] = 'S'
|
|
MOVE_NAMES[REDUCE] = 'D'
|
|
MOVE_NAMES[LEFT] = 'L'
|
|
MOVE_NAMES[RIGHT] = 'R'
|
|
MOVE_NAMES[BREAK] = 'B'
|
|
|
|
|
|
cdef enum:
|
|
HEAD_IN_STACK = 0
|
|
HEAD_IN_BUFFER
|
|
HEAD_UNKNOWN
|
|
IS_SENT_START
|
|
SENT_START_UNKNOWN
|
|
|
|
|
|
cdef struct GoldParseStateC:
|
|
char* state_bits
|
|
int32_t* n_kids_in_buffer
|
|
int32_t* n_kids_in_stack
|
|
int32_t* heads
|
|
attr_t* labels
|
|
int32_t** kids
|
|
int32_t* n_kids
|
|
int32_t length
|
|
int32_t stride
|
|
weight_t push_cost
|
|
weight_t pop_cost
|
|
|
|
|
|
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
|
heads, labels, sent_starts) except *:
|
|
cdef GoldParseStateC gs
|
|
gs.length = len(heads)
|
|
gs.stride = 1
|
|
assert gs.length > 0
|
|
gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0]))
|
|
gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0]))
|
|
gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0]))
|
|
gs.state_bits = <char*>mem.alloc(gs.length, sizeof(gs.state_bits[0]))
|
|
gs.n_kids_in_buffer = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_buffer[0]))
|
|
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
|
|
|
for i, is_sent_start in enumerate(sent_starts):
|
|
if is_sent_start == True:
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
IS_SENT_START,
|
|
1
|
|
)
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
SENT_START_UNKNOWN,
|
|
0
|
|
)
|
|
|
|
elif is_sent_start is None:
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
SENT_START_UNKNOWN,
|
|
1
|
|
)
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
IS_SENT_START,
|
|
0
|
|
)
|
|
else:
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
SENT_START_UNKNOWN,
|
|
0
|
|
)
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
IS_SENT_START,
|
|
0
|
|
)
|
|
|
|
for i, (head, label) in enumerate(zip(heads, labels)):
|
|
if head is not None:
|
|
gs.heads[i] = head
|
|
gs.labels[i] = label
|
|
if i != head:
|
|
gs.n_kids[head] += 1
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
HEAD_UNKNOWN,
|
|
0
|
|
)
|
|
else:
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
HEAD_UNKNOWN,
|
|
1
|
|
)
|
|
# Make an array of pointers, pointing into the gs_kids_flat array.
|
|
assert gs.length > 0
|
|
gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*))
|
|
for i in range(gs.length):
|
|
if gs.n_kids[i] != 0:
|
|
gs.kids[i] = <int32_t*>mem.alloc(gs.n_kids[i], sizeof(int32_t))
|
|
# This is a temporary buffer
|
|
js_addr = Address(gs.length, sizeof(int32_t))
|
|
js = <int32_t*>js_addr.ptr
|
|
for i in range(gs.length):
|
|
if not is_head_unknown(&gs, i):
|
|
head = gs.heads[i]
|
|
if head != i:
|
|
gs.kids[head][js[head]] = i
|
|
js[head] += 1
|
|
gs.push_cost = push_cost(state, &gs)
|
|
gs.pop_cost = pop_cost(state, &gs)
|
|
return gs
|
|
|
|
|
|
cdef void update_gold_state(GoldParseStateC* gs, const StateC* s) nogil:
|
|
for i in range(gs.length):
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
HEAD_IN_BUFFER,
|
|
0
|
|
)
|
|
gs.state_bits[i] = set_state_flag(
|
|
gs.state_bits[i],
|
|
HEAD_IN_STACK,
|
|
0
|
|
)
|
|
gs.n_kids_in_stack[i] = 0
|
|
gs.n_kids_in_buffer[i] = 0
|
|
|
|
for i in range(s.stack_depth()):
|
|
s_i = s.S(i)
|
|
if not is_head_unknown(gs, s_i) and gs.heads[s_i] != s_i:
|
|
gs.n_kids_in_stack[gs.heads[s_i]] += 1
|
|
for kid in gs.kids[s_i][:gs.n_kids[s_i]]:
|
|
gs.state_bits[kid] = set_state_flag(
|
|
gs.state_bits[kid],
|
|
HEAD_IN_STACK,
|
|
1
|
|
)
|
|
for i in range(s.buffer_length()):
|
|
b_i = s.B(i)
|
|
if s.is_sent_start(b_i):
|
|
break
|
|
if not is_head_unknown(gs, b_i) and gs.heads[b_i] != b_i:
|
|
gs.n_kids_in_buffer[gs.heads[b_i]] += 1
|
|
for kid in gs.kids[b_i][:gs.n_kids[b_i]]:
|
|
gs.state_bits[kid] = set_state_flag(
|
|
gs.state_bits[kid],
|
|
HEAD_IN_BUFFER,
|
|
1
|
|
)
|
|
gs.push_cost = push_cost(s, gs)
|
|
gs.pop_cost = pop_cost(s, gs)
|
|
|
|
|
|
cdef class ArcEagerGold:
|
|
cdef GoldParseStateC c
|
|
cdef Pool mem
|
|
|
|
def __init__(self, ArcEager moves, StateClass stcls, Example example):
|
|
self.mem = Pool()
|
|
heads, labels = example.get_aligned_parse(projectivize=True)
|
|
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
|
|
sent_starts = _get_aligned_sent_starts(example)
|
|
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
|
|
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
|
|
|
|
def update(self, StateClass stcls):
|
|
update_gold_state(&self.c, stcls.c)
|
|
|
|
def _get_aligned_sent_starts(example):
|
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
|
If the reference has not sentence starts, return a list of None values.
|
|
|
|
This function is slightly different from the one on Example, because we also
|
|
check whether the reference sentences align across multiple sentences,
|
|
and return missing values if they do. This prevents a problem where you have
|
|
the start of a sentence merged onto a token that belongs to two sentences.
|
|
"""
|
|
if example.y.has_annotation("SENT_START"):
|
|
align = example.alignment.y2x
|
|
sent_starts = [False] * len(example.x)
|
|
seen_words = set()
|
|
for y_sent in example.y.sents:
|
|
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
|
|
if any(x_idx in seen_words for x_idx in x_indices):
|
|
# If there are any tokens in X that align across two sentences,
|
|
# regard the sentence annotations as missing, as we can't
|
|
# reliably use them.
|
|
return [None] * len(example.x)
|
|
seen_words.update(x_indices)
|
|
sent_starts[x_indices[0]] = True
|
|
return sent_starts
|
|
else:
|
|
return [None] * len(example.x)
|
|
|
|
|
|
cdef int check_state_gold(char state_bits, char flag) nogil:
|
|
cdef char one = 1
|
|
return 1 if (state_bits & (one << flag)) else 0
|
|
|
|
|
|
cdef int set_state_flag(char state_bits, char flag, int value) nogil:
|
|
cdef char one = 1
|
|
if value:
|
|
return state_bits | (one << flag)
|
|
else:
|
|
return state_bits & ~(one << flag)
|
|
|
|
|
|
cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil:
|
|
return check_state_gold(gold.state_bits[i], HEAD_IN_STACK)
|
|
|
|
|
|
cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil:
|
|
return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER)
|
|
|
|
|
|
cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil:
|
|
return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN)
|
|
|
|
cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil:
|
|
return check_state_gold(gold.state_bits[i], IS_SENT_START)
|
|
|
|
cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil:
|
|
return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN)
|
|
|
|
|
|
# Helper functions for the arc-eager oracle
|
|
|
|
cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) nogil:
|
|
cdef weight_t cost = 0
|
|
b0 = state.B(0)
|
|
if b0 < 0:
|
|
return 9000
|
|
if is_head_in_stack(gold, b0):
|
|
cost += 1
|
|
cost += gold.n_kids_in_stack[b0]
|
|
if Break.is_valid(state, 0) and is_sent_start(gold, state.B(1)):
|
|
cost += 1
|
|
return cost
|
|
|
|
|
|
cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) nogil:
|
|
cdef weight_t cost = 0
|
|
s0 = state.S(0)
|
|
if s0 < 0:
|
|
return 9000
|
|
if is_head_in_buffer(gold, s0):
|
|
cost += 1
|
|
cost += gold.n_kids_in_buffer[s0]
|
|
return cost
|
|
|
|
|
|
cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil:
|
|
if is_head_unknown(gold, child):
|
|
return True
|
|
elif gold.heads[child] == head:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
|
|
cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) nogil:
|
|
if is_head_unknown(gold, child):
|
|
return True
|
|
elif label == 0:
|
|
return True
|
|
elif gold.labels[child] == label:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
|
|
cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil:
|
|
return gold.heads[word] == word or is_head_unknown(gold, word)
|
|
|
|
|
|
cdef class Shift:
|
|
"""Move the first word of the buffer onto the stack and mark it as "shifted"
|
|
|
|
Validity:
|
|
* If stack is empty
|
|
* At least two words in sentence
|
|
* Word has not been shifted before
|
|
|
|
Cost: push_cost
|
|
|
|
Action:
|
|
* Mark B[0] as 'shifted'
|
|
* Push stack
|
|
* Advance buffer
|
|
"""
|
|
@staticmethod
|
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
|
if st.stack_depth() == 0:
|
|
return 1
|
|
elif st.buffer_length() < 2:
|
|
return 0
|
|
elif st.is_sent_start(st.B(0)):
|
|
return 0
|
|
elif st.is_unshiftable(st.B(0)):
|
|
return 0
|
|
else:
|
|
return 1
|
|
|
|
@staticmethod
|
|
cdef int transition(StateC* st, attr_t label) nogil:
|
|
st.push()
|
|
|
|
@staticmethod
|
|
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
|
|
gold = <const GoldParseStateC*>_gold
|
|
return gold.push_cost
|
|
|
|
|
|
cdef class Reduce:
|
|
"""
|
|
Pop from the stack. If it has no head and the stack isn't empty, place
|
|
it back on the buffer.
|
|
|
|
Validity:
|
|
* Stack not empty
|
|
* Buffer nt empty
|
|
* Stack depth 1 and cannot sent start l_edge(st.B(0))
|
|
|
|
Cost:
|
|
* If B[0] is the start of a sentence, cost is 0
|
|
* Arcs between stack and buffer
|
|
* If arc has no head, we're saving arcs between S[0] and S[1:], so decrement
|
|
cost by those arcs.
|
|
"""
|
|
@staticmethod
|
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
|
if st.stack_depth() == 0:
|
|
return False
|
|
elif st.buffer_length() == 0:
|
|
return True
|
|
elif st.stack_depth() == 1 and st.cannot_sent_start(st.l_edge(st.B(0))):
|
|
return False
|
|
else:
|
|
return True
|
|
|
|
@staticmethod
|
|
cdef int transition(StateC* st, attr_t label) nogil:
|
|
if st.has_head(st.S(0)) or st.stack_depth() == 1:
|
|
st.pop()
|
|
else:
|
|
st.unshift()
|
|
|
|
@staticmethod
|
|
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
|
|
gold = <const GoldParseStateC*>_gold
|
|
if state.is_sent_start(state.B(0)):
|
|
return 0
|
|
s0 = state.S(0)
|
|
cost = gold.pop_cost
|
|
if not state.has_head(s0):
|
|
# Decrement cost for the arcs we save, as we'll be putting this
|
|
# back to the buffer
|
|
if is_head_in_stack(gold, s0):
|
|
cost -= 1
|
|
cost -= gold.n_kids_in_stack[s0]
|
|
return cost
|
|
|
|
|
|
cdef class LeftArc:
|
|
"""Add an arc between B[0] and S[0], replacing the previous head of S[0] if
|
|
one is set. Pop S[0] from the stack.
|
|
|
|
Validity:
|
|
* len(S) >= 1
|
|
* len(B) >= 1
|
|
* not is_sent_start(B[0])
|
|
|
|
Cost:
|
|
pop_cost - Arc(B[0], S[0], label) + (Arc(S[1], S[0]) if H(S[0]) else Arcs(S, S[0]))
|
|
"""
|
|
@staticmethod
|
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
|
if st.stack_depth() == 0:
|
|
return 0
|
|
elif st.buffer_length() == 0:
|
|
return 0
|
|
elif st.is_sent_start(st.B(0)):
|
|
return 0
|
|
elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1):
|
|
return 0
|
|
else:
|
|
return 1
|
|
|
|
@staticmethod
|
|
cdef int transition(StateC* st, attr_t label) nogil:
|
|
st.add_arc(st.B(0), st.S(0), label)
|
|
# If we change the stack, it's okay to remove the shifted mark, as
|
|
# we can't get in an infinite loop this way.
|
|
st.set_reshiftable(st.B(0))
|
|
st.pop()
|
|
|
|
@staticmethod
|
|
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
|
|
gold = <const GoldParseStateC*>_gold
|
|
cdef weight_t cost = gold.pop_cost
|
|
s0 = state.S(0)
|
|
s1 = state.S(1)
|
|
b0 = state.B(0)
|
|
if state.has_head(s0):
|
|
# Increment cost if we're clobbering a correct arc
|
|
cost += gold.heads[s0] == s1
|
|
else:
|
|
# If there's no head, we're losing arcs between S0 and S[1:].
|
|
cost += is_head_in_stack(gold, s0)
|
|
cost += gold.n_kids_in_stack[s0]
|
|
if b0 != -1 and s0 != -1 and gold.heads[s0] == b0:
|
|
cost -= 1
|
|
cost += not label_is_gold(gold, s0, label)
|
|
return cost
|
|
|
|
|
|
cdef class RightArc:
|
|
"""
|
|
Add an arc from S[0] to B[0]. Push B[0].
|
|
|
|
Validity:
|
|
* len(S) >= 1
|
|
* len(B) >= 1
|
|
* not is_sent_start(B[0])
|
|
|
|
Cost:
|
|
push_cost + (not shifted[b0] and Arc(B[1:], B[0])) - Arc(S[0], B[0], label)
|
|
"""
|
|
@staticmethod
|
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
|
if st.stack_depth() == 0:
|
|
return 0
|
|
elif st.buffer_length() == 0:
|
|
return 0
|
|
elif st.is_sent_start(st.B(0)):
|
|
return 0
|
|
elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1):
|
|
# If there's (perhaps partial) parse pre-set, don't allow cycle.
|
|
return 0
|
|
else:
|
|
return 1
|
|
|
|
@staticmethod
|
|
cdef int transition(StateC* st, attr_t label) nogil:
|
|
st.add_arc(st.S(0), st.B(0), label)
|
|
st.push()
|
|
|
|
@staticmethod
|
|
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
|
|
gold = <const GoldParseStateC*>_gold
|
|
cost = gold.push_cost
|
|
s0 = state.S(0)
|
|
b0 = state.B(0)
|
|
if s0 != -1 and b0 != -1 and gold.heads[b0] == s0:
|
|
cost -= 1
|
|
cost += not label_is_gold(gold, b0, label)
|
|
elif is_head_in_buffer(gold, b0) and not state.is_unshiftable(b0):
|
|
cost += 1
|
|
return cost
|
|
|
|
|
|
cdef class Break:
|
|
"""Mark the second word of the buffer as the start of a
|
|
sentence.
|
|
|
|
Validity:
|
|
* len(buffer) >= 2
|
|
* B[1] == B[0] + 1
|
|
* not is_sent_start(B[1])
|
|
* not cannot_sent_start(B[1])
|
|
|
|
Action:
|
|
* mark_sent_start(B[1])
|
|
|
|
Cost:
|
|
* not is_sent_start(B[1])
|
|
* Arcs between B[0] and B[1:]
|
|
* Arcs between S and B[1]
|
|
"""
|
|
@staticmethod
|
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
|
cdef int i
|
|
if st.buffer_length() < 2:
|
|
return False
|
|
elif st.B(1) != st.B(0) + 1:
|
|
return False
|
|
elif st.is_sent_start(st.B(1)):
|
|
return False
|
|
elif st.cannot_sent_start(st.B(1)):
|
|
return False
|
|
else:
|
|
return True
|
|
|
|
@staticmethod
|
|
cdef int transition(StateC* st, attr_t label) nogil:
|
|
st.set_sent_start(st.B(1), 1)
|
|
|
|
@staticmethod
|
|
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
|
|
gold = <const GoldParseStateC*>_gold
|
|
cdef int b0 = state.B(0)
|
|
cdef int cost = 0
|
|
cdef int si
|
|
for i in range(state.stack_depth()):
|
|
si = state.S(i)
|
|
if is_head_in_buffer(gold, si):
|
|
cost += 1
|
|
cost += gold.n_kids_in_buffer[si]
|
|
# We need to score into B[1:], so subtract deps that are at b0
|
|
if gold.heads[b0] == si:
|
|
cost -= 1
|
|
if gold.heads[si] == b0:
|
|
cost -= 1
|
|
if not is_sent_start(gold, state.B(1)) \
|
|
and not is_sent_start_unknown(gold, state.B(1)):
|
|
cost += 1
|
|
return cost
|
|
|
|
|
|
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|
st = new StateC(<const TokenC*>tokens, length)
|
|
return <void*>st
|
|
|
|
|
|
cdef int _del_state(Pool mem, void* state, void* x) except -1:
|
|
cdef StateC* st = <StateC*>state
|
|
del st
|
|
|
|
|
|
cdef class ArcEager(TransitionSystem):
|
|
def __init__(self, *args, **kwargs):
|
|
TransitionSystem.__init__(self, *args, **kwargs)
|
|
self.init_beam_state = _init_state
|
|
self.del_beam_state = _del_state
|
|
|
|
@classmethod
|
|
def get_actions(cls, **kwargs):
|
|
min_freq = kwargs.get('min_freq', None)
|
|
actions = defaultdict(lambda: Counter())
|
|
actions[SHIFT][''] = 1
|
|
actions[REDUCE][''] = 1
|
|
for label in kwargs.get('left_labels', []):
|
|
actions[LEFT][label] = 1
|
|
actions[SHIFT][label] = 1
|
|
for label in kwargs.get('right_labels', []):
|
|
actions[RIGHT][label] = 1
|
|
actions[REDUCE][label] = 1
|
|
for example in kwargs.get('examples', []):
|
|
heads, labels = example.get_aligned_parse(projectivize=True)
|
|
for child, (head, label) in enumerate(zip(heads, labels)):
|
|
if head is None or label is None:
|
|
continue
|
|
if label.upper() == 'ROOT' :
|
|
label = 'ROOT'
|
|
if head == child:
|
|
actions[BREAK][label] += 1
|
|
if head < child:
|
|
actions[RIGHT][label] += 1
|
|
actions[REDUCE][''] += 1
|
|
elif head > child:
|
|
actions[LEFT][label] += 1
|
|
actions[SHIFT][''] += 1
|
|
if min_freq is not None:
|
|
for action, label_freqs in actions.items():
|
|
for label, freq in list(label_freqs.items()):
|
|
if freq < min_freq:
|
|
label_freqs.pop(label)
|
|
# Ensure these actions are present
|
|
actions[BREAK].setdefault('ROOT', 0)
|
|
if kwargs.get("learn_tokens") is True:
|
|
actions[RIGHT].setdefault('subtok', 0)
|
|
actions[LEFT].setdefault('subtok', 0)
|
|
# Used for backoff
|
|
actions[RIGHT].setdefault('dep', 0)
|
|
actions[LEFT].setdefault('dep', 0)
|
|
return actions
|
|
|
|
@property
|
|
def builtin_labels(self):
|
|
return ["ROOT", "dep"]
|
|
|
|
@property
|
|
def action_types(self):
|
|
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
|
|
|
|
def get_doc_labels(self, doc):
|
|
"""Get the labels required for a given Doc."""
|
|
labels = set(self.builtin_labels)
|
|
for token in doc:
|
|
if token.dep_:
|
|
labels.add(token.dep_)
|
|
return labels
|
|
|
|
def transition(self, StateClass state, action):
|
|
cdef Transition t = self.lookup_transition(action)
|
|
t.do(state.c, t.label)
|
|
return state
|
|
|
|
def is_gold_parse(self, StateClass state, ArcEagerGold gold):
|
|
for i in range(state.c.length):
|
|
token = state.c.safe_get(i)
|
|
if not arc_is_gold(&gold.c, i, i+token.head):
|
|
return False
|
|
elif not label_is_gold(&gold.c, i, token.dep):
|
|
return False
|
|
return True
|
|
|
|
def init_gold(self, StateClass state, Example example):
|
|
gold = ArcEagerGold(self, state, example)
|
|
self._replace_unseen_labels(gold)
|
|
return gold
|
|
|
|
def init_gold_batch(self, examples):
|
|
# TODO: Projectivity?
|
|
all_states = self.init_batch([eg.predicted for eg in examples])
|
|
golds = []
|
|
states = []
|
|
for state, eg in zip(all_states, examples):
|
|
if self.has_gold(eg) and not state.is_final():
|
|
golds.append(self.init_gold(state, eg))
|
|
states.append(state)
|
|
n_steps = sum([len(s.queue) for s in states])
|
|
return states, golds, n_steps
|
|
|
|
def _replace_unseen_labels(self, ArcEagerGold gold):
|
|
backoff_label = self.strings["dep"]
|
|
root_label = self.strings["ROOT"]
|
|
left_labels = self.labels[LEFT]
|
|
right_labels = self.labels[RIGHT]
|
|
break_labels = self.labels[BREAK]
|
|
for i in range(gold.c.length):
|
|
if not is_head_unknown(&gold.c, i):
|
|
head = gold.c.heads[i]
|
|
label = self.strings[gold.c.labels[i]]
|
|
if head > i and label not in left_labels:
|
|
gold.c.labels[i] = backoff_label
|
|
elif head < i and label not in right_labels:
|
|
gold.c.labels[i] = backoff_label
|
|
elif head == i and label not in break_labels:
|
|
gold.c.labels[i] = root_label
|
|
return gold
|
|
|
|
cdef Transition lookup_transition(self, object name_or_id) except *:
|
|
if isinstance(name_or_id, int):
|
|
return self.c[name_or_id]
|
|
name = name_or_id
|
|
if '-' in name:
|
|
move_str, label_str = name.split('-', 1)
|
|
label = self.strings[label_str]
|
|
else:
|
|
move_str = name
|
|
label = 0
|
|
move = MOVE_NAMES.index(move_str)
|
|
for i in range(self.n_moves):
|
|
if self.c[i].move == move and self.c[i].label == label:
|
|
return self.c[i]
|
|
raise KeyError(f"Unknown transition: {name}")
|
|
|
|
def move_name(self, int move, attr_t label):
|
|
label_str = self.strings[label]
|
|
if label_str:
|
|
return MOVE_NAMES[move] + '-' + label_str
|
|
else:
|
|
return MOVE_NAMES[move]
|
|
|
|
def class_name(self, int i):
|
|
return self.move_name(self.c[i].move, self.c[i].label)
|
|
|
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
|
# constructor with the function pointers
|
|
cdef Transition t
|
|
t.score = 0
|
|
t.clas = clas
|
|
t.move = move
|
|
t.label = label
|
|
if move == SHIFT:
|
|
t.is_valid = Shift.is_valid
|
|
t.do = Shift.transition
|
|
t.get_cost = Shift.cost
|
|
elif move == REDUCE:
|
|
t.is_valid = Reduce.is_valid
|
|
t.do = Reduce.transition
|
|
t.get_cost = Reduce.cost
|
|
elif move == LEFT:
|
|
t.is_valid = LeftArc.is_valid
|
|
t.do = LeftArc.transition
|
|
t.get_cost = LeftArc.cost
|
|
elif move == RIGHT:
|
|
t.is_valid = RightArc.is_valid
|
|
t.do = RightArc.transition
|
|
t.get_cost = RightArc.cost
|
|
elif move == BREAK:
|
|
t.is_valid = Break.is_valid
|
|
t.do = Break.transition
|
|
t.get_cost = Break.cost
|
|
else:
|
|
raise ValueError(Errors.E019.format(action=move, src='arc_eager'))
|
|
return t
|
|
|
|
def set_annotations(self, StateClass state, Doc doc):
|
|
for arc in state.arcs:
|
|
doc.c[arc["child"]].head = arc["head"] - arc["child"]
|
|
doc.c[arc["child"]].dep = arc["label"]
|
|
for i in range(doc.length):
|
|
if doc.c[i].head == 0:
|
|
doc.c[i].dep = self.root_label
|
|
set_children_from_heads(doc.c, 0, doc.length)
|
|
|
|
def get_beam_parses(self, Beam beam):
|
|
parses = []
|
|
probs = beam.probs
|
|
for i in range(beam.size):
|
|
state = <StateC*>beam.at(i)
|
|
if state.is_final():
|
|
prob = probs[i]
|
|
parse = []
|
|
arcs = self.get_arcs(state)
|
|
if arcs:
|
|
for arc in arcs:
|
|
dep = arc["label"]
|
|
label = self.strings[dep]
|
|
parse.append((arc["head"], arc["child"], label))
|
|
parses.append((prob, parse))
|
|
return parses
|
|
|
|
cdef get_arcs(self, StateC* state):
|
|
cdef vector[ArcC] arcs
|
|
state.get_arcs(&arcs)
|
|
return list(arcs)
|
|
|
|
def has_gold(self, Example eg, start=0, end=None):
|
|
for word in eg.y[start:end]:
|
|
if word.dep != 0:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
cdef int set_valid(self, int* output, const StateC* st) nogil:
|
|
cdef int[N_MOVES] is_valid
|
|
is_valid[SHIFT] = Shift.is_valid(st, 0)
|
|
is_valid[REDUCE] = Reduce.is_valid(st, 0)
|
|
is_valid[LEFT] = LeftArc.is_valid(st, 0)
|
|
is_valid[RIGHT] = RightArc.is_valid(st, 0)
|
|
is_valid[BREAK] = Break.is_valid(st, 0)
|
|
cdef int i
|
|
for i in range(self.n_moves):
|
|
if self.c[i].label == SUBTOK_LABEL:
|
|
output[i] = self.c[i].is_valid(st, self.c[i].label)
|
|
else:
|
|
output[i] = is_valid[self.c[i].move]
|
|
|
|
def get_cost(self, StateClass stcls, gold, int i):
|
|
if not isinstance(gold, ArcEagerGold):
|
|
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
|
cdef ArcEagerGold gold_ = gold
|
|
gold_state = gold_.c
|
|
n_gold = 0
|
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
|
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
|
else:
|
|
cost = 9000
|
|
return cost
|
|
|
|
cdef int set_costs(self, int* is_valid, weight_t* costs,
|
|
const StateC* state, gold) except -1:
|
|
if not isinstance(gold, ArcEagerGold):
|
|
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
|
cdef ArcEagerGold gold_ = gold
|
|
gold_state = gold_.c
|
|
update_gold_state(&gold_state, state)
|
|
self.set_valid(is_valid, state)
|
|
cdef int n_gold = 0
|
|
for i in range(self.n_moves):
|
|
if is_valid[i]:
|
|
costs[i] = self.c[i].get_cost(state, &gold_state, self.c[i].label)
|
|
if costs[i] <= 0:
|
|
n_gold += 1
|
|
else:
|
|
costs[i] = 9000
|
|
if n_gold < 1:
|
|
for i in range(self.n_moves):
|
|
print(self.get_class_name(i), is_valid[i], costs[i])
|
|
print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1)))
|
|
raise ValueError("Could not find gold transition - see logs above.")
|
|
|
|
def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None):
|
|
cdef int i
|
|
cdef Pool mem = Pool()
|
|
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
|
assert self.n_moves > 0
|
|
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
|
|
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
|
|
|
|
history = []
|
|
debug_log = []
|
|
failed = False
|
|
while not state.is_final():
|
|
try:
|
|
self.set_costs(is_valid, costs, state.c, gold)
|
|
except ValueError:
|
|
failed = True
|
|
break
|
|
min_cost = min(costs[i] for i in range(self.n_moves))
|
|
for i in range(self.n_moves):
|
|
if is_valid[i] and costs[i] <= min_cost:
|
|
action = self.c[i]
|
|
history.append(i)
|
|
s0 = state.S(0)
|
|
b0 = state.B(0)
|
|
if _debug:
|
|
example = _debug
|
|
debug_log.append(" ".join((
|
|
self.get_class_name(i),
|
|
state.print_state()
|
|
)))
|
|
action.do(state.c, action.label)
|
|
break
|
|
else:
|
|
failed = False
|
|
break
|
|
if failed and _debug not in (False, None):
|
|
example = _debug
|
|
print("Actions")
|
|
for i in range(self.n_moves):
|
|
print(self.get_class_name(i))
|
|
print("Gold")
|
|
for token in example.y:
|
|
print(token.i, token.text, token.dep_, token.head.text)
|
|
aligned_heads, aligned_labels = example.get_aligned_parse()
|
|
print("Aligned heads")
|
|
for i, head in enumerate(aligned_heads):
|
|
print(example.x[i], example.x[head] if head is not None else "__")
|
|
print("Aligned sent starts")
|
|
print(example.get_aligned_sent_starts())
|
|
|
|
print("Predicted tokens")
|
|
print([(w.i, w.text) for w in example.x])
|
|
s0 = state.S(0)
|
|
b0 = state.B(0)
|
|
debug_log.append(" ".join((
|
|
"?",
|
|
"S0=", (example.x[s0].text if s0 >= 0 else "-"),
|
|
"B0=", (example.x[b0].text if b0 >= 0 else "-"),
|
|
"S0 head?", str(state.has_head(state.S(0))),
|
|
)))
|
|
s0 = state.S(0)
|
|
b0 = state.B(0)
|
|
print("\n".join(debug_log))
|
|
print("Arc is gold B0, S0?", arc_is_gold(&gold.c, b0, s0))
|
|
print("Arc is gold S0, B0?", arc_is_gold(&gold.c, s0, b0))
|
|
print("is_head_unknown(s0)", is_head_unknown(&gold.c, s0))
|
|
print("is_head_unknown(b0)", is_head_unknown(&gold.c, b0))
|
|
print("b0", b0, "gold.heads[s0]", gold.c.heads[s0])
|
|
print("Stack", [example.x[i] for i in state.stack])
|
|
print("Buffer", [example.x[i] for i in state.queue])
|
|
raise ValueError(Errors.E024)
|
|
return history
|