From c7e3dfc1dc581a0dbe3cfc7adbd48d8df7c7894d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2015 14:49:04 +0200 Subject: [PATCH 01/75] * Don't automatically push words when stack is empty, as it messes up beam parsing. Add hash method to beam state. --- spacy/syntax/_state.pyx | 87 +++++++++++++++++++++++-------- spacy/syntax/arc_eager.pyx | 104 ++++++++++++++++++------------------- spacy/syntax/parser.pyx | 31 ++++++++--- 3 files changed, 142 insertions(+), 80 deletions(-) diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 3e28a6cd4..3a876df2e 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -61,8 +61,8 @@ cdef int pop_stack(State *s) except -1: assert s.stack_len >= 1 s.stack_len -= 1 s.stack -= 1 - if s.stack_len == 0 and not at_eol(s): - push_stack(s) + #if s.stack_len == 0 and not at_eol(s): + # push_stack(s) cdef int push_stack(State *s) except -1: @@ -114,27 +114,29 @@ cdef bint has_head(const TokenC* t) nogil: cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil: - cdef uint32_t kids = head.l_kids - if kids == 0: - return NULL - cdef int offset = _nth_significant_bit(kids, idx) - cdef const TokenC* child = head - offset - if child >= s.sent: - return child - else: - return NULL + return _new_get_left(s, head, idx) + #cdef uint32_t kids = head.l_kids + #if kids == 0: + # return NULL + #cdef int offset = _nth_significant_bit(kids, idx) + #cdef const TokenC* child = head - offset + #if child >= s.sent: + # return child + ##else: + # return NULL cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: - cdef uint32_t kids = head.r_kids - if kids == 0: - return NULL - cdef int offset = _nth_significant_bit(kids, idx) - cdef const TokenC* child = head + offset - if child < (s.sent + s.sent_len): - return child - else: - return NULL + return _new_get_right(s, head, idx) + #cdef uint32_t kids = head.r_kids + #if kids == 0: + # return NULL + #cdef int offset = _nth_significant_bit(kids, idx) + #cdef const TokenC* child = head + offset + #if child < (s.sent + s.sent_len): + # return child + #else: + # return NULL cdef int count_left_kids(const TokenC* head) nogil: @@ -190,7 +192,7 @@ cdef int copy_state(State* dest, const State* src) except -1: # From https://en.wikipedia.org/wiki/Hamming_weight cdef inline uint32_t _popcount(uint32_t x) nogil: """Find number of non-zero bits.""" - cdef int count = 0 + cdef uint32_t count = 0 while x != 0: x &= x - 1 count += 1 @@ -198,10 +200,51 @@ cdef inline uint32_t _popcount(uint32_t x) nogil: cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: - cdef int i + cdef uint32_t i for i in range(32): if bits & (1 << i): n -= 1 if n < 1: return i return 0 + + +cdef const TokenC* _new_get_left(const State* s, const TokenC* target, int idx) nogil: + if idx < 1: + return NULL + cdef const TokenC* ptr = s.sent + while ptr < target: + # If this head is still to the right of us, we can skip to it + # No token that's between this token and this head could be our + # child. + if (ptr.head >= 1) and (ptr + ptr.head) < target: + ptr += ptr.head + + elif ptr + ptr.head == target: + idx -= 1 + if idx == 0: + return ptr + ptr += 1 + else: + ptr += 1 + return NULL + + +cdef const TokenC* _new_get_right(const State* s, const TokenC* target, int idx) nogil: + if idx < 1: + return NULL + cdef const TokenC* ptr = s.sent + (s.sent_len - 1) + while ptr > target: + # If this head is still to the right of us, we can skip to it + # No token that's between this token and this head could be our + # child. + if (ptr.head < 0) and ((ptr + ptr.head) > target): + ptr += ptr.head + elif ptr + ptr.head == target: + idx -= 1 + if idx == 0: + return ptr + ptr -= 1 + else: + ptr -= 1 + return NULL diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 855535f4e..afa05bd9a 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -55,6 +55,8 @@ cdef int push_cost(const State* st, const GoldParseC* gold, int target) except - cdef int cost = 0 cost += head_in_stack(st, target, gold.heads) cost += children_in_stack(st, target, gold.heads) + # If we can Break, we shouldn't push + cost += Break.is_valid(st, -1) and Break.move_cost(st, gold) == 0 return cost @@ -65,15 +67,42 @@ cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1 return cost -cdef int arc_cost(const GoldParseC* gold, int head, int child, int label) except -1: - if gold.heads[child] != head: +cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) except -1: + if arc_is_gold(gold, head, child): return 0 - elif gold.labels[child] == -1: - return 0 - elif gold.labels[child] == label: - return 0 - else: + elif (child + st.sent[child].head) == gold.heads[child]: return 1 + elif gold.heads[child] >= st.i: + return 1 + else: + return 0 + + + +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: + if gold.labels[child] == -1: + return True + elif _is_gold_root(gold, head) and _is_gold_root(gold, child): + return True + elif gold.heads[child] == head: + return True + else: + return False + + +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) except -1: + if gold.labels[child] == -1: + return True + elif label == -1: + return True + elif gold.labels[child] == label: + return True + else: + return False + + +cdef bint _is_gold_root(const GoldParseC* gold, int word) except -1: + return gold.labels[word] == -1 or gold.heads[word] == word cdef class Shift: @@ -96,11 +125,7 @@ cdef class Shift: @staticmethod cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - cdef int cost = push_cost(s, gold, s.i) - # If we can break, and there's no cost to doing so, we should - if Break.is_valid(s, -1) and Break.cost(s, gold, -1) == 0: - cost += 1 - return cost + return push_cost(s, gold, s.i) @staticmethod cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: @@ -117,7 +142,7 @@ cdef class Reduce: @staticmethod cdef int transition(State* state, int label) except -1: - if NON_MONOTONIC and not has_head(get_s0(state)): + if NON_MONOTONIC and not has_head(get_s0(state)) and state.stack_len >= 2: add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep) pop_stack(state) @@ -139,7 +164,6 @@ cdef class Reduce: return 0 - cdef class LeftArc: @staticmethod cdef bint is_valid(const State* s, int label) except -1: @@ -167,31 +191,14 @@ cdef class LeftArc: cdef int move_cost(const State* s, const GoldParseC* gold) except -1: if not LeftArc.is_valid(s, -1): return 9000 - cdef int cost = 0 - if gold.heads[s.stack[0]] == s.i: - return cost - elif at_eol(s): - # Are we root? - if gold.labels[s.stack[0]] != -1: - # If we're at EOL, prefer to reduce or break over left-arc - if Reduce.is_valid(s, -1) or Break.is_valid(s, -1): - cost += gold.heads[s.stack[0]] != s.stack[0] - return cost - cost += head_in_buffer(s, s.stack[0], gold.heads) - cost += children_in_buffer(s, s.stack[0], gold.heads) - if NON_MONOTONIC and s.stack_len >= 2: - cost += gold.heads[s.stack[0]] == s.stack[-1] - if gold.labels[s.stack[0]] != -1: - cost += gold.heads[s.stack[0]] == s.stack[0] - return cost + elif arc_is_gold(gold, s.i, s.stack[0]): + return 0 + else: + return pop_cost(s, gold, s.stack[0]) + arc_cost(s, gold, s.i, s.stack[0]) @staticmethod cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - if label == -1 or gold.labels[s.stack[0]] == -1: - return 0 - if gold.heads[s.stack[0]] == s.i and label != gold.labels[s.stack[0]]: - return 1 - return 0 + return arc_is_gold(gold, s.i, s.stack[0]) and not label_is_gold(gold, s.i, s.stack[0], label) cdef class RightArc: @@ -212,21 +219,14 @@ cdef class RightArc: @staticmethod cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - return push_cost(s, gold, s.i) - (gold.heads[s.i] == s.stack[0]) + if arc_is_gold(gold, s.stack[0], s.i): + return 0 + else: + return push_cost(s, gold, s.i) + arc_cost(s, gold, s.stack[0], s.i) @staticmethod cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - return arc_cost(gold, s.stack[0], s.i, label) - #cdef int cost = 0 - #if gold.heads[s.i] == s.stack[0]: - # cost += label != -1 and label != gold.labels[s.i] - # return cost - # This indicates missing head - #if gold.labels[s.i] != -1: - # cost += head_in_buffer(s, s.i, gold.heads) - #cost += children_in_stack(s, s.i, gold.heads) - #cost += head_in_stack(s, s.i, gold.heads) - #return cost + return arc_is_gold(gold, s.stack[0], s.i) and not label_is_gold(gold, s.stack[0], s.i, label) cdef class Break: @@ -237,8 +237,10 @@ cdef class Break: return False elif at_eol(s): return False - #elif NON_MONOTONIC: - # return True + elif s.stack_len < 1: + return False + elif NON_MONOTONIC: + return True else: # In the Break transition paper, they have this constraint that prevents # Break if stack is disconnected. But, if we're doing non-monotonic parsing, @@ -262,8 +264,6 @@ cdef class Break: get_s0(state).dep = label state.stack -= 1 state.stack_len -= 1 - if not at_eol(state): - push_stack(state) @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 639f91c03..47921563b 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -14,7 +14,7 @@ import json from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t +from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from util import Config @@ -34,7 +34,7 @@ from ..strings cimport StringStore from .arc_eager cimport TransitionSystem, Transition from .transition_system import OracleError -from ._state cimport State, new_state, copy_state, is_final, push_stack +from ._state cimport State, new_state, copy_state, is_final, push_stack, get_left, get_n0 from ..gold cimport GoldParse from . import _parse_features @@ -83,14 +83,14 @@ cdef class Parser: def __call__(self, Tokens tokens): if tokens.length == 0: return 0 - if self.cfg.get('beam_width', 1) <= 1: + if self.cfg.get('beam_width', 1) < 1: self._greedy_parse(tokens) else: self._beam_parse(tokens) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width <= 1: + if self.cfg.beam_width < 1: return self._greedy_train(tokens, gold) else: return self._beam_train(tokens, gold) @@ -185,8 +185,7 @@ cdef class Parser: if follow_gold: for j in range(self.moves.n_moves): beam.is_valid[i][j] *= beam.costs[i][j] == 0 - beam.advance(_transition_state, self.moves.c) - state = beam.at(0) + beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): @@ -222,3 +221,23 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef int _check_final_state(void* state, void* extra_args) except -1: return is_final(state) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + cdef atom_t[10] rep + + rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + rep[3] = state.i + rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + if get_left(state, get_n0(state), 1) != NULL: + rep[8] = get_left(state, get_n0(state), 1).dep + else: + rep[8] = 0 + rep[9] = state.sent[state.i].l_kids + return hash64(rep, sizeof(atom_t) * 10, 0) From bd4f5f89cb6fb125d26fe5a3cd1259a150f79a15 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2015 16:17:07 +0200 Subject: [PATCH 02/75] * Add note about failed tokenization --- tests/tokenizer/test_tokenizer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/tokenizer/test_tokenizer.py b/tests/tokenizer/test_tokenizer.py index ed2bfddf2..abf09dd03 100644 --- a/tests/tokenizer/test_tokenizer.py +++ b/tests/tokenizer/test_tokenizer.py @@ -103,10 +103,12 @@ def test_cnts5(en_tokenizer): tokens = en_tokenizer(text) assert len(tokens) == 11 -def test_mr(en_tokenizer): - text = """Mr. Smith""" - tokens = en_tokenizer(text) - assert len(tokens) == 2 +# TODO: This is currently difficult --- infix interferes here. +#def test_mr(en_tokenizer): +# text = """Today is Tuesday.Mr.""" +# tokens = en_tokenizer(text) +# assert len(tokens) == 5 +# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.'] def test_cnts6(en_tokenizer): From ba10fd8af5442168eba704967b09c039ff2110f3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2015 01:39:54 +0200 Subject: [PATCH 03/75] * Add StateClass, to replace/refactor the mess in _state --- spacy/syntax/stateclass.pxd | 93 ++++++++++++++++++++++++++ spacy/syntax/stateclass.pyx | 126 ++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 spacy/syntax/stateclass.pxd create mode 100644 spacy/syntax/stateclass.pyx diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd new file mode 100644 index 000000000..63e22cac5 --- /dev/null +++ b/spacy/syntax/stateclass.pxd @@ -0,0 +1,93 @@ +from libc.string cimport memcpy, memset + +from cymem.cymem cimport Pool + +from structs cimport TokenC + +from .syntax._state cimport State + +from .vocab cimport EMPTY_LEXEME + + +cdef TokenC EMPTY_TOKEN + + +cdef class StateClass: + cdef Pool mem + cdef int* _stack + cdef int* _buffer + cdef TokenC* _sent + cdef int length + cdef int _s_i + cdef int _b_i + + @staticmethod + cdef inline StateClass init(const TokenC* sent, int length): + cdef StateClass self = StateClass(length) + memcpy(self._sent, sent, sizeof(TokenC*) * length) + return self + + @staticmethod + cdef inline StateClass from_struct(Pool mem, const State* state): + cdef StateClass self = StateClass.init(state.sent, state.sent_len) + memcpy(self._stack, state.stack - state.stack_len, sizeof(int) * state.stack_len) + self._s_i = state.stack_len - 1 + self._b_i = state.i + return self + + cdef inline const TokenC* S_(self, int i) nogil: + return self.safe_get(self.S(i)) + + cdef inline const TokenC* B_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef inline const TokenC* H_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef inline const TokenC* L_(self, int i, int idx) nogil: + return self.safe_get(self.L(i, idx)) + + cdef inline const TokenC* R_(self, int i, int idx) nogil: + return self.safe_get(self.R(i, idx)) + + cdef inline const TokenC* safe_get(self, int i) nogil: + if 0 >= i >= self.length: + return &EMPTY_TOKEN + else: + return self._sent + + cdef int S(self, int i) nogil + cdef int B(self, int i) nogil + + cdef int H(self, int i) nogil + + cdef int L(self, int i, int idx) nogil + cdef int R(self, int i, int idx) nogil + + cdef bint empty(self) nogil + + cdef bint eol(self) nogil + + cdef bint is_final(self) nogil + + cdef bint has_head(self, int i) nogil + + cdef bint stack_is_connected(self) nogil + + cdef int stack_depth(self) nogil + + cdef int buffer_length(self) nogil + + cdef void push(self) nogil + + cdef void pop(self) nogil + + cdef void add_arc(self, int head, int child, int label) nogil + + cdef void del_arc(self, int head, int child) nogil + + cdef void set_sent_end(self, int i) nogil + + cdef void clone(self, StateClass src) nogil + + diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx new file mode 100644 index 000000000..511283da3 --- /dev/null +++ b/spacy/syntax/stateclass.pyx @@ -0,0 +1,126 @@ +from libc.string cimport memcpy, memset +from libc.stdint cimport uint32_t +from .vocab cimport EMPTY_LEXEME + + +memset(&EMPTY_TOKEN, 0, sizeof(TokenC)) +EMPTY_TOKEN.lex = &EMPTY_LEXEME + + +cdef class StateClass: + def __cinit__(self, int length): + self.mem = Pool() + self._stack = self.mem.alloc(sizeof(int), length) + self._buffer = self.mem.alloc(sizeof(int), length) + self._sent = self.mem.alloc(sizeof(TokenC*), length) + self.length = 0 + for i in range(self.length): + self._buffer[i] = i + + cdef int S(self, int i) nogil: + if self._s_i - (i+1) < 0: + return -1 + return self._stack[self._s_i - (i+1)] + + cdef int B(self, int i) nogil: + if (i + self._b_i) >= self.length: + return -1 + return self._buffer[self._b_i + i] + + cdef int H(self, int i) nogil: + if i < 0 or i >= self.length: + return -1 + return self._sent[i].head + i + + cdef int L(self, int i, int idx) nogil: + if 0 <= _popcount(self.safe_get(i).l_kids) <= idx: + return -1 + return _nth_significant_bit(self.safe_get(i).l_kids, idx) + + cdef int R(self, int i, int idx) nogil: + if 0 <= _popcount(self.safe_get(i).r_kids) <= idx: + return -1 + return _nth_significant_bit(self.safe_get(i).r_kids, idx) + + cdef bint empty(self) nogil: + return self._s_i <= 0 + + cdef bint eol(self) nogil: + return self._b_i >= self.length + + cdef bint is_final(self) nogil: + return self.eol() and self.empty() + + cdef bint has_head(self, int i) nogil: + return self.safe_get(i).head != 0 + + cdef bint stack_is_connected(self) nogil: + return False + + cdef int stack_depth(self) nogil: + return self._s_i + + cdef int buffer_length(self) nogil: + return self.length - self._b_i + + cdef void push(self) nogil: + self._stack[self._s_i] = self.B(0) + self._s_i += 1 + self._b_i += 1 + + cdef void pop(self) nogil: + self._s_i -= 1 + + cdef void add_arc(self, int head, int child, int label) nogil: + if self.has_head(child): + self.del_arc(self.H(child), child) + + cdef int dist = head - child + self._sent[child].head = dist + self._sent[child].dep = label + # Keep a bit-vector tracking child dependencies. If a word has a child at + # offset i from it, set that bit (tracking left and right separately) + if child > head: + self._sent[head].r_kids |= 1 << (-dist) + else: + self._sent[head].l_kids |= 1 << dist + + cdef void del_arc(self, int head, int child) nogil: + cdef int dist = head - child + if child > head: + self._sent[head].r_kids &= ~(1 << (-dist)) + else: + self._sent[head].l_kids &= ~(1 << dist) + + cdef void set_sent_end(self, int i) nogil: + if 0 < i < self.length: + self._sent[i].sent_end = True + + cdef void clone(self, StateClass src) nogil: + memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) + memcpy(self._stack, src._stack, self.length * sizeof(int)) + memcpy(self._buffer, src._buffer, self.length * sizeof(int)) + self._b_i = src._b_i + self._s_i = src._s_i + + +# From https://en.wikipedia.org/wiki/Hamming_weight +cdef inline uint32_t _popcount(uint32_t x) nogil: + """Find number of non-zero bits.""" + cdef int count = 0 + while x != 0: + x &= x - 1 + count += 1 + return count + + +cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: + cdef int i + for i in range(32): + if bits & (1 << i): + if n < 1: + return i + n -= 1 + return 0 + + From 2b9629ed629ead735039c70b34427f7b9a8748ac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2015 01:41:09 +0200 Subject: [PATCH 04/75] * Begin adding stateclass to ArcEager --- spacy/syntax/arc_eager.pyx | 76 ++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index afa05bd9a..f1dbcf426 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,6 +1,9 @@ # cython: profile=True from __future__ import unicode_literals +import ctypes +import os + from ._state cimport State from ._state cimport has_head, get_idx, get_s0, get_n0, get_left, get_right from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep @@ -15,6 +18,12 @@ from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC +from libc.stdint cimport uint32_t +from libc.string cimport memcpy + +from cymem.cymem cimport Pool +from ..stateclass cimport StateClass + DEF NON_MONOTONIC = True DEF USE_BREAK = True @@ -78,7 +87,6 @@ cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) return 0 - cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: if gold.labels[child] == -1: return True @@ -110,6 +118,11 @@ cdef class Shift: cdef bint is_valid(const State* s, int label) except -1: return not at_eol(s) + @staticmethod + cdef bint _new_is_valid(StateClass st, int label) except -1: + return not st.eol() + + @staticmethod cdef int transition(State* state, int label) except -1: # Set the dep label, in case we need it after we reduce @@ -133,6 +146,13 @@ cdef class Shift: cdef class Reduce: + @staticmethod + cdef bint _new_is_valid(StateClass st, int label) except -1: + if NON_MONOTONIC: + return st.stack_depth() >= 2 #and not missing_brackets(s) + else: + return st.stack_depth() >= 2 and st.has_head(st.S(0)) + @staticmethod cdef bint is_valid(const State* s, int label) except -1: if NON_MONOTONIC: @@ -165,6 +185,13 @@ cdef class Reduce: cdef class LeftArc: + @staticmethod + cdef bint _new_is_valid(StateClass st, int label) except -1: + if NON_MONOTONIC: + return st.stack_depth() >= 1 #and not missing_brackets(s) + else: + return st.stack_depth() >= 1 and not st.has_head(st.S(0)) + @staticmethod cdef bint is_valid(const State* s, int label) except -1: if NON_MONOTONIC: @@ -206,6 +233,10 @@ cdef class RightArc: cdef bint is_valid(const State* s, int label) except -1: return s.stack_len >= 1 and not at_eol(s) + @staticmethod + cdef bint _new_is_valid(StateClass st, int label) except -1: + return st.stack_depth() >= 1 and not st.eol() + @staticmethod cdef int transition(State* state, int label) except -1: add_dep(state, state.stack[0], state.i, label) @@ -230,6 +261,32 @@ cdef class RightArc: cdef class Break: + @staticmethod + cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef int i + if not USE_BREAK: + return False + elif st.eol(): + return False + elif st.stack_depth() < 1: + return False + elif NON_MONOTONIC: + return True + else: + # In the Break transition paper, they have this constraint that prevents + # Break if stack is disconnected. But, if we're doing non-monotonic parsing, + # we prefer to relax this constraint. This is helpful in parsing whole + # documents, because then we don't get stuck with words on the stack. + seen_headless = False + for i in range(st.stack_depth()): + if not st.has_head(st.S(i)): + if seen_headless: + return False + else: + seen_headless = True + # TODO: Constituency constraints + return True + @staticmethod cdef bint is_valid(const State* s, int label) except -1: cdef int i @@ -584,14 +641,17 @@ cdef class ArcEager(TransitionSystem): output[i] = move_costs[move] + label_cost_funcs[move](s, &gold.c, label) cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: + cdef Pool mem = Pool() + cdef StateClass stcls = StateClass.from_struct(mem, s) cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(s, -1) - is_valid[REDUCE] = Reduce.is_valid(s, -1) - is_valid[LEFT] = LeftArc.is_valid(s, -1) - is_valid[RIGHT] = RightArc.is_valid(s, -1) - is_valid[BREAK] = Break.is_valid(s, -1) - is_valid[CONSTITUENT] = Constituent.is_valid(s, -1) - is_valid[ADJUST] = Adjust.is_valid(s, -1) + #is_valid[SHIFT] = Shift.is_valid(s, -1) + is_valid[SHIFT] = Shift._new_is_valid(stcls, -1) + is_valid[REDUCE] = Reduce._new_is_valid(stcls, -1) + is_valid[LEFT] = LeftArc._new_is_valid(stcls, -1) + is_valid[RIGHT] = RightArc._new_is_valid(stcls, -1) + is_valid[BREAK] = Break._new_is_valid(stcls, -1) + is_valid[CONSTITUENT] = False # Constituent._new_is_valid(s, -1) + is_valid[ADJUST] = False # Adjust._new_is_valid(s, -1) cdef Transition best cdef weight_t score = MIN_SCORE cdef int i From 0895d454fb5a0c546ab3842efaf1c24b3b5d5961 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2015 21:20:14 +0200 Subject: [PATCH 05/75] * Prepare to switch to using state class, instead of state struct --- spacy/syntax/_parse_features.pxd | 1 + spacy/syntax/_parse_features.pyx | 52 ++++++++++++++ spacy/syntax/_state.pyx | 40 ++++++----- spacy/syntax/arc_eager.pxd | 1 - spacy/syntax/arc_eager.pyx | 75 ++++++++++++++------ spacy/syntax/parser.pyx | 5 +- spacy/syntax/stateclass.pxd | 59 ++++++---------- spacy/syntax/stateclass.pyx | 106 ++++++++++++++++++++++++----- spacy/syntax/transition_system.pxd | 7 -- spacy/syntax/transition_system.pyx | 40 ++--------- 10 files changed, 245 insertions(+), 141 deletions(-) diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd index 0a5965671..d1410a742 100644 --- a/spacy/syntax/_parse_features.pxd +++ b/spacy/syntax/_parse_features.pxd @@ -4,6 +4,7 @@ from ._state cimport State cdef int fill_context(atom_t* context, State* state) except -1 +cdef int _new_fill_context(atom_t* context, State* state) except -1 # Context elements # Ensure each token's attributes are listed: w, p, c, c6, c4. The order diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index adbaff05d..2787e1c80 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -20,6 +20,11 @@ from ._state cimport has_head, get_left, get_right from ._state cimport count_left_kids, count_right_kids +from .stateclass cimport StateClass + +from cymem.cymem cimport Pool + + cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: if token is NULL: context[0] = 0 @@ -60,6 +65,53 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[10] = token.ent_iob context[11] = token.ent_type +cdef int _new_fill_context(atom_t* ctxt, State* state) except -1: + # Take care to fill every element of context! + # We could memset, but this makes it very easy to have broken features that + # make almost no impact on accuracy. If instead they're unset, the impact + # tends to be dramatic, so we get an obvious regression to fix... + cdef StateClass st = StateClass(state.sent_len) + st.from_struct(state) + fill_token(&ctxt[S2w], st.S_(2)) + fill_token(&ctxt[S1w], st.S_(1)) + fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) + fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) + fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) + fill_token(&ctxt[S0w], st.S_(0)) + fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) + fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) + fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) + fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) + fill_token(&ctxt[N0w], st.B_(0)) + fill_token(&ctxt[N1w], st.B_(1)) + fill_token(&ctxt[N2w], st.B_(2)) + fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) + fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) + + # TODO + fill_token(&ctxt[E0w], get_e0(state)) + fill_token(&ctxt[E1w], get_e1(state)) + + if st.stack_depth() >= 1 and not st.eol(): + ctxt[dist] = min(st.S(0) - st.B(0), 5) # TODO: This is backwards!! + else: + ctxt[dist] = 0 + ctxt[N0lv] = min(st.n_L(st.B(0)), 5) + ctxt[S0lv] = min(st.n_L(st.S(0)), 5) + ctxt[S0rv] = min(st.n_R(st.S(0)), 5) + ctxt[S1lv] = min(st.n_L(st.S(1)), 5) + ctxt[S1rv] = min(st.n_R(st.S(1)), 5) + + ctxt[S0_has_head] = 0 + ctxt[S1_has_head] = 0 + ctxt[S2_has_head] = 0 + if st.stack_depth() >= 1: + ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 + if st.stack_depth() >= 2: + ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 + if st.stack_depth() >= 3: + ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 + cdef int fill_context(atom_t* context, State* state) except -1: # Take care to fill every element of context! diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 3a876df2e..e499b6461 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -115,29 +115,33 @@ cdef bint has_head(const TokenC* t) nogil: cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil: return _new_get_left(s, head, idx) - #cdef uint32_t kids = head.l_kids - #if kids == 0: - # return NULL - #cdef int offset = _nth_significant_bit(kids, idx) - #cdef const TokenC* child = head - offset - #if child >= s.sent: - # return child - ##else: - # return NULL +""" + cdef uint32_t kids = head.l_kids + if kids == 0: + return NULL + cdef int offset = _nth_significant_bit(kids, idx) + cdef const TokenC* child = head - offset + if child >= s.sent: + return child + else: + return NULL +""" cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: return _new_get_right(s, head, idx) - #cdef uint32_t kids = head.r_kids - #if kids == 0: - # return NULL - #cdef int offset = _nth_significant_bit(kids, idx) - #cdef const TokenC* child = head + offset - #if child < (s.sent + s.sent_len): - # return child - #else: - # return NULL +""" + cdef uint32_t kids = head.r_kids + if kids == 0: + return NULL + cdef int offset = _nth_significant_bit(kids, idx) + cdef const TokenC* child = head + offset + if child < (s.sent + s.sent_len): + return child + else: + return NULL +""" cdef int count_left_kids(const TokenC* head) nogil: return _popcount(head.l_kids) diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 606629c66..aedfe6031 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -6,6 +6,5 @@ from thinc.typedefs cimport weight_t from ._state cimport State from .transition_system cimport TransitionSystem, Transition - cdef class ArcEager(TransitionSystem): pass diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f1dbcf426..d667e4d86 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -22,7 +22,7 @@ from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool -from ..stateclass cimport StateClass +from .stateclass cimport StateClass DEF NON_MONOTONIC = True @@ -59,32 +59,63 @@ MOVE_NAMES[ADJUST] = 'A' # Helper functions for the arc-eager oracle cdef int push_cost(const State* st, const GoldParseC* gold, int target) except -1: - # When we push a word, we can't make arcs to or from the stack. So, we lose - # any of those arcs. + cdef StateClass stcls = StateClass(st.sent_len) + stcls.from_struct(st) cdef int cost = 0 - cost += head_in_stack(st, target, gold.heads) - cost += children_in_stack(st, target, gold.heads) - # If we can Break, we shouldn't push + cdef int i, S_i + for i in range(stcls.stack_depth()): + S_i = stcls.S(i) + if gold.heads[target] == S_i: + cost += 1 + if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): + cost += 1 cost += Break.is_valid(st, -1) and Break.move_cost(st, gold) == 0 return cost + # When we push a word, we can't make arcs to or from the stack. So, we lose + # any of those arcs. + #cost += head_in_stack(st, target, gold.heads) + #cost += children_in_stack(st, target, gold.heads) + # If we can Break, we shouldn't push + #cost += Break.is_valid(st, -1) and Break.move_cost(st, gold) == 0 + #return cost cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1: + cdef StateClass stcls = StateClass(st.sent_len) + stcls.from_struct(st) cdef int cost = 0 - cost += children_in_buffer(st, target, gold.heads) - cost += head_in_buffer(st, target, gold.heads) + cdef int i, B_i + for i in range(stcls.buffer_length()): + B_i = stcls.B(i) + cost += gold.heads[B_i] == target + cost += gold.heads[target] == B_i + if gold.heads[B_i] == B_i or gold.heads[B_i] < target: + break return cost + #cost += children_in_buffer(st, target, gold.heads) + #cost += head_in_buffer(st, target, gold.heads) + #return cost cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) except -1: + cdef StateClass stcls = StateClass(st.sent_len) + stcls.from_struct(st) if arc_is_gold(gold, head, child): return 0 - elif (child + st.sent[child].head) == gold.heads[child]: + elif stcls.H(child) == gold.heads[child]: return 1 - elif gold.heads[child] >= st.i: + elif gold.heads[child] >= stcls.B(0): return 1 else: return 0 + #if arc_is_gold(gold, head, child): + # return 0 + #elif (child + st.sent[child].head) == gold.heads[child]: + # return 1 + #elif gold.heads[child] >= st.i: + # return 1 + #else: + # return 0 cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: @@ -122,7 +153,6 @@ cdef class Shift: cdef bint _new_is_valid(StateClass st, int label) except -1: return not st.eol() - @staticmethod cdef int transition(State* state, int label) except -1: # Set the dep label, in case we need it after we reduce @@ -596,14 +626,17 @@ cdef class ArcEager(TransitionSystem): state.sent[i].dep = root_label cdef int set_valid(self, bint* output, const State* state) except -1: + raise Exception + cdef StateClass stcls = StateClass(state.sent_len) + stcls.from_struct(state) cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(state, -1) - is_valid[REDUCE] = Reduce.is_valid(state, -1) - is_valid[LEFT] = LeftArc.is_valid(state, -1) - is_valid[RIGHT] = RightArc.is_valid(state, -1) - is_valid[BREAK] = Break.is_valid(state, -1) - is_valid[CONSTITUENT] = Constituent.is_valid(state, -1) - is_valid[ADJUST] = Adjust.is_valid(state, -1) + is_valid[SHIFT] = Shift._new_is_valid(stcls, -1) + is_valid[REDUCE] = Reduce._new_is_valid(stcls, -1) + is_valid[LEFT] = LeftArc._new_is_valid(stcls, -1) + is_valid[RIGHT] = RightArc._new_is_valid(stcls, -1) + is_valid[BREAK] = Break._new_is_valid(stcls, -1) + is_valid[CONSTITUENT] = False # Constituent.is_valid(state, -1) + is_valid[ADJUST] = False # Adjust.is_valid(state, -1) cdef int i for i in range(self.n_moves): output[i] = is_valid[self.c[i].move] @@ -641,10 +674,10 @@ cdef class ArcEager(TransitionSystem): output[i] = move_costs[move] + label_cost_funcs[move](s, &gold.c, label) cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.from_struct(mem, s) + assert s is not NULL + cdef StateClass stcls = StateClass(s.sent_len) + stcls.from_struct(s) cdef bint[N_MOVES] is_valid - #is_valid[SHIFT] = Shift.is_valid(s, -1) is_valid[SHIFT] = Shift._new_is_valid(stcls, -1) is_valid[REDUCE] = Reduce._new_is_valid(stcls, -1) is_valid[LEFT] = LeftArc._new_is_valid(stcls, -1) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 47921563b..712673d85 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -1,4 +1,5 @@ # cython: profile=True +# cython: experimental_cpp_class_def=True """ MALT-style dependency parser """ @@ -38,7 +39,9 @@ from ._state cimport State, new_state, copy_state, is_final, push_stack, get_lef from ..gold cimport GoldParse from . import _parse_features -from ._parse_features cimport fill_context, CONTEXT_SIZE +from ._parse_features cimport CONTEXT_SIZE +from ._parse_features cimport _new_fill_context as fill_context +#from ._parse_features cimport fill_context DEBUG = False diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 63e22cac5..e543a4529 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -2,14 +2,11 @@ from libc.string cimport memcpy, memset from cymem.cymem cimport Pool -from structs cimport TokenC +from ..structs cimport TokenC -from .syntax._state cimport State +from ._state cimport State -from .vocab cimport EMPTY_LEXEME - - -cdef TokenC EMPTY_TOKEN +from ..vocab cimport EMPTY_LEXEME cdef class StateClass: @@ -17,45 +14,13 @@ cdef class StateClass: cdef int* _stack cdef int* _buffer cdef TokenC* _sent + cdef TokenC _empty_token cdef int length cdef int _s_i cdef int _b_i - @staticmethod - cdef inline StateClass init(const TokenC* sent, int length): - cdef StateClass self = StateClass(length) - memcpy(self._sent, sent, sizeof(TokenC*) * length) - return self + cdef int from_struct(self, const State* state) except -1 - @staticmethod - cdef inline StateClass from_struct(Pool mem, const State* state): - cdef StateClass self = StateClass.init(state.sent, state.sent_len) - memcpy(self._stack, state.stack - state.stack_len, sizeof(int) * state.stack_len) - self._s_i = state.stack_len - 1 - self._b_i = state.i - return self - - cdef inline const TokenC* S_(self, int i) nogil: - return self.safe_get(self.S(i)) - - cdef inline const TokenC* B_(self, int i) nogil: - return self.safe_get(self.B(i)) - - cdef inline const TokenC* H_(self, int i) nogil: - return self.safe_get(self.B(i)) - - cdef inline const TokenC* L_(self, int i, int idx) nogil: - return self.safe_get(self.L(i, idx)) - - cdef inline const TokenC* R_(self, int i, int idx) nogil: - return self.safe_get(self.R(i, idx)) - - cdef inline const TokenC* safe_get(self, int i) nogil: - if 0 >= i >= self.length: - return &EMPTY_TOKEN - else: - return self._sent - cdef int S(self, int i) nogil cdef int B(self, int i) nogil @@ -64,6 +29,16 @@ cdef class StateClass: cdef int L(self, int i, int idx) nogil cdef int R(self, int i, int idx) nogil + cdef const TokenC* S_(self, int i) nogil + cdef const TokenC* B_(self, int i) nogil + + cdef const TokenC* H_(self, int i) nogil + + cdef const TokenC* L_(self, int i, int idx) nogil + cdef const TokenC* R_(self, int i, int idx) nogil + + cdef const TokenC* safe_get(self, int i) nogil + cdef bint empty(self) nogil cdef bint eol(self) nogil @@ -72,6 +47,10 @@ cdef class StateClass: cdef bint has_head(self, int i) nogil + cdef int n_L(self, int i) nogil + + cdef int n_R(self, int i) nogil + cdef bint stack_is_connected(self) nogil cdef int stack_depth(self) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 511283da3..724e1fadb 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,24 +1,33 @@ from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t -from .vocab cimport EMPTY_LEXEME - - -memset(&EMPTY_TOKEN, 0, sizeof(TokenC)) -EMPTY_TOKEN.lex = &EMPTY_LEXEME +from ..vocab cimport EMPTY_LEXEME cdef class StateClass: - def __cinit__(self, int length): - self.mem = Pool() - self._stack = self.mem.alloc(sizeof(int), length) - self._buffer = self.mem.alloc(sizeof(int), length) - self._sent = self.mem.alloc(sizeof(TokenC*), length) - self.length = 0 - for i in range(self.length): + def __init__(self, int length): + cdef Pool mem = Pool() + self._buffer = mem.alloc(length, sizeof(int)) + self._stack = mem.alloc(length, sizeof(int)) + self._sent = mem.alloc(length, sizeof(TokenC)) + self.mem = mem + self.length = length + self._s_i = 0 + self._b_i = 0 + cdef int i + for i in range(length): self._buffer[i] = i + self._empty_token.lex = &EMPTY_LEXEME + + cdef int from_struct(self, const State* state) except -1: + self._s_i = state.stack_len + self._b_i = state.i + memcpy(self._sent, state.sent, sizeof(TokenC) * self.length) + cdef int i + for i in range(state.stack_len): + self._stack[self._s_i - (i+1)] = state.stack[-i] cdef int S(self, int i) nogil: - if self._s_i - (i+1) < 0: + if i >= self._s_i: return -1 return self._stack[self._s_i - (i+1)] @@ -33,14 +42,71 @@ cdef class StateClass: return self._sent[i].head + i cdef int L(self, int i, int idx) nogil: - if 0 <= _popcount(self.safe_get(i).l_kids) <= idx: + if idx < 1: return -1 - return _nth_significant_bit(self.safe_get(i).l_kids, idx) + if i < 0 or i >= self.length: + return -1 + cdef const TokenC* target = &self._sent[i] + cdef const TokenC* ptr = self._sent + + while ptr < target: + # If this head is still to the right of us, we can skip to it + # No token that's between this token and this head could be our + # child. + if (ptr.head >= 1) and (ptr + ptr.head) < target: + ptr += ptr.head + + elif ptr + ptr.head == target: + idx -= 1 + if idx == 0: + return ptr - self._sent + ptr += 1 + else: + ptr += 1 + return -1 cdef int R(self, int i, int idx) nogil: - if 0 <= _popcount(self.safe_get(i).r_kids) <= idx: + if idx < 1: return -1 - return _nth_significant_bit(self.safe_get(i).r_kids, idx) + if i < 0 or i >= self.length: + return -1 + cdef const TokenC* ptr = self._sent + (self.length - 1) + cdef const TokenC* target = &self._sent[i] + while ptr > target: + # If this head is still to the right of us, we can skip to it + # No token that's between this token and this head could be our + # child. + if (ptr.head < 0) and ((ptr + ptr.head) > target): + ptr += ptr.head + elif ptr + ptr.head == target: + idx -= 1 + if idx == 0: + return ptr - self._sent + ptr -= 1 + else: + ptr -= 1 + return -1 + + cdef const TokenC* S_(self, int i) nogil: + return self.safe_get(self.S(i)) + + cdef const TokenC* B_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef const TokenC* H_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef const TokenC* L_(self, int i, int idx) nogil: + return self.safe_get(self.L(i, idx)) + + cdef const TokenC* R_(self, int i, int idx) nogil: + return self.safe_get(self.R(i, idx)) + + cdef const TokenC* safe_get(self, int i) nogil: + if i < 0 or i >= self.length: + return &self._empty_token + else: + return &self._sent[i] cdef bint empty(self) nogil: return self._s_i <= 0 @@ -54,6 +120,12 @@ cdef class StateClass: cdef bint has_head(self, int i) nogil: return self.safe_get(i).head != 0 + cdef int n_L(self, int i) nogil: + return _popcount(self.safe_get(i).l_kids) + + cdef int n_R(self, int i) nogil: + return _popcount(self.safe_get(i).r_kids) + cdef bint stack_is_connected(self) nogil: return False diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 584e361df..5f21987a5 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -51,10 +51,3 @@ cdef class TransitionSystem: cdef Transition best_gold(self, const weight_t* scores, const State* state, GoldParse gold) except * - - -#cdef class PyState: -# """Provide a Python class for testing purposes.""" -# cdef Pool mem -# cdef TransitionSystem system -# cdef State* _state diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 67e325240..664af67c4 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -3,6 +3,8 @@ from ._state cimport State from ..structs cimport TokenC from thinc.typedefs cimport weight_t +from .stateclass cimport StateClass + cdef weight_t MIN_SCORE = -90000 @@ -55,6 +57,8 @@ cdef class TransitionSystem: cdef Transition best_gold(self, const weight_t* scores, const State* s, GoldParse gold) except *: + cdef StateClass stcls = StateClass(s.sent_len) + stcls.from_struct(s) cdef Transition best cdef weight_t score = MIN_SCORE cdef int i @@ -65,39 +69,3 @@ cdef class TransitionSystem: score = scores[i] assert score > MIN_SCORE return best - - -#cdef class PyState: -# """Provide a Python class for testing purposes.""" -# def __init__(self, GoldParse gold): -# self.mem = Pool() -# self.system = EntityRecognition(labels) -# self._state = init_state(self.mem, tokens, gold.length) -# -# def transition(self, name): -# cdef const Transition* trans = self._transition_by_name(name) -# trans.do(trans, self._state) -# -# def is_valid(self, name): -# cdef const Transition* trans = self._transition_by_name(name) -# return _is_valid(trans.move, trans.label, self._state) -# -# def is_gold(self, name): -# cdef const Transition* trans = self._transition_by_name(name) -# return _get_const(trans, self._state, self._gold) -# -# property ent: -# def __get__(self): -# pass -# -# property n_ents: -# def __get__(self): -# pass -# -# property i: -# def __get__(self): -# pass -# -# property open_entity: -# def __get__(self): -# return entity_is_open(self._s) From 09617a4638cf6066102afdf1443810f9bf80d0c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2015 21:20:33 +0200 Subject: [PATCH 06/75] * Whitespace --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 010cfa06b..a7414f7dd 100644 --- a/setup.py +++ b/setup.py @@ -150,11 +150,12 @@ def main(modules, is_pypy): MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', + 'spacy.syntax.stateclass', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', - 'spacy.gold', 'spacy.orth', + 'spacy.gold', 'spacy.orth', 'spacy.syntax.ner'] From e0cf61f5917f5c04e4d08747553b20b683b3bf44 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2015 23:23:28 +0200 Subject: [PATCH 07/75] * Move StateClass into the interface for is_valid --- spacy/syntax/_parse_features.pxd | 3 +- spacy/syntax/_parse_features.pyx | 8 +- spacy/syntax/arc_eager.pxd | 2 + spacy/syntax/arc_eager.pyx | 320 +++-------------------------- spacy/syntax/ner.pyx | 45 ++-- spacy/syntax/parser.pyx | 21 +- spacy/syntax/stateclass.pxd | 15 +- spacy/syntax/stateclass.pyx | 35 +++- spacy/syntax/transition_system.pxd | 10 +- spacy/syntax/transition_system.pyx | 13 +- 10 files changed, 132 insertions(+), 340 deletions(-) diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd index d1410a742..6c1f0d6a5 100644 --- a/spacy/syntax/_parse_features.pxd +++ b/spacy/syntax/_parse_features.pxd @@ -1,10 +1,11 @@ from thinc.typedefs cimport atom_t from ._state cimport State +from .stateclass cimport StateClass cdef int fill_context(atom_t* context, State* state) except -1 -cdef int _new_fill_context(atom_t* context, State* state) except -1 +cdef int _new_fill_context(atom_t* context, StateClass state) except -1 # Context elements # Ensure each token's attributes are listed: w, p, c, c6, c4. The order diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 2787e1c80..db59c82a2 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -65,13 +65,11 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[10] = token.ent_iob context[11] = token.ent_type -cdef int _new_fill_context(atom_t* ctxt, State* state) except -1: +cdef int _new_fill_context(atom_t* ctxt, StateClass st) except -1: # Take care to fill every element of context! # We could memset, but this makes it very easy to have broken features that # make almost no impact on accuracy. If instead they're unset, the impact # tends to be dramatic, so we get an obvious regression to fix... - cdef StateClass st = StateClass(state.sent_len) - st.from_struct(state) fill_token(&ctxt[S2w], st.S_(2)) fill_token(&ctxt[S1w], st.S_(1)) fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) @@ -89,8 +87,8 @@ cdef int _new_fill_context(atom_t* ctxt, State* state) except -1: fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) # TODO - fill_token(&ctxt[E0w], get_e0(state)) - fill_token(&ctxt[E1w], get_e1(state)) + fill_token(&ctxt[E0w], st.E_(0)) + fill_token(&ctxt[E1w], st.E_(1)) if st.stack_depth() >= 1 and not st.eol(): ctxt[dist] = min(st.S(0) - st.B(0), 5) # TODO: This is backwards!! diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index aedfe6031..81b26f703 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -4,6 +4,8 @@ from thinc.typedefs cimport weight_t from ._state cimport State +from .stateclass cimport StateClass + from .transition_system cimport TransitionSystem, Transition cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index d667e4d86..c5bd5475b 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -40,9 +40,6 @@ cdef enum: BREAK - CONSTITUENT - ADJUST - N_MOVES @@ -52,8 +49,6 @@ MOVE_NAMES[REDUCE] = 'D' MOVE_NAMES[LEFT] = 'L' MOVE_NAMES[RIGHT] = 'R' MOVE_NAMES[BREAK] = 'B' -MOVE_NAMES[CONSTITUENT] = 'C' -MOVE_NAMES[ADJUST] = 'A' # Helper functions for the arc-eager oracle @@ -69,15 +64,8 @@ cdef int push_cost(const State* st, const GoldParseC* gold, int target) except - cost += 1 if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): cost += 1 - cost += Break.is_valid(st, -1) and Break.move_cost(st, gold) == 0 + cost += Break.is_valid(stcls, -1) and Break.move_cost(st, gold) == 0 return cost - # When we push a word, we can't make arcs to or from the stack. So, we lose - # any of those arcs. - #cost += head_in_stack(st, target, gold.heads) - #cost += children_in_stack(st, target, gold.heads) - # If we can Break, we shouldn't push - #cost += Break.is_valid(st, -1) and Break.move_cost(st, gold) == 0 - #return cost cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1: @@ -92,10 +80,6 @@ cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1 if gold.heads[B_i] == B_i or gold.heads[B_i] < target: break return cost - #cost += children_in_buffer(st, target, gold.heads) - #cost += head_in_buffer(st, target, gold.heads) - #return cost - cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) except -1: cdef StateClass stcls = StateClass(st.sent_len) @@ -108,14 +92,6 @@ cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) return 1 else: return 0 - #if arc_is_gold(gold, head, child): - # return 0 - #elif (child + st.sent[child].head) == gold.heads[child]: - # return 1 - #elif gold.heads[child] >= st.i: - # return 1 - #else: - # return 0 cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: @@ -146,11 +122,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) except -1: cdef class Shift: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return not at_eol(s) - - @staticmethod - cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: return not st.eol() @staticmethod @@ -162,8 +134,6 @@ cdef class Shift: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Shift.is_valid(s, label): - return 9000 return Shift.move_cost(s, gold) + Shift.label_cost(s, gold, label) @staticmethod @@ -177,19 +147,12 @@ cdef class Shift: cdef class Reduce: @staticmethod - cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: if NON_MONOTONIC: return st.stack_depth() >= 2 #and not missing_brackets(s) else: return st.stack_depth() >= 2 and st.has_head(st.S(0)) - @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - if NON_MONOTONIC: - return s.stack_len >= 2 #and not missing_brackets(s) - else: - return s.stack_len >= 2 and has_head(get_s0(s)) - @staticmethod cdef int transition(State* state, int label) except -1: if NON_MONOTONIC and not has_head(get_s0(state)) and state.stack_len >= 2: @@ -198,8 +161,6 @@ cdef class Reduce: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Reduce.is_valid(s, label): - return 9000 return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) @staticmethod @@ -216,19 +177,12 @@ cdef class Reduce: cdef class LeftArc: @staticmethod - cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: if NON_MONOTONIC: return st.stack_depth() >= 1 #and not missing_brackets(s) else: return st.stack_depth() >= 1 and not st.has_head(st.S(0)) - @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - if NON_MONOTONIC: - return s.stack_len >= 1 #and not missing_brackets(s) - else: - return s.stack_len >= 1 and not has_head(get_s0(s)) - @staticmethod cdef int transition(State* state, int label) except -1: # Interpret left-arcs from EOL as attachment to root @@ -240,15 +194,11 @@ cdef class LeftArc: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not LeftArc.is_valid(s, label): - return 9000 return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - if not LeftArc.is_valid(s, -1): - return 9000 - elif arc_is_gold(gold, s.i, s.stack[0]): + if arc_is_gold(gold, s.i, s.stack[0]): return 0 else: return pop_cost(s, gold, s.stack[0]) + arc_cost(s, gold, s.i, s.stack[0]) @@ -260,11 +210,7 @@ cdef class LeftArc: cdef class RightArc: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return s.stack_len >= 1 and not at_eol(s) - - @staticmethod - cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: return st.stack_depth() >= 1 and not st.eol() @staticmethod @@ -274,8 +220,6 @@ cdef class RightArc: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not RightArc.is_valid(s, label): - return 9000 return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) @staticmethod @@ -292,7 +236,7 @@ cdef class RightArc: cdef class Break: @staticmethod - cdef bint _new_is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: cdef int i if not USE_BREAK: return False @@ -317,32 +261,6 @@ cdef class Break: # TODO: Constituency constraints return True - @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - cdef int i - if not USE_BREAK: - return False - elif at_eol(s): - return False - elif s.stack_len < 1: - return False - elif NON_MONOTONIC: - return True - else: - # In the Break transition paper, they have this constraint that prevents - # Break if stack is disconnected. But, if we're doing non-monotonic parsing, - # we prefer to relax this constraint. This is helpful in parsing whole - # documents, because then we don't get stuck with words on the stack. - seen_headless = False - for i in range(s.stack_len): - if s.sent[s.stack[-i]].head == 0: - if seen_headless: - return False - else: - seen_headless = True - # TODO: Constituency constraints - return True - @staticmethod cdef int transition(State* state, int label) except -1: state.sent[state.i-1].sent_end = True @@ -354,10 +272,7 @@ cdef class Break: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Break.is_valid(s, label): - return 9000 - else: - return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) + return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) @staticmethod cdef int move_cost(const State* s, const GoldParseC* gold) except -1: @@ -374,163 +289,11 @@ cdef class Break: return 0 -cdef class Constituent: - @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - if s.stack_len < 1: - return False - return False - #else: - # # If all stack elements are popped, can't constituent - # for i in range(s.ctnts.stack_len): - # if not s.ctnts.is_popped[-i]: - # return True - # else: - # return False - - @staticmethod - cdef int transition(State* state, int label) except -1: - return False - #cdef Constituent* bracket = new_bracket(state.ctnts) - - #bracket.parent = NULL - #bracket.label = self.label - #bracket.head = get_s0(state) - #bracket.length = 0 - - #attach(bracket, state.ctnts.stack) - # Attach rightward children. They're in the brackets array somewhere - # between here and B0. - #cdef Constituent* node - #cdef const TokenC* node_gov - #for i in range(1, bracket - state.ctnts.stack): - # node = bracket - i - # node_gov = node.head + node.head.head - # if node_gov == bracket.head: - # attach(bracket, node) - - @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Constituent.is_valid(s, label): - return 9000 - raise Exception("Constituent move should be disabled currently") - # The gold standard is indexed by end, then by start, then a set of labels - #brackets = gold.brackets(get_s0(s).r_edge, {}) - #if not brackets: - # return 2 # 2 loss for bad bracket, only 1 for good bracket bad label - # Index the current brackets in the state - #existing = set() - #for i in range(s.ctnt_len): - # if ctnt.end == s.r_edge and ctnt.label == self.label: - # existing.add(ctnt.start) - #cdef int loss = 2 - #cdef const TokenC* child - #cdef const TokenC* s0 = get_s0(s) - #cdef int n_left = count_left_kids(s0) - # Iterate over the possible start positions, and check whether we have a - # (start, end, label) match to the gold tree - #for i in range(1, n_left): - # child = get_left(s, s0, i) - # if child.l_edge in brackets and child.l_edge not in existing: - # if self.label in brackets[child.l_edge] - # return 0 - # else: - # loss = 1 # If we see the start position, set loss to 1 - #return loss - - @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - if not Constituent.is_valid(s, -1): - return 9000 - raise Exception("Constituent move should be disabled currently") - - @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - return 0 - - - -cdef class Adjust: - @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return False - #if s.ctnts.stack_len < 2: - # return False - - #cdef const Constituent* b1 = s.ctnts.stack[-1] - #cdef const Constituent* b0 = s.ctnts.stack[0] - - #if (b1.head + b1.head.head) != b0.head: - # return False - #elif b0.head >= b1.head: - # return False - #elif b0 >= b1: - # return False - - @staticmethod - cdef int transition(State* state, int label) except -1: - return False - #cdef Constituent* b0 = state.ctnts.stack[0] - #cdef Constituent* b1 = state.ctnts.stack[1] - - #assert (b1.head + b1.head.head) == b0.head - #assert b0.head < b1.head - #assert b0 < b1 - - #attach(b0, b1) - ## Pop B1 from stack, but keep B0 on top - #state.ctnts.stack -= 1 - #state.ctnts.stack[0] = b0 - - @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Adjust.is_valid(s, label): - return 9000 - raise Exception("Adjust move should be disabled currently") - - @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - if not Adjust.is_valid(s, -1): - return 9000 - raise Exception("Adjust move should be disabled currently") - - @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - return 0 - # The gold standard is indexed by end, then by start, then a set of labels - #gold_starts = gold.brackets(get_s0(s).r_edge, {}) - # Case 1: There are 0 brackets ending at this word. - # --> Cost is sunk, but must allow brackets to begin - #if not gold_starts: - # return 0 - # Is the top bracket correct? - #gold_labels = gold_starts.get(s.ctnt.start, set()) - # TODO: Case where we have a unary rule - # TODO: Case where two brackets end on this word, with top bracket starting - # before - - #cdef const TokenC* child - #cdef const TokenC* s0 = get_s0(s) - #cdef int n_left = count_left_kids(s0) - #cdef int i - # Iterate over the possible start positions, and check whether we have a - # (start, end, label) match to the gold tree - #for i in range(1, n_left): - # child = get_left(s, s0, i) - # if child.l_edge in brackets: - # if self.label in brackets[child.l_edge]: - # return 0 - # else: - # loss = 1 # If we see the start position, set loss to 1 - #return loss - - cdef class ArcEager(TransitionSystem): @classmethod def get_labels(cls, gold_parses): move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {}, - LEFT: {'ROOT': True}, BREAK: {'ROOT': True}, - CONSTITUENT: {}, ADJUST: {'': True}} + LEFT: {'ROOT': True}, BREAK: {'ROOT': True}} for raw_text, sents in gold_parses: for (ids, words, tags, heads, labels, iob), ctnts in sents: for child, head, label in zip(ids, heads, labels): @@ -539,8 +302,6 @@ cdef class ArcEager(TransitionSystem): move_labels[RIGHT][label] = True elif head > child: move_labels[LEFT][label] = True - for start, end, label in ctnts: - move_labels[CONSTITUENT][label] = True return move_labels cdef int preprocess_gold(self, GoldParse gold) except -1: @@ -604,14 +365,6 @@ cdef class ArcEager(TransitionSystem): t.is_valid = Break.is_valid t.do = Break.transition t.get_cost = Break.cost - elif move == CONSTITUENT: - t.is_valid = Constituent.is_valid - t.do = Constituent.transition - t.get_cost = Constituent.cost - elif move == ADJUST: - t.is_valid = Adjust.is_valid - t.do = Adjust.transition - t.get_cost = Adjust.cost else: raise Exception(move) return t @@ -625,18 +378,13 @@ cdef class ArcEager(TransitionSystem): if state.sent[i].head == 0 and state.sent[i].dep == 0: state.sent[i].dep = root_label - cdef int set_valid(self, bint* output, const State* state) except -1: - raise Exception - cdef StateClass stcls = StateClass(state.sent_len) - stcls.from_struct(state) + cdef int set_valid(self, bint* output, StateClass stcls) except -1: cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift._new_is_valid(stcls, -1) - is_valid[REDUCE] = Reduce._new_is_valid(stcls, -1) - is_valid[LEFT] = LeftArc._new_is_valid(stcls, -1) - is_valid[RIGHT] = RightArc._new_is_valid(stcls, -1) - is_valid[BREAK] = Break._new_is_valid(stcls, -1) - is_valid[CONSTITUENT] = False # Constituent.is_valid(state, -1) - is_valid[ADJUST] = False # Adjust.is_valid(state, -1) + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) cdef int i for i in range(self.n_moves): output[i] = is_valid[self.c[i].move] @@ -653,38 +401,36 @@ cdef class ArcEager(TransitionSystem): move_cost_funcs[LEFT] = LeftArc.move_cost move_cost_funcs[RIGHT] = RightArc.move_cost move_cost_funcs[BREAK] = Break.move_cost - move_cost_funcs[CONSTITUENT] = Constituent.move_cost - move_cost_funcs[ADJUST] = Adjust.move_cost label_cost_funcs[SHIFT] = Shift.label_cost label_cost_funcs[REDUCE] = Reduce.label_cost label_cost_funcs[LEFT] = LeftArc.label_cost label_cost_funcs[RIGHT] = RightArc.label_cost label_cost_funcs[BREAK] = Break.label_cost - label_cost_funcs[CONSTITUENT] = Constituent.label_cost - label_cost_funcs[ADJUST] = Adjust.label_cost cdef int* labels = gold.c.labels cdef int* heads = gold.c.heads - for i in range(self.n_moves): - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == -1: - move_costs[move] = move_cost_funcs[move](s, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](s, &gold.c, label) - cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: - assert s is not NULL cdef StateClass stcls = StateClass(s.sent_len) stcls.from_struct(s) + self.set_valid(self._is_valid, stcls) + for i in range(self.n_moves): + if not self._is_valid[i]: + output[i] = 9000 + else: + move = self.c[i].move + label = self.c[i].label + if move_costs[move] == -1: + move_costs[move] = move_cost_funcs[move](s, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](s, &gold.c, label) + + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift._new_is_valid(stcls, -1) - is_valid[REDUCE] = Reduce._new_is_valid(stcls, -1) - is_valid[LEFT] = LeftArc._new_is_valid(stcls, -1) - is_valid[RIGHT] = RightArc._new_is_valid(stcls, -1) - is_valid[BREAK] = Break._new_is_valid(stcls, -1) - is_valid[CONSTITUENT] = False # Constituent._new_is_valid(s, -1) - is_valid[ADJUST] = False # Adjust._new_is_valid(s, -1) + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) cdef Transition best cdef weight_t score = MIN_SCORE cdef int i @@ -703,5 +449,3 @@ cdef class ArcEager(TransitionSystem): best.label = self.c[i].label score = scores[i] return best - - diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 9f4512483..5791b0845 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -11,6 +11,8 @@ from thinc.typedefs cimport weight_t from ..gold cimport GoldParseC from ..gold cimport GoldParse +from .stateclass cimport StateClass + cdef enum: MISSING @@ -132,14 +134,14 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: cdef int best = -1 cdef weight_t score = -90000 cdef const Transition* m cdef int i for i in range(self.n_moves): m = &self.c[i] - if m.is_valid(s, m.label) and scores[i] > score: + if m.is_valid(stcls, m.label) and scores[i] > score: best = i score = scores[i] assert best >= 0 @@ -147,16 +149,16 @@ cdef class BiluoPushDown(TransitionSystem): t.score = score return t - cdef int set_valid(self, bint* output, const State* s) except -1: + cdef int set_valid(self, bint* output, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): m = &self.c[i] - output[i] = m.is_valid(s, m.label) + output[i] = m.is_valid(stcls, m.label) cdef class Missing: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: + cdef bint is_valid(StateClass st, int label) except -1: return False @staticmethod @@ -170,8 +172,8 @@ cdef class Missing: cdef class Begin: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return label != 0 and not entity_is_open(s) + cdef bint is_valid(StateClass st, int label) except -1: + return label != 0 and not st.entity_is_open() @staticmethod cdef int transition(State* s, int label) except -1: @@ -186,8 +188,6 @@ cdef class Begin: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Begin.is_valid(s, label): - return 9000 cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label @@ -203,10 +203,11 @@ cdef class Begin: # B, Gold U --> False (P) return 1 + cdef class In: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return entity_is_open(s) and label != 0 and s.ent.label == label + cdef bint is_valid(StateClass st, int label) except -1: + return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod cdef int transition(State* s, int label) except -1: @@ -216,8 +217,6 @@ cdef class In: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not In.is_valid(s, label): - return 9000 move = IN cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT cdef int g_act = gold.ner[s.i].move @@ -245,11 +244,10 @@ cdef class In: return 1 - cdef class Last: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return entity_is_open(s) and label != 0 and s.ent.label == label + cdef bint is_valid(StateClass st, int label) except -1: + return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod cdef int transition(State* s, int label) except -1: @@ -260,8 +258,6 @@ cdef class Last: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Last.is_valid(s, label): - return 9000 move = LAST cdef int g_act = gold.ner[s.i].move @@ -290,8 +286,8 @@ cdef class Last: cdef class Unit: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return label != 0 and not entity_is_open(s) + cdef bint is_valid(StateClass st, int label) except -1: + return label != 0 and not st.entity_is_open() @staticmethod cdef int transition(State* s, int label) except -1: @@ -306,8 +302,6 @@ cdef class Unit: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Unit.is_valid(s, label): - return 9000 cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label @@ -326,8 +320,8 @@ cdef class Unit: cdef class Out: @staticmethod - cdef bint is_valid(const State* s, int label) except -1: - return not entity_is_open(s) + cdef bint is_valid(StateClass st, int label) except -1: + return not st.entity_is_open() @staticmethod cdef int transition(State* s, int label) except -1: @@ -336,9 +330,6 @@ cdef class Out: @staticmethod cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - if not Out.is_valid(s, label): - return 9000 - cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 712673d85..b2436feb1 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -40,8 +40,9 @@ from ..gold cimport GoldParse from . import _parse_features from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport _new_fill_context as fill_context -#from ._parse_features cimport fill_context +from ._parse_features cimport _new_fill_context +from ._parse_features cimport fill_context +from .stateclass cimport StateClass DEBUG = False @@ -104,11 +105,13 @@ cdef class Parser: cdef Pool mem = Pool() cdef State* state = new_state(mem, tokens.data, tokens.length) self.moves.initialize_state(state) + cdef StateClass stcls = StateClass(state.sent_len) cdef Transition guess while not is_final(state): - fill_context(context, state) + stcls.from_struct(state) + _new_fill_context(context, stcls) scores = self.model.score(context) - guess = self.moves.best_valid(scores, state) + guess = self.moves.best_valid(scores, stcls) guess.do(state, guess.label) self.moves.finalize_state(state) tokens.set_parse(state.sent) @@ -133,12 +136,14 @@ cdef class Parser: cdef const weight_t* scores cdef Transition guess cdef Transition best + cdef StateClass stcls = StateClass(state.sent_len) cdef atom_t[CONTEXT_SIZE] context loss = 0 while not is_final(state): - fill_context(context, state) + stcls.from_struct(state) + _new_fill_context(context, stcls) scores = self.model.score(context) - guess = self.moves.best_valid(scores, state) + guess = self.moves.best_valid(scores, stcls) best = self.moves.best_gold(scores, state, gold) cost = guess.get_cost(state, &gold.c, guess.label) self.model.update(context, guess.clas, best.clas, cost) @@ -174,12 +179,14 @@ cdef class Parser: cdef int i, j, cost cdef bint is_valid cdef const Transition* move + cdef StateClass stcls = StateClass(gold.length) for i in range(beam.size): state = beam.at(i) + stcls.from_struct(state) if not is_final(state): fill_context(context, state) self.model.set_scores(beam.scores[i], context) - self.moves.set_valid(beam.is_valid[i], state) + self.moves.set_valid(beam.is_valid[i], stcls) if gold is not None: for i in range(beam.size): diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index e543a4529..c5b9dfa47 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -2,7 +2,7 @@ from libc.string cimport memcpy, memset from cymem.cymem cimport Pool -from ..structs cimport TokenC +from ..structs cimport TokenC, Entity from ._state cimport State @@ -14,10 +14,12 @@ cdef class StateClass: cdef int* _stack cdef int* _buffer cdef TokenC* _sent + cdef Entity* _ents cdef TokenC _empty_token cdef int length cdef int _s_i cdef int _b_i + cdef int _e_i cdef int from_struct(self, const State* state) except -1 @@ -25,6 +27,7 @@ cdef class StateClass: cdef int B(self, int i) nogil cdef int H(self, int i) nogil + cdef int E(self, int i) nogil cdef int L(self, int i, int idx) nogil cdef int R(self, int i, int idx) nogil @@ -33,6 +36,7 @@ cdef class StateClass: cdef const TokenC* B_(self, int i) nogil cdef const TokenC* H_(self, int i) nogil + cdef const TokenC* E_(self, int i) nogil cdef const TokenC* L_(self, int i, int idx) nogil cdef const TokenC* R_(self, int i, int idx) nogil @@ -40,12 +44,15 @@ cdef class StateClass: cdef const TokenC* safe_get(self, int i) nogil cdef bint empty(self) nogil + + cdef bint entity_is_open(self) nogil cdef bint eol(self) nogil cdef bint is_final(self) nogil cdef bint has_head(self, int i) nogil + cdef int n_L(self, int i) nogil @@ -64,6 +71,12 @@ cdef class StateClass: cdef void add_arc(self, int head, int child, int label) nogil cdef void del_arc(self, int head, int child) nogil + + cdef void open_ent(self, int label) nogil + + cdef void close_ent(self) nogil + + cdef void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil cdef void set_sent_end(self, int i) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 724e1fadb..d15a2b650 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -9,10 +9,12 @@ cdef class StateClass: self._buffer = mem.alloc(length, sizeof(int)) self._stack = mem.alloc(length, sizeof(int)) self._sent = mem.alloc(length, sizeof(TokenC)) + self._ents = mem.alloc(length, sizeof(Entity)) self.mem = mem self.length = length self._s_i = 0 self._b_i = 0 + self._e_i = 0 cdef int i for i in range(length): self._buffer[i] = i @@ -21,10 +23,13 @@ cdef class StateClass: cdef int from_struct(self, const State* state) except -1: self._s_i = state.stack_len self._b_i = state.i + self._e_i = state.ents_len memcpy(self._sent, state.sent, sizeof(TokenC) * self.length) cdef int i for i in range(state.stack_len): self._stack[self._s_i - (i+1)] = state.stack[-i] + for i in range(state.ents_len): + self._ents[i] = state.ent[-i] cdef int S(self, int i) nogil: if i >= self._s_i: @@ -41,6 +46,9 @@ cdef class StateClass: return -1 return self._sent[i].head + i + cdef int E(self, int i) nogil: + return -1 + cdef int L(self, int i, int idx) nogil: if idx < 1: return -1 @@ -94,7 +102,10 @@ cdef class StateClass: return self.safe_get(self.B(i)) cdef const TokenC* H_(self, int i) nogil: - return self.safe_get(self.B(i)) + return self.safe_get(self.H(i)) + + cdef const TokenC* E_(self, int i) nogil: + return self.safe_get(self.E(i)) cdef const TokenC* L_(self, int i, int idx) nogil: return self.safe_get(self.L(i, idx)) @@ -129,6 +140,11 @@ cdef class StateClass: cdef bint stack_is_connected(self) nogil: return False + cdef bint entity_is_open(self) nogil: + if self._e_i < 1: + return False + return self._ents[self._e_i-1].end != 0 + cdef int stack_depth(self) nogil: return self._s_i @@ -164,6 +180,21 @@ cdef class StateClass: else: self._sent[head].l_kids &= ~(1 << dist) + cdef void open_ent(self, int label) nogil: + self._ents[self._e_i].start = self.B(0) + self._ents[self._e_i].label = label + self._ents[self._e_i].end = 0 + self._e_i += 1 + + cdef void close_ent(self) nogil: + self._ents[self._e_i].end = self.B(0)+1 + self._sent[self.B(0)].ent_iob = 1 + + cdef void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: + if 0 <= i < self.length: + self._sent[i].ent_iob = ent_iob + self._sent[i].ent_type = ent_type + cdef void set_sent_end(self, int i) nogil: if 0 < i < self.length: self._sent[i].sent_end = True @@ -172,8 +203,10 @@ cdef class StateClass: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) memcpy(self._stack, src._stack, self.length * sizeof(int)) memcpy(self._buffer, src._buffer, self.length * sizeof(int)) + memcpy(self._ents, src._ents, self.length * sizeof(int)) self._b_i = src._b_i self._s_i = src._s_i + self._e_i = src._e_i # From https://en.wikipedia.org/wiki/Hamming_weight diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5f21987a5..80a4af3a3 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -7,6 +7,8 @@ from ..gold cimport GoldParse from ..gold cimport GoldParseC from ..strings cimport StringStore +from .stateclass cimport StateClass + cdef struct Transition: int clas @@ -15,7 +17,7 @@ cdef struct Transition: weight_t score - bint (*is_valid)(const State* state, int label) except -1 + bint (*is_valid)(StateClass state, int label) except -1 int (*get_cost)(const State* state, const GoldParseC* gold, int label) except -1 int (*do)(State* state, int label) except -1 @@ -43,11 +45,11 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except * - cdef int set_valid(self, bint* output, const State* state) except -1 + cdef int set_valid(self, bint* output, StateClass state) except -1 cdef int set_costs(self, int* output, const State* state, GoldParse gold) except -1 - cdef Transition best_valid(self, const weight_t* scores, const State* state) except * + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - cdef Transition best_gold(self, const weight_t* scores, const State* state, + cdef Transition best_gold(self, const weight_t* scores, State* state, GoldParse gold) except * diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 664af67c4..4725f8b74 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -44,10 +44,10 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: + cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: raise NotImplementedError - cdef int set_valid(self, bint* output, const State* state) except -1: + cdef int set_valid(self, bint* output, StateClass state) except -1: raise NotImplementedError cdef int set_costs(self, int* output, const State* s, GoldParse gold) except -1: @@ -63,9 +63,10 @@ cdef class TransitionSystem: cdef weight_t score = MIN_SCORE cdef int i for i in range(self.n_moves): - cost = self.c[i].get_cost(s, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] + if self.c[i].is_valid(stcls, self.c[i].label): + cost = self.c[i].get_cost(s, &gold.c, self.c[i].label) + if scores[i] > score and cost == 0: + best = self.c[i] + score = scores[i] assert score > MIN_SCORE return best From 4b98b3e9c8e3c8a76a4b933b3b419f94709a3689 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 00:40:43 +0200 Subject: [PATCH 08/75] * Cost functions now take StateClass argument, instead of State*. --- spacy/syntax/arc_eager.pyx | 82 ++++++++++++++---------------- spacy/syntax/ner.pyx | 24 ++++----- spacy/syntax/parser.pyx | 6 +-- spacy/syntax/transition_system.pxd | 12 ++--- spacy/syntax/transition_system.pyx | 13 ++--- 5 files changed, 65 insertions(+), 72 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index c5bd5475b..a99c383f5 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -53,9 +53,7 @@ MOVE_NAMES[BREAK] = 'B' # Helper functions for the arc-eager oracle -cdef int push_cost(const State* st, const GoldParseC* gold, int target) except -1: - cdef StateClass stcls = StateClass(st.sent_len) - stcls.from_struct(st) +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) except -1: cdef int cost = 0 cdef int i, S_i for i in range(stcls.stack_depth()): @@ -64,13 +62,11 @@ cdef int push_cost(const State* st, const GoldParseC* gold, int target) except - cost += 1 if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): cost += 1 - cost += Break.is_valid(stcls, -1) and Break.move_cost(st, gold) == 0 + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 return cost -cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1: - cdef StateClass stcls = StateClass(st.sent_len) - stcls.from_struct(st) +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) except -1: cdef int cost = 0 cdef int i, B_i for i in range(stcls.buffer_length()): @@ -81,9 +77,7 @@ cdef int pop_cost(const State* st, const GoldParseC* gold, int target) except -1 break return cost -cdef int arc_cost(const State* st, const GoldParseC* gold, int head, int child) except -1: - cdef StateClass stcls = StateClass(st.sent_len) - stcls.from_struct(st) +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) except -1: if arc_is_gold(gold, head, child): return 0 elif stcls.H(child) == gold.heads[child]: @@ -133,15 +127,15 @@ cdef class Shift: push_stack(state) @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: - return Shift.move_cost(s, gold) + Shift.label_cost(s, gold, label) + cdef int cost(StateClass st, const GoldParseC* gold, int label) except -1: + return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - return push_cost(s, gold, s.i) + cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + return push_cost(s, gold, s.B(0)) @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: return 0 @@ -160,18 +154,18 @@ cdef class Reduce: pop_stack(state) @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: + cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: if NON_MONOTONIC: - return pop_cost(s, gold, s.stack[0]) + return pop_cost(s, gold, s.S(0)) else: - return children_in_buffer(s, s.stack[0], gold.heads) + return children_in_buffer(s, s.S(0), gold.heads) @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: return 0 @@ -193,19 +187,19 @@ cdef class LeftArc: pop_stack(state) @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - if arc_is_gold(gold, s.i, s.stack[0]): + cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + if arc_is_gold(gold, s.B(0), s.S(0)): return 0 else: - return pop_cost(s, gold, s.stack[0]) + arc_cost(s, gold, s.i, s.stack[0]) + return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - return arc_is_gold(gold, s.i, s.stack[0]) and not label_is_gold(gold, s.i, s.stack[0], label) + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) cdef class RightArc: @@ -219,19 +213,19 @@ cdef class RightArc: push_stack(state) @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: - if arc_is_gold(gold, s.stack[0], s.i): + cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + if arc_is_gold(gold, s.S(0), s.B(0)): return 0 else: - return push_cost(s, gold, s.i) + arc_cost(s, gold, s.stack[0], s.i) + return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: - return arc_is_gold(gold, s.stack[0], s.i) and not label_is_gold(gold, s.stack[0], s.i, label) + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) cdef class Break: @@ -271,21 +265,25 @@ cdef class Break: state.stack_len -= 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) @staticmethod - cdef int move_cost(const State* s, const GoldParseC* gold) except -1: + cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: # When we break, we Reduce all of the words on the stack. cdef int cost = 0 # Number of deps between S0...Sn and N0...Nn - for i in range(s.i, s.sent_len): - cost += children_in_stack(s, i, gold.heads) - cost += head_in_stack(s, i, gold.heads) + cdef int i, B_i, S_i + for i in range(s.buffer_length()): + B_i = s.B(i) + for j in range(s.stack_depth()): + S_i = s.S(j) + cost += gold.heads[B_i] == S_i + cost += gold.heads[S_i] == B_i return cost @staticmethod - cdef int label_cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: return 0 @@ -389,7 +387,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): output[i] = is_valid[self.c[i].move] - cdef int set_costs(self, int* output, const State* s, GoldParse gold) except -1: + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -411,8 +409,6 @@ cdef class ArcEager(TransitionSystem): cdef int* labels = gold.c.labels cdef int* heads = gold.c.heads - cdef StateClass stcls = StateClass(s.sent_len) - stcls.from_struct(s) self.set_valid(self._is_valid, stcls) for i in range(self.n_moves): if not self._is_valid[i]: @@ -421,8 +417,8 @@ cdef class ArcEager(TransitionSystem): move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: - move_costs[move] = move_cost_funcs[move](s, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](s, &gold.c, label) + move_costs[move] = move_cost_funcs[move](stcls, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: cdef bint[N_MOVES] is_valid diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 5791b0845..01aec7769 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -36,18 +36,14 @@ MOVE_NAMES[OUT] = 'O' cdef do_func_t[N_MOVES] do_funcs -cdef bint entity_is_open(const State *s) except -1: - return s.ents_len >= 1 and s.ent.end == 0 - - -cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1: - if not entity_is_open(s): +cdef bint _entity_is_sunk(StateClass st, Transition* golds) except -1: + if not st.entity_is_open(): return False - cdef const Transition* gold = &golds[s.ent.start] + cdef const Transition* gold = &golds[st.E(0)] if gold.move != BEGIN and gold.move != UNIT: return True - elif gold.label != s.ent.label: + elif gold.label != st.E_(0).ent_type: return True else: return False @@ -166,7 +162,7 @@ cdef class Missing: raise NotImplementedError @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: return 9000 @@ -187,7 +183,7 @@ cdef class Begin: s.i += 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label @@ -216,7 +212,7 @@ cdef class In: s.i += 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: move = IN cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT cdef int g_act = gold.ner[s.i].move @@ -257,7 +253,7 @@ cdef class Last: s.i += 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: move = LAST cdef int g_act = gold.ner[s.i].move @@ -301,7 +297,7 @@ cdef class Unit: s.i += 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label @@ -329,7 +325,7 @@ cdef class Out: s.i += 1 @staticmethod - cdef int cost(const State* s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: cdef int g_act = gold.ner[s.i].move cdef int g_tag = gold.ner[s.i].label diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b2436feb1..2d4d2c3dc 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -144,8 +144,8 @@ cdef class Parser: _new_fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, state, gold) - cost = guess.get_cost(state, &gold.c, guess.label) + best = self.moves.best_gold(scores, stcls, gold) + cost = guess.get_cost(stcls, &gold.c, guess.label) self.model.update(context, guess.clas, best.clas, cost) guess.do(state, guess.label) loss += cost @@ -191,7 +191,7 @@ cdef class Parser: if gold is not None: for i in range(beam.size): state = beam.at(i) - self.moves.set_costs(beam.costs[i], state, gold) + self.moves.set_costs(beam.costs[i], stcls, gold) if follow_gold: for j in range(self.moves.n_moves): beam.is_valid[i][j] *= beam.costs[i][j] == 0 diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 80a4af3a3..5027e66be 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -18,13 +18,13 @@ cdef struct Transition: weight_t score bint (*is_valid)(StateClass state, int label) except -1 - int (*get_cost)(const State* state, const GoldParseC* gold, int label) except -1 + int (*get_cost)(StateClass state, const GoldParseC* gold, int label) except -1 int (*do)(State* state, int label) except -1 -ctypedef int (*get_cost_func_t)(const State* state, const GoldParseC* gold, int label) except -1 -ctypedef int (*move_cost_func_t)(const State* state, const GoldParseC* gold) except -1 -ctypedef int (*label_cost_func_t)(const State* state, const GoldParseC* gold, int label) except -1 +ctypedef int (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 +ctypedef int (*move_cost_func_t)(StateClass state, const GoldParseC* gold) except -1 +ctypedef int (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 ctypedef int (*do_func_t)(State* state, int label) except -1 @@ -47,9 +47,9 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, const State* state, GoldParse gold) except -1 + cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - cdef Transition best_gold(self, const weight_t* scores, State* state, + cdef Transition best_gold(self, const weight_t* scores, StateClass state, GoldParse gold) except * diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 4725f8b74..f1dd06320 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -50,21 +50,22 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1: raise NotImplementedError - cdef int set_costs(self, int* output, const State* s, GoldParse gold) except -1: + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: cdef int i for i in range(self.n_moves): - output[i] = self.c[i].get_cost(s, &gold.c, self.c[i].label) + if self.c[i].is_valid(stcls, self.c[i].label): + output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + else: + output[i] = 9000 - cdef Transition best_gold(self, const weight_t* scores, const State* s, + cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, GoldParse gold) except *: - cdef StateClass stcls = StateClass(s.sent_len) - stcls.from_struct(s) cdef Transition best cdef weight_t score = MIN_SCORE cdef int i for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(s, &gold.c, self.c[i].label) + cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) if scores[i] > score and cost == 0: best = self.c[i] score = scores[i] From d68c686ec17cfdd385cc9c75cf9fa5e019ed403b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 01:35:28 +0200 Subject: [PATCH 09/75] * Move StateClass into interface of transition functions --- spacy/syntax/arc_eager.pyx | 51 ++++++++++++++------------- spacy/syntax/ner.pyx | 55 +++++++++++++----------------- spacy/syntax/parser.pyx | 33 +++++++++++------- spacy/syntax/stateclass.pyx | 15 ++++++-- spacy/syntax/transition_system.pxd | 6 ++-- spacy/syntax/transition_system.pyx | 2 +- 6 files changed, 86 insertions(+), 76 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index a99c383f5..546ea5281 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -120,11 +120,11 @@ cdef class Shift: return not st.eol() @staticmethod - cdef int transition(State* state, int label) except -1: + cdef int transition(StateClass state, int label) except -1: # Set the dep label, in case we need it after we reduce if NON_MONOTONIC: - state.sent[state.i].dep = label - push_stack(state) + state._sent[state.B(0)].dep = label + state.push() @staticmethod cdef int cost(StateClass st, const GoldParseC* gold, int label) except -1: @@ -148,10 +148,10 @@ cdef class Reduce: return st.stack_depth() >= 2 and st.has_head(st.S(0)) @staticmethod - cdef int transition(State* state, int label) except -1: - if NON_MONOTONIC and not has_head(get_s0(state)) and state.stack_len >= 2: - add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep) - pop_stack(state) + cdef int transition(StateClass st, int label) except -1: + if NON_MONOTONIC and not st.has_head(st.S(0)) and st.stack_depth() >= 2: + st.add_arc(st.S(1), st.S(0), st.S_(0).dep) + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -178,13 +178,13 @@ cdef class LeftArc: return st.stack_depth() >= 1 and not st.has_head(st.S(0)) @staticmethod - cdef int transition(State* state, int label) except -1: + cdef int transition(StateClass st, int label) except -1: # Interpret left-arcs from EOL as attachment to root - if at_eol(state): - add_dep(state, state.stack[0], state.stack[0], label) + if st.eol(): + st.add_arc(st.S(0), st.S(0), label) else: - add_dep(state, state.i, state.stack[0], label) - pop_stack(state) + st.add_arc(st.B(0), st.S(0), label) + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -208,9 +208,9 @@ cdef class RightArc: return st.stack_depth() >= 1 and not st.eol() @staticmethod - cdef int transition(State* state, int label) except -1: - add_dep(state, state.stack[0], state.i, label) - push_stack(state) + cdef int transition(StateClass st, int label) except -1: + st.add_arc(st.S(0), st.B(0), label) + st.push() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -256,13 +256,12 @@ cdef class Break: return True @staticmethod - cdef int transition(State* state, int label) except -1: - state.sent[state.i-1].sent_end = True - while state.stack_len != 0: - if get_s0(state).head == 0: - get_s0(state).dep = label - state.stack -= 1 - state.stack_len -= 1 + cdef int transition(StateClass st, int label) except -1: + st.set_sent_end(st.B(0)-1) + while not st.empty(): + if not st.has_head(st.S(0)): + st._sent[st.S(0)].dep = label + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -370,11 +369,11 @@ cdef class ArcEager(TransitionSystem): cdef int initialize_state(self, State* state) except -1: push_stack(state) - cdef int finalize_state(self, State* state) except -1: + cdef int finalize_state(self, StateClass st) except -1: cdef int root_label = self.strings['ROOT'] - for i in range(state.sent_len): - if state.sent[i].head == 0 and state.sent[i].dep == 0: - state.sent[i].dep = root_label + for i in range(st.length): + if st._sent[i].head == 0 and st._sent[i].dep == 0: + st._sent[i].dep = root_label cdef int set_valid(self, bint* output, StateClass stcls) except -1: cdef bint[N_MOVES] is_valid diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 01aec7769..833d1f299 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -158,7 +158,7 @@ cdef class Missing: return False @staticmethod - cdef int transition(State* s, int label) except -1: + cdef int transition(StateClass s, int label) except -1: raise NotImplementedError @staticmethod @@ -172,15 +172,11 @@ cdef class Begin: return label != 0 and not st.entity_is_open() @staticmethod - cdef int transition(State* s, int label) except -1: - s.ent += 1 - s.ents_len += 1 - s.ent.start = s.i - s.ent.label = label - s.ent.end = 0 - s.sent[s.i].ent_iob = 3 - s.sent[s.i].ent_type = label - s.i += 1 + cdef int transition(StateClass st, int label) except -1: + st.open_ent(label) + st.set_ent_tag(st.B(0), 3, label) + st.push() + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -206,10 +202,10 @@ cdef class In: return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod - cdef int transition(State* s, int label) except -1: - s.sent[s.i].ent_iob = 1 - s.sent[s.i].ent_type = label - s.i += 1 + cdef int transition(StateClass st, int label) except -1: + st.set_ent_tag(st.B(0), 1, label) + st.push() + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -246,11 +242,10 @@ cdef class Last: return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod - cdef int transition(State* s, int label) except -1: - s.ent.end = s.i+1 - s.sent[s.i].ent_iob = 1 - s.sent[s.i].ent_type = label - s.i += 1 + cdef int transition(StateClass st, int label) except -1: + st.close_ent() + st.push() + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -286,15 +281,12 @@ cdef class Unit: return label != 0 and not st.entity_is_open() @staticmethod - cdef int transition(State* s, int label) except -1: - s.ent += 1 - s.ents_len += 1 - s.ent.start = s.i - s.ent.label = label - s.ent.end = s.i+1 - s.sent[s.i].ent_iob = 3 - s.sent[s.i].ent_type = label - s.i += 1 + cdef int transition(StateClass st, int label) except -1: + st.open_ent(label) + st.close_ent() + st.set_ent_tag(st.B(0), 3, label) + st.push() + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: @@ -320,9 +312,10 @@ cdef class Out: return not st.entity_is_open() @staticmethod - cdef int transition(State* s, int label) except -1: - s.sent[s.i].ent_iob = 2 - s.i += 1 + cdef int transition(StateClass st, int label) except -1: + st.set_ent_tag(st.B(0), 2, 0) + st.push() + st.pop() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2d4d2c3dc..1ff5a523f 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -106,15 +106,17 @@ cdef class Parser: cdef State* state = new_state(mem, tokens.data, tokens.length) self.moves.initialize_state(state) cdef StateClass stcls = StateClass(state.sent_len) + stcls.from_struct(state) cdef Transition guess - while not is_final(state): - stcls.from_struct(state) + words = [w.orth_ for w in tokens] + while not stcls.is_final(): + #print stcls.print_state(words) _new_fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) - guess.do(state, guess.label) - self.moves.finalize_state(state) - tokens.set_parse(state.sent) + guess.do(stcls, guess.label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -123,8 +125,9 @@ cdef class Parser: while not beam.is_done: self._advance_beam(beam, None, False) state = beam.at(0) - self.moves.finalize_state(state) - tokens.set_parse(state.sent) + #self.moves.finalize_state(state) + #tokens.set_parse(state.sent) + raise Exception def _greedy_train(self, Tokens tokens, GoldParse gold): cdef Pool mem = Pool() @@ -137,17 +140,18 @@ cdef class Parser: cdef Transition guess cdef Transition best cdef StateClass stcls = StateClass(state.sent_len) + stcls.from_struct(state) cdef atom_t[CONTEXT_SIZE] context loss = 0 - while not is_final(state): - stcls.from_struct(state) + words = [w.orth_ for w in tokens] + while not stcls.is_final(): _new_fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) best = self.moves.best_gold(scores, stcls, gold) cost = guess.get_cost(stcls, &gold.c, guess.label) self.model.update(context, guess.clas, best.clas, cost) - guess.do(state, guess.label) + guess.do(stcls, guess.label) loss += cost return loss @@ -203,14 +207,16 @@ cdef class Parser: cdef Pool mem = Pool() cdef State* state = new_state(mem, tokens.data, tokens.length) self.moves.initialize_state(state) + cdef StateClass stcls = StateClass(state.sent_len) + stcls.from_struct(state) cdef class_t clas cdef int n_feats for clas in hist: - fill_context(context, state) + _new_fill_context(context, stcls) feats = self.model._extractor.get_feats(context, &n_feats) count_feats(counts[clas], feats, n_feats, inc) - self.moves.c[clas].do(state, self.moves.c[clas].label) + self.moves.c[clas].do(stcls, self.moves.c[clas].label) # These are passed as callbacks to thinc.search.Beam @@ -220,7 +226,8 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) src = _src moves = _moves copy_state(dest, src) - moves[clas].do(dest, moves[clas].label) + raise Exception + #moves[clas].do(dest, moves[clas].label) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index d15a2b650..81227db26 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -126,7 +126,7 @@ cdef class StateClass: return self._b_i >= self.length cdef bint is_final(self) nogil: - return self.eol() and self.empty() + return self.eol() and self.stack_depth() <= 1 cdef bint has_head(self, int i) nogil: return self.safe_get(i).head != 0 @@ -196,7 +196,7 @@ cdef class StateClass: self._sent[i].ent_type = ent_type cdef void set_sent_end(self, int i) nogil: - if 0 < i < self.length: + if 0 <= i < self.length: self._sent[i].sent_end = True cdef void clone(self, StateClass src) nogil: @@ -207,6 +207,17 @@ cdef class StateClass: self._b_i = src._b_i self._s_i = src._s_i self._e_i = src._e_i + + def print_state(self, words): + words = list(words) + ['_'] + top = words[self.S(0)] + '_%d' % self.H(self.S(0)) + second = words[self.S(1)] + '_%d' % self.H(self.S(1)) + third = words[self.S(2)] + '_%d' % self.H(self.S(2)) + n0 = words[self.B(0)] + n1 = words[self.B(1)] + return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) + + # From https://en.wikipedia.org/wiki/Hamming_weight diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5027e66be..f144d282e 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -19,14 +19,14 @@ cdef struct Transition: bint (*is_valid)(StateClass state, int label) except -1 int (*get_cost)(StateClass state, const GoldParseC* gold, int label) except -1 - int (*do)(State* state, int label) except -1 + int (*do)(StateClass state, int label) except -1 ctypedef int (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 ctypedef int (*move_cost_func_t)(StateClass state, const GoldParseC* gold) except -1 ctypedef int (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 -ctypedef int (*do_func_t)(State* state, int label) except -1 +ctypedef int (*do_func_t)(StateClass state, int label) except -1 cdef class TransitionSystem: @@ -37,7 +37,7 @@ cdef class TransitionSystem: cdef readonly int n_moves cdef int initialize_state(self, State* state) except -1 - cdef int finalize_state(self, State* state) except -1 + cdef int finalize_state(self, StateClass state) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index f1dd06320..6d972bcf9 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -32,7 +32,7 @@ cdef class TransitionSystem: cdef int initialize_state(self, State* state) except -1: pass - cdef int finalize_state(self, State* state) except -1: + cdef int finalize_state(self, StateClass state) except -1: pass cdef int preprocess_gold(self, GoldParse gold) except -1: From f14a1526aa1439e29c61732b8f72b6d773ea966e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 01:39:07 +0200 Subject: [PATCH 10/75] * Remove version of fill_context that takes State* --- spacy/syntax/_parse_features.pxd | 3 +-- spacy/syntax/_parse_features.pyx | 46 +------------------------------- spacy/syntax/parser.pyx | 9 +++---- 3 files changed, 6 insertions(+), 52 deletions(-) diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd index 6c1f0d6a5..00aff5f9d 100644 --- a/spacy/syntax/_parse_features.pxd +++ b/spacy/syntax/_parse_features.pxd @@ -4,8 +4,7 @@ from ._state cimport State from .stateclass cimport StateClass -cdef int fill_context(atom_t* context, State* state) except -1 -cdef int _new_fill_context(atom_t* context, StateClass state) except -1 +cdef int fill_context(atom_t* context, StateClass state) except -1 # Context elements # Ensure each token's attributes are listed: w, p, c, c6, c4. The order diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index db59c82a2..64a83390c 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -65,7 +65,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[10] = token.ent_iob context[11] = token.ent_type -cdef int _new_fill_context(atom_t* ctxt, StateClass st) except -1: +cdef int fill_context(atom_t* ctxt, StateClass st) except -1: # Take care to fill every element of context! # We could memset, but this makes it very easy to have broken features that # make almost no impact on accuracy. If instead they're unset, the impact @@ -111,50 +111,6 @@ cdef int _new_fill_context(atom_t* ctxt, StateClass st) except -1: ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 -cdef int fill_context(atom_t* context, State* state) except -1: - # Take care to fill every element of context! - # We could memset, but this makes it very easy to have broken features that - # make almost no impact on accuracy. If instead they're unset, the impact - # tends to be dramatic, so we get an obvious regression to fix... - fill_token(&context[S2w], get_s2(state)) - fill_token(&context[S1w], get_s1(state)) - fill_token(&context[S1rw], get_right(state, get_s1(state), 1)) - fill_token(&context[S0lw], get_left(state, get_s0(state), 1)) - fill_token(&context[S0l2w], get_left(state, get_s0(state), 2)) - fill_token(&context[S0w], get_s0(state)) - fill_token(&context[S0r2w], get_right(state, get_s0(state), 2)) - fill_token(&context[S0rw], get_right(state, get_s0(state), 1)) - fill_token(&context[N0lw], get_left(state, get_n0(state), 1)) - fill_token(&context[N0l2w], get_left(state, get_n0(state), 2)) - fill_token(&context[N0w], get_n0(state)) - fill_token(&context[N1w], get_n1(state)) - fill_token(&context[N2w], get_n2(state)) - fill_token(&context[P1w], get_p1(state)) - fill_token(&context[P2w], get_p2(state)) - - fill_token(&context[E0w], get_e0(state)) - fill_token(&context[E1w], get_e1(state)) - if state.stack_len >= 1: - context[dist] = min(state.stack[0] - state.i, 5) - else: - context[dist] = 0 - context[N0lv] = min(count_left_kids(get_n0(state)), 5) - context[S0lv] = min(count_left_kids(get_s0(state)), 5) - context[S0rv] = min(count_right_kids(get_s0(state)), 5) - context[S1lv] = min(count_left_kids(get_s1(state)), 5) - context[S1rv] = min(count_right_kids(get_s1(state)), 5) - - context[S0_has_head] = 0 - context[S1_has_head] = 0 - context[S2_has_head] = 0 - if state.stack_len >= 1: - context[S0_has_head] = has_head(get_s0(state)) + 1 - if state.stack_len >= 2: - context[S1_has_head] = has_head(get_s1(state)) + 1 - if state.stack_len >= 3: - context[S2_has_head] = has_head(get_s2(state)) + 1 - - ner = ( (N0W,), (P1W,), diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 1ff5a523f..fab990ef9 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -40,7 +40,6 @@ from ..gold cimport GoldParse from . import _parse_features from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport _new_fill_context from ._parse_features cimport fill_context from .stateclass cimport StateClass @@ -111,7 +110,7 @@ cdef class Parser: words = [w.orth_ for w in tokens] while not stcls.is_final(): #print stcls.print_state(words) - _new_fill_context(context, stcls) + fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) guess.do(stcls, guess.label) @@ -145,7 +144,7 @@ cdef class Parser: loss = 0 words = [w.orth_ for w in tokens] while not stcls.is_final(): - _new_fill_context(context, stcls) + fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) best = self.moves.best_gold(scores, stcls, gold) @@ -188,7 +187,7 @@ cdef class Parser: state = beam.at(i) stcls.from_struct(state) if not is_final(state): - fill_context(context, state) + fill_context(context, stcls) self.model.set_scores(beam.scores[i], context) self.moves.set_valid(beam.is_valid[i], stcls) @@ -213,7 +212,7 @@ cdef class Parser: cdef class_t clas cdef int n_feats for clas in hist: - _new_fill_context(context, stcls) + fill_context(context, stcls) feats = self.model._extractor.get_feats(context, &n_feats) count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) From 6a94b64ecacac2b74a88d8c2e559e9e0931ad213 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 02:03:38 +0200 Subject: [PATCH 11/75] * Remove State* from parser.pyx entirely, switching over to StateClass. Beam parsing still untested. --- spacy/syntax/parser.pyx | 66 ++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index fab990ef9..93fdff043 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -35,7 +35,6 @@ from ..strings cimport StringStore from .arc_eager cimport TransitionSystem, Transition from .transition_system import OracleError -from ._state cimport State, new_state, copy_state, is_final, push_stack, get_left, get_n0 from ..gold cimport GoldParse from . import _parse_features @@ -43,6 +42,7 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass +from cpython.ref cimport PyObject DEBUG = False def set_debug(val): @@ -50,20 +50,6 @@ def set_debug(val): DEBUG = val -cdef unicode print_state(State* s, list words): - words = list(words) + ['EOL'] - top = words[s.stack[0]] + '_%d' % s.sent[s.stack[0]].head - second = words[s.stack[-1]] + '_%d' % s.sent[s.stack[-1]].head - third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head - n0 = words[s.i] if s.i < len(words) else 'EOL' - n1 = words[s.i + 1] if s.i+1 < len(words) else 'EOL' - if s.ents_len: - ent = '%s %d-%d' % (s.ent.label, s.ent.start, s.ent.end) - else: - ent = '-' - return ' '.join((ent, str(s.stack_len), third, second, top, '|', n0, n1)) - - def get_templates(name): pf = _parse_features if name == 'ner': @@ -102,10 +88,8 @@ cdef class Parser: cdef atom_t[CONTEXT_SIZE] context cdef int n_feats cdef Pool mem = Pool() - cdef State* state = new_state(mem, tokens.data, tokens.length) - self.moves.initialize_state(state) - cdef StateClass stcls = StateClass(state.sent_len) - stcls.from_struct(state) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) cdef Transition guess words = [w.orth_ for w in tokens] while not stcls.is_final(): @@ -123,23 +107,21 @@ cdef class Parser: beam.check_done(_check_final_state, NULL) while not beam.is_done: self._advance_beam(beam, None, False) - state = beam.at(0) + state = beam.at(0) #self.moves.finalize_state(state) #tokens.set_parse(state.sent) raise Exception def _greedy_train(self, Tokens tokens, GoldParse gold): cdef Pool mem = Pool() - cdef State* state = new_state(mem, tokens.data, tokens.length) - self.moves.initialize_state(state) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) cdef int cost cdef const Feature* feats cdef const weight_t* scores cdef Transition guess cdef Transition best - cdef StateClass stcls = StateClass(state.sent_len) - stcls.from_struct(state) cdef atom_t[CONTEXT_SIZE] context loss = 0 words = [w.orth_ for w in tokens] @@ -178,36 +160,32 @@ cdef class Parser: def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): cdef atom_t[CONTEXT_SIZE] context - cdef State* state cdef int i, j, cost cdef bint is_valid cdef const Transition* move cdef StateClass stcls = StateClass(gold.length) for i in range(beam.size): - state = beam.at(i) - stcls.from_struct(state) - if not is_final(state): + stcls = beam.at(i) + if not stcls.is_final(): fill_context(context, stcls) self.model.set_scores(beam.scores[i], context) self.moves.set_valid(beam.is_valid[i], stcls) if gold is not None: for i in range(beam.size): - state = beam.at(i) + stcls = beam.at(i) self.moves.set_costs(beam.costs[i], stcls, gold) if follow_gold: for j in range(self.moves.n_moves): beam.is_valid[i][j] *= beam.costs[i][j] == 0 - beam.advance(_transition_state, _hash_state, self.moves.c) + beam.advance(_transition_state, NULL, self.moves.c) beam.check_done(_check_final_state, NULL) def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): cdef atom_t[CONTEXT_SIZE] context cdef Pool mem = Pool() - cdef State* state = new_state(mem, tokens.data, tokens.length) - self.moves.initialize_state(state) - cdef StateClass stcls = StateClass(state.sent_len) - stcls.from_struct(state) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) cdef class_t clas cdef int n_feats @@ -221,24 +199,23 @@ cdef class Parser: # These are passed as callbacks to thinc.search.Beam cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src + dest = _dest + src = _src moves = _moves - copy_state(dest, src) - raise Exception - #moves[clas].do(dest, moves[clas].label) + dest.clone(src) + moves[clas].do(dest, moves[clas].label) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - state = new_state(mem, tokens, length) - push_stack(state) - return state + cdef StateClass st = StateClass.init(tokens, length) + return st -cdef int _check_final_state(void* state, void* extra_args) except -1: - return is_final(state) +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() +""" cdef hash_t _hash_state(void* _state, void* _) except 0: state = _state cdef atom_t[10] rep @@ -257,3 +234,4 @@ cdef hash_t _hash_state(void* _state, void* _) except 0: rep[8] = 0 rep[9] = state.sent[state.i].l_kids return hash64(rep, sizeof(atom_t) * 10, 0) +""" From 04b1cd9b8c44ef73d533900eed800193ac36bc51 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 04:20:23 +0200 Subject: [PATCH 12/75] * Greedy parsing working with new StateClass. Beam parsing broken --- spacy/syntax/arc_eager.pyx | 10 ++++- spacy/syntax/parser.pyx | 63 ++++++++++++++++++------------ spacy/syntax/stateclass.pxd | 9 +++++ spacy/syntax/stateclass.pyx | 5 +-- spacy/syntax/transition_system.pxd | 3 +- spacy/syntax/transition_system.pyx | 3 +- 6 files changed, 58 insertions(+), 35 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 546ea5281..99835e106 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -366,8 +366,8 @@ cdef class ArcEager(TransitionSystem): raise Exception(move) return t - cdef int initialize_state(self, State* state) except -1: - push_stack(state) + cdef int initialize_state(self, StateClass st) except -1: + st.push() cdef int finalize_state(self, StateClass st) except -1: cdef int root_label = self.strings['ROOT'] @@ -383,8 +383,11 @@ cdef class ArcEager(TransitionSystem): is_valid[RIGHT] = RightArc.is_valid(stcls, -1) is_valid[BREAK] = Break.is_valid(stcls, -1) cdef int i + n_valid = 0 for i in range(self.n_moves): output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: cdef int i, move, label @@ -409,6 +412,7 @@ cdef class ArcEager(TransitionSystem): cdef int* heads = gold.c.heads self.set_valid(self._is_valid, stcls) + n_gold = 0 for i in range(self.n_moves): if not self._is_valid[i]: output[i] = 9000 @@ -418,6 +422,8 @@ cdef class ArcEager(TransitionSystem): if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + assert n_gold >= 1 cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: cdef bint[N_MOVES] is_valid diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 93fdff043..b860425cd 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -5,6 +5,9 @@ MALT-style dependency parser """ from __future__ import unicode_literals cimport cython + +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF + from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy import random @@ -42,7 +45,6 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass -from cpython.ref cimport PyObject DEBUG = False def set_debug(val): @@ -108,9 +110,9 @@ cdef class Parser: while not beam.is_done: self._advance_beam(beam, None, False) state = beam.at(0) - #self.moves.finalize_state(state) - #tokens.set_parse(state.sent) - raise Exception + self.moves.finalize_state(state) + tokens.set_parse(state._sent) + _cleanup(beam) def _greedy_train(self, Tokens tokens, GoldParse gold): cdef Pool mem = Pool() @@ -156,6 +158,8 @@ cdef class Parser: else: counts = {} self.model._model.update(counts) + _cleanup(pred) + _cleanup(gold) return pred.loss def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): @@ -163,22 +167,23 @@ cdef class Parser: cdef int i, j, cost cdef bint is_valid cdef const Transition* move - cdef StateClass stcls = StateClass(gold.length) for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): fill_context(context, stcls) self.model.set_scores(beam.scores[i], context) self.moves.set_valid(beam.is_valid[i], stcls) - if gold is not None: for i in range(beam.size): stcls = beam.at(i) self.moves.set_costs(beam.costs[i], stcls, gold) if follow_gold: + n_true = 0 for j in range(self.moves.n_moves): beam.is_valid[i][j] *= beam.costs[i][j] == 0 - beam.advance(_transition_state, NULL, self.moves.c) + n_true += beam.is_valid[i][j] + assert n_true >= 1 + beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): @@ -208,6 +213,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef StateClass st = StateClass.init(tokens, length) + Py_INCREF(st) return st @@ -215,23 +221,28 @@ cdef int _check_final_state(void* _state, void* extra_args) except -1: return (_state).is_final() -""" -cdef hash_t _hash_state(void* _state, void* _) except 0: - state = _state - cdef atom_t[10] rep +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) - rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - rep[3] = state.i - rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - if get_left(state, get_n0(state), 1) != NULL: - rep[8] = get_left(state, get_n0(state), 1).dep - else: - rep[8] = 0 - rep[9] = state.sent[state.i].l_kids - return hash64(rep, sizeof(atom_t) * 10, 0) -""" +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index c5b9dfa47..1d6a58d29 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -21,6 +21,15 @@ cdef class StateClass: cdef int _b_i cdef int _e_i + @staticmethod + cdef inline StateClass init(const TokenC* sent, int length): + cdef StateClass self = StateClass(length) + cdef int i + for i in range(length): + self._sent[i] = sent[i] + self._buffer[i] = i + return self + cdef int from_struct(self, const State* state) except -1 cdef int S(self, int i) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 81227db26..c7568f7d0 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,6 +1,7 @@ from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t from ..vocab cimport EMPTY_LEXEME +from ..structs cimport Entity cdef class StateClass: @@ -203,7 +204,7 @@ cdef class StateClass: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) memcpy(self._stack, src._stack, self.length * sizeof(int)) memcpy(self._buffer, src._buffer, self.length * sizeof(int)) - memcpy(self._ents, src._ents, self.length * sizeof(int)) + memcpy(self._ents, src._ents, self.length * sizeof(Entity)) self._b_i = src._b_i self._s_i = src._s_i self._e_i = src._e_i @@ -216,8 +217,6 @@ cdef class StateClass: n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) - - # From https://en.wikipedia.org/wiki/Hamming_weight diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index f144d282e..adb093969 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,7 +2,6 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from ..structs cimport TokenC -from ._state cimport State from ..gold cimport GoldParse from ..gold cimport GoldParseC from ..strings cimport StringStore @@ -36,7 +35,7 @@ cdef class TransitionSystem: cdef bint* _is_valid cdef readonly int n_moves - cdef int initialize_state(self, State* state) except -1 + cdef int initialize_state(self, StateClass state) except -1 cdef int finalize_state(self, StateClass state) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 6d972bcf9..927498cba 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,5 +1,4 @@ from cymem.cymem cimport Pool -from ._state cimport State from ..structs cimport TokenC from thinc.typedefs cimport weight_t @@ -29,7 +28,7 @@ cdef class TransitionSystem: i += 1 self.c = moves - cdef int initialize_state(self, State* state) except -1: + cdef int initialize_state(self, StateClass state) except -1: pass cdef int finalize_state(self, StateClass state) except -1: From d70304b7dd4912f2c50d2f8293fdb1eb2112f4a0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 04:20:42 +0200 Subject: [PATCH 13/75] * Require newer thinc --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a7414f7dd..76a8d64f1 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ def run_setup(exts): ext_modules=exts, license="Dual: Commercial or AGPL", install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37', - 'thinc == 1.76', "unidecode", 'wget', 'plac', 'six', + 'thinc == 2.0', "unidecode", 'wget', 'plac', 'six', 'ujson'], setup_requires=["headers_workaround"], ) From 4575e7a60f9f1eda2c29db3c23a9ae96f3a9d2d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 06:33:39 +0200 Subject: [PATCH 14/75] * Fix beam search with new StateClass --- spacy/syntax/parser.pyx | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b860425cd..4be1046bc 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -107,8 +107,9 @@ cdef class Parser: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) beam.initialize(_init_state, tokens.length, tokens.data) beam.check_done(_check_final_state, NULL) + words = [w.orth_ for w in tokens] while not beam.is_done: - self._advance_beam(beam, None, False) + self._advance_beam(beam, None, False, words) state = beam.at(0) self.moves.finalize_state(state) tokens.set_parse(state._sent) @@ -147,9 +148,10 @@ cdef class Parser: gold.check_done(_check_final_state, NULL) violn = MaxViolation() + words = [w.orth_ for w in tokens] while not pred.is_done and not gold.is_done: - self._advance_beam(pred, gold_parse, False) - self._advance_beam(gold, gold_parse, True) + self._advance_beam(pred, gold_parse, False, words) + self._advance_beam(gold, gold_parse, True, words) violn.check(pred, gold) if pred.loss >= 1: counts = {clas: {} for clas in range(self.model.n_classes)} @@ -162,7 +164,7 @@ cdef class Parser: _cleanup(gold) return pred.loss - def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): + def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold, words): cdef atom_t[CONTEXT_SIZE] context cdef int i, j, cost cdef bint is_valid @@ -176,13 +178,11 @@ cdef class Parser: if gold is not None: for i in range(beam.size): stcls = beam.at(i) - self.moves.set_costs(beam.costs[i], stcls, gold) - if follow_gold: - n_true = 0 - for j in range(self.moves.n_moves): - beam.is_valid[i][j] *= beam.costs[i][j] == 0 - n_true += beam.is_valid[i][j] - assert n_true >= 1 + if not stcls.is_final(): + self.moves.set_costs(beam.costs[i], stcls, gold) + if follow_gold: + for j in range(self.moves.n_moves): + beam.is_valid[i][j] *= beam.costs[i][j] == 0 beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) @@ -213,6 +213,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef StateClass st = StateClass.init(tokens, length) + st.push() Py_INCREF(st) return st From e5570c97007b6ad4c05669552133f6958af1eb72 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 06:56:35 +0200 Subject: [PATCH 15/75] * Set nogil for oracle functions --- spacy/syntax/arc_eager.pyx | 71 +++++++++++++++--------------- spacy/syntax/transition_system.pxd | 14 +++--- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 99835e106..03a89cda4 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -53,7 +53,7 @@ MOVE_NAMES[BREAK] = 'B' # Helper functions for the arc-eager oracle -cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) except -1: +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: cdef int cost = 0 cdef int i, S_i for i in range(stcls.stack_depth()): @@ -66,7 +66,7 @@ cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) except return cost -cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) except -1: +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: cdef int cost = 0 cdef int i, B_i for i in range(stcls.buffer_length()): @@ -77,7 +77,7 @@ cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) except - break return cost -cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) except -1: +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: if arc_is_gold(gold, head, child): return 0 elif stcls.H(child) == gold.heads[child]: @@ -88,7 +88,7 @@ cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) return 0 -cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: if gold.labels[child] == -1: return True elif _is_gold_root(gold, head) and _is_gold_root(gold, child): @@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) except -1: return False -cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) except -1: +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: if gold.labels[child] == -1: return True elif label == -1: @@ -110,75 +110,75 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) return False -cdef bint _is_gold_root(const GoldParseC* gold, int word) except -1: +cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: return gold.labels[word] == -1 or gold.heads[word] == word cdef class Shift: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return not st.eol() @staticmethod - cdef int transition(StateClass state, int label) except -1: + cdef int transition(StateClass state, int label) nogil: # Set the dep label, in case we need it after we reduce if NON_MONOTONIC: state._sent[state.B(0)].dep = label state.push() @staticmethod - cdef int cost(StateClass st, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) @staticmethod - cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: return push_cost(s, gold, s.B(0)) @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 cdef class Reduce: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: if NON_MONOTONIC: return st.stack_depth() >= 2 #and not missing_brackets(s) else: return st.stack_depth() >= 2 and st.has_head(st.S(0)) @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: if NON_MONOTONIC and not st.has_head(st.S(0)) and st.stack_depth() >= 2: st.add_arc(st.S(1), st.S(0), st.S_(0).dep) st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) @staticmethod - cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: if NON_MONOTONIC: return pop_cost(s, gold, s.S(0)) else: return children_in_buffer(s, s.S(0), gold.heads) @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 cdef class LeftArc: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: if NON_MONOTONIC: return st.stack_depth() >= 1 #and not missing_brackets(s) else: return st.stack_depth() >= 1 and not st.has_head(st.S(0)) @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: # Interpret left-arcs from EOL as attachment to root if st.eol(): st.add_arc(st.S(0), st.S(0), label) @@ -187,50 +187,50 @@ cdef class LeftArc: st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod - cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: if arc_is_gold(gold, s.B(0), s.S(0)): return 0 else: return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) cdef class RightArc: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return st.stack_depth() >= 1 and not st.eol() @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.add_arc(st.S(0), st.B(0), label) st.push() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) @staticmethod - cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: if arc_is_gold(gold, s.S(0), s.B(0)): return 0 else: return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) cdef class Break: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: cdef int i if not USE_BREAK: return False @@ -256,7 +256,7 @@ cdef class Break: return True @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.set_sent_end(st.B(0)-1) while not st.empty(): if not st.has_head(st.S(0)): @@ -264,15 +264,15 @@ cdef class Break: st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) @staticmethod - cdef int move_cost(StateClass s, const GoldParseC* gold) except -1: + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: # When we break, we Reduce all of the words on the stack. cdef int cost = 0 # Number of deps between S0...Sn and N0...Nn - cdef int i, B_i, S_i + cdef int i, j, B_i, S_i for i in range(s.buffer_length()): B_i = s.B(i) for j in range(s.stack_depth()): @@ -282,7 +282,7 @@ cdef class Break: return cost @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 @@ -411,18 +411,17 @@ cdef class ArcEager(TransitionSystem): cdef int* labels = gold.c.labels cdef int* heads = gold.c.heads - self.set_valid(self._is_valid, stcls) n_gold = 0 for i in range(self.n_moves): - if not self._is_valid[i]: - output[i] = 9000 - else: + if self.c[i].is_valid(stcls, self.c[i].label): move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) n_gold += output[i] == 0 + else: + output[i] = 9000 assert n_gold >= 1 cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index adb093969..d9bd2b3e6 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -16,16 +16,16 @@ cdef struct Transition: weight_t score - bint (*is_valid)(StateClass state, int label) except -1 - int (*get_cost)(StateClass state, const GoldParseC* gold, int label) except -1 - int (*do)(StateClass state, int label) except -1 + bint (*is_valid)(StateClass state, int label) nogil + int (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil + int (*do)(StateClass state, int label) nogil -ctypedef int (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 -ctypedef int (*move_cost_func_t)(StateClass state, const GoldParseC* gold) except -1 -ctypedef int (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) except -1 +ctypedef int (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil +ctypedef int (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil +ctypedef int (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil -ctypedef int (*do_func_t)(StateClass state, int label) except -1 +ctypedef int (*do_func_t)(StateClass state, int label) nogil cdef class TransitionSystem: From 90a3add8d71151f1b7a38f47577f829b8cdf790a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 06:57:13 +0200 Subject: [PATCH 16/75] * Require thinc 2.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2391c1c3f..498a72b15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython cymem == 1.11 pathlib preshed == 0.37 -thinc == 1.76 +thinc == 2.0 murmurhash == 0.24 unidecode numpy From 18cc326dc080949d36df85e7ed26191e73e469b0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 06:57:41 +0200 Subject: [PATCH 17/75] * Bug fixes to ner.pyx --- spacy/syntax/ner.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 833d1f299..c7fa88342 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -180,8 +180,8 @@ cdef class Begin: @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: - cdef int g_act = gold.ner[s.i].move - cdef int g_tag = gold.ner[s.i].label + cdef int g_act = gold.ner[s.B(0)].move + cdef int g_tag = gold.ner[s.B(0)].label if g_act == MISSING: return 0 @@ -210,9 +210,9 @@ cdef class In: @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: move = IN - cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT - cdef int g_act = gold.ner[s.i].move - cdef int g_tag = gold.ner[s.i].label + cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.length else OUT + cdef int g_act = gold.ner[s.B(0)].move + cdef int g_tag = gold.ner[s.B(0)].label cdef bint is_sunk = _entity_is_sunk(s, gold.ner) if g_act == MISSING: @@ -251,8 +251,8 @@ cdef class Last: cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: move = LAST - cdef int g_act = gold.ner[s.i].move - cdef int g_tag = gold.ner[s.i].label + cdef int g_act = gold.ner[s.B(0)].move + cdef int g_tag = gold.ner[s.B(0)].label if g_act == MISSING: return 0 @@ -290,8 +290,8 @@ cdef class Unit: @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: - cdef int g_act = gold.ner[s.i].move - cdef int g_tag = gold.ner[s.i].label + cdef int g_act = gold.ner[s.B(0)].move + cdef int g_tag = gold.ner[s.B(0)].label if g_act == MISSING: return 0 @@ -319,8 +319,8 @@ cdef class Out: @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: - cdef int g_act = gold.ner[s.i].move - cdef int g_tag = gold.ner[s.i].label + cdef int g_act = gold.ner[s.B(0)].move + cdef int g_tag = gold.ner[s.B(0)].label if g_act == MISSING: From e9aaecc6195ee159c1d908e15168c3ebe87d121b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 06:58:27 +0200 Subject: [PATCH 18/75] * Remove from_struct method from StateClass --- spacy/syntax/stateclass.pxd | 6 ------ spacy/syntax/stateclass.pyx | 11 ----------- 2 files changed, 17 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 1d6a58d29..141c96195 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -4,8 +4,6 @@ from cymem.cymem cimport Pool from ..structs cimport TokenC, Entity -from ._state cimport State - from ..vocab cimport EMPTY_LEXEME @@ -30,8 +28,6 @@ cdef class StateClass: self._buffer[i] = i return self - cdef int from_struct(self, const State* state) except -1 - cdef int S(self, int i) nogil cdef int B(self, int i) nogil @@ -90,5 +86,3 @@ cdef class StateClass: cdef void set_sent_end(self, int i) nogil cdef void clone(self, StateClass src) nogil - - diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index c7568f7d0..5b3660a94 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -21,17 +21,6 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME - cdef int from_struct(self, const State* state) except -1: - self._s_i = state.stack_len - self._b_i = state.i - self._e_i = state.ents_len - memcpy(self._sent, state.sent, sizeof(TokenC) * self.length) - cdef int i - for i in range(state.stack_len): - self._stack[self._s_i - (i+1)] = state.stack[-i] - for i in range(state.ents_len): - self._ents[i] = state.ent[-i] - cdef int S(self, int i) nogil: if i >= self._s_i: return -1 From e2f9a80713ded5a862545cbe9b1f82daf1cca38b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 07:09:17 +0200 Subject: [PATCH 19/75] * Remove old _state imports --- setup.py | 2 +- spacy/syntax/_parse_features.pxd | 1 - spacy/syntax/_parse_features.pyx | 9 +------ spacy/syntax/arc_eager.pxd | 2 -- spacy/syntax/arc_eager.pyx | 7 ------ spacy/syntax/ner.pyx | 40 ++++++++++++++++---------------- 6 files changed, 22 insertions(+), 39 deletions(-) diff --git a/setup.py b/setup.py index 76a8d64f1..194648f95 100644 --- a/setup.py +++ b/setup.py @@ -152,7 +152,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.morphology', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', - 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', + 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.orth', diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd index 00aff5f9d..4067587ad 100644 --- a/spacy/syntax/_parse_features.pxd +++ b/spacy/syntax/_parse_features.pxd @@ -1,6 +1,5 @@ from thinc.typedefs cimport atom_t -from ._state cimport State from .stateclass cimport StateClass diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 64a83390c..aeb764e50 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -12,13 +12,6 @@ from libc.string cimport memset from itertools import combinations from ..tokens cimport TokenC -from ._state cimport State -from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2 -from ._state cimport get_p2, get_p1 -from ._state cimport get_e0, get_e1 -from ._state cimport has_head, get_left, get_right -from ._state cimport count_left_kids, count_right_kids - from .stateclass cimport StateClass @@ -58,7 +51,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: # the source that are set to 1. context[4] = token.lex.cluster & 15 context[5] = token.lex.cluster & 63 - context[6] = token.dep if has_head(token) else 0 + context[6] = token.dep if token.head != 0 else 0 context[7] = token.lex.prefix context[8] = token.lex.suffix context[9] = token.lex.shape diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 81b26f703..1390d949c 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -2,8 +2,6 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t - -from ._state cimport State from .stateclass cimport StateClass from .transition_system cimport TransitionSystem, Transition diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 03a89cda4..1fdab2f1f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -4,13 +4,6 @@ from __future__ import unicode_literals import ctypes import os -from ._state cimport State -from ._state cimport has_head, get_idx, get_s0, get_n0, get_left, get_right -from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep -from ._state cimport head_in_buffer, children_in_buffer -from ._state cimport head_in_stack, children_in_stack -from ._state cimport count_left_kids - from ..structs cimport TokenC from .transition_system cimport do_func_t, get_cost_func_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index c7fa88342..c1005adae 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -36,7 +36,7 @@ MOVE_NAMES[OUT] = 'O' cdef do_func_t[N_MOVES] do_funcs -cdef bint _entity_is_sunk(StateClass st, Transition* golds) except -1: +cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: if not st.entity_is_open(): return False @@ -154,32 +154,32 @@ cdef class BiluoPushDown(TransitionSystem): cdef class Missing: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return False @staticmethod - cdef int transition(StateClass s, int label) except -1: - raise NotImplementedError + cdef int transition(StateClass s, int label) nogil: + pass @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: return 9000 cdef class Begin: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return label != 0 and not st.entity_is_open() @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.open_ent(label) st.set_ent_tag(st.B(0), 3, label) st.push() st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef int g_act = gold.ner[s.B(0)].move cdef int g_tag = gold.ner[s.B(0)].label @@ -198,17 +198,17 @@ cdef class Begin: cdef class In: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.set_ent_tag(st.B(0), 1, label) st.push() st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: move = IN cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.length else OUT cdef int g_act = gold.ner[s.B(0)].move @@ -238,17 +238,17 @@ cdef class In: cdef class Last: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.close_ent() st.push() st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: move = LAST cdef int g_act = gold.ner[s.B(0)].move @@ -277,11 +277,11 @@ cdef class Last: cdef class Unit: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return label != 0 and not st.entity_is_open() @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.open_ent(label) st.close_ent() st.set_ent_tag(st.B(0), 3, label) @@ -289,7 +289,7 @@ cdef class Unit: st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef int g_act = gold.ner[s.B(0)].move cdef int g_tag = gold.ner[s.B(0)].label @@ -308,17 +308,17 @@ cdef class Unit: cdef class Out: @staticmethod - cdef bint is_valid(StateClass st, int label) except -1: + cdef bint is_valid(StateClass st, int label) nogil: return not st.entity_is_open() @staticmethod - cdef int transition(StateClass st, int label) except -1: + cdef int transition(StateClass st, int label) nogil: st.set_ent_tag(st.B(0), 2, 0) st.push() st.pop() @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) except -1: + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef int g_act = gold.ner[s.B(0)].move cdef int g_tag = gold.ner[s.B(0)].label From abd07c067ad18b1d7e7e54d615988f4641dc2bb4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 07:22:33 +0200 Subject: [PATCH 20/75] * Inline B and S methods on stateclass --- spacy/syntax/stateclass.pxd | 11 +++++++++-- spacy/syntax/stateclass.pyx | 10 ---------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 141c96195..be0380ecb 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -28,8 +28,15 @@ cdef class StateClass: self._buffer[i] = i return self - cdef int S(self, int i) nogil - cdef int B(self, int i) nogil + cdef inline int S(self, int i) nogil: + if i >= self._s_i: + return -1 + return self._stack[self._s_i - (i+1)] + + cdef inline int B(self, int i) nogil: + if (i + self._b_i) >= self.length: + return -1 + return self._buffer[self._b_i + i] cdef int H(self, int i) nogil cdef int E(self, int i) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 5b3660a94..4d7cc0fea 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -21,16 +21,6 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME - cdef int S(self, int i) nogil: - if i >= self._s_i: - return -1 - return self._stack[self._s_i - (i+1)] - - cdef int B(self, int i) nogil: - if (i + self._b_i) >= self.length: - return -1 - return self._buffer[self._b_i + i] - cdef int H(self, int i) nogil: if i < 0 or i >= self.length: return -1 From f7c8069e6546bbaaff3f86db3bac9d18239ee94e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 10:12:17 +0200 Subject: [PATCH 21/75] * Fix bug in distance feature --- spacy/syntax/_parse_features.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index aeb764e50..81c1b8dfc 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -79,12 +79,11 @@ cdef int fill_context(atom_t* ctxt, StateClass st) except -1: fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) - # TODO fill_token(&ctxt[E0w], st.E_(0)) fill_token(&ctxt[E1w], st.E_(1)) if st.stack_depth() >= 1 and not st.eol(): - ctxt[dist] = min(st.S(0) - st.B(0), 5) # TODO: This is backwards!! + ctxt[dist] = min(st.B(0) - st.E(0), 5) else: ctxt[dist] = 0 ctxt[N0lv] = min(st.n_L(st.B(0)), 5) From 7bf6b7de3ecf9a4fa96ca52a8e97700a448343fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 10:13:03 +0200 Subject: [PATCH 22/75] * Add unshift action to StateClass, and track which moves have been shifted --- spacy/syntax/stateclass.pxd | 5 ++++- spacy/syntax/stateclass.pyx | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index be0380ecb..e94e74f0c 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -11,10 +11,12 @@ cdef class StateClass: cdef Pool mem cdef int* _stack cdef int* _buffer + cdef bint* shifted cdef TokenC* _sent cdef Entity* _ents cdef TokenC _empty_token cdef int length + cdef bint at_sent_end cdef int _s_i cdef int _b_i cdef int _e_i @@ -64,7 +66,6 @@ cdef class StateClass: cdef bint is_final(self) nogil cdef bint has_head(self, int i) nogil - cdef int n_L(self, int i) nogil @@ -79,6 +80,8 @@ cdef class StateClass: cdef void push(self) nogil cdef void pop(self) nogil + + cdef void unshift(self) nogil cdef void add_arc(self, int head, int child, int label) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 4d7cc0fea..23098b70d 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -9,6 +9,7 @@ cdef class StateClass: cdef Pool mem = Pool() self._buffer = mem.alloc(length, sizeof(int)) self._stack = mem.alloc(length, sizeof(int)) + self.shifted = mem.alloc(length, sizeof(bint)) self._sent = mem.alloc(length, sizeof(TokenC)) self._ents = mem.alloc(length, sizeof(Entity)) self.mem = mem @@ -103,10 +104,10 @@ cdef class StateClass: return self._s_i <= 0 cdef bint eol(self) nogil: - return self._b_i >= self.length + return self._b_i >= self.length or self.at_sent_end cdef bint is_final(self) nogil: - return self.eol() and self.stack_depth() <= 1 + return self.stack_depth() <= 1 and self.buffer_length() == 0 cdef bint has_head(self, int i) nogil: return self.safe_get(i).head != 0 @@ -133,12 +134,18 @@ cdef class StateClass: cdef void push(self) nogil: self._stack[self._s_i] = self.B(0) + self.shifted[self.B(0)] = True self._s_i += 1 self._b_i += 1 cdef void pop(self) nogil: self._s_i -= 1 + cdef void unshift(self) nogil: + self._b_i -= 1 + self._buffer[self._b_i] = self.S(0) + self._s_i -= 1 + cdef void add_arc(self, int head, int child, int label) nogil: if self.has_head(child): self.del_arc(self.H(child), child) @@ -190,12 +197,12 @@ cdef class StateClass: def print_state(self, words): words = list(words) + ['_'] - top = words[self.S(0)] + '_%d' % self.H(self.S(0)) - second = words[self.S(1)] + '_%d' % self.H(self.S(1)) - third = words[self.S(2)] + '_%d' % self.H(self.S(2)) + top = words[self.S(0)] + '_%d' % self.S_(0).head + second = words[self.S(1)] + '_%d' % self.S_(1).head + third = words[self.S(2)] + '_%d' % self.S_(2).head n0 = words[self.B(0)] n1 = words[self.B(1)] - return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) + return ' '.join((str(self.buffer_length()), str(self.stack_depth()), third, second, top, '|', n0, n1)) # From https://en.wikipedia.org/wiki/Hamming_weight From aa9625f6881e37b3508f9d8eda74008545501b85 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 10:15:56 +0200 Subject: [PATCH 23/75] * Do non-monotonic Unshift. Every word can be shifted at most 1 time. When the Reduce move is used, if S0 has no head, we put the word back on the buffer. Gets 86.4 on nw 1k with gold pre-proc. Break transition not yet implemented for this. --- spacy/syntax/arc_eager.pyx | 74 ++++++++++++++------------------------ 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 1fdab2f1f..99fe7f943 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -19,7 +19,7 @@ from .stateclass cimport StateClass DEF NON_MONOTONIC = True -DEF USE_BREAK = True +DEF USE_BREAK = False cdef weight_t MIN_SCORE = -90000 @@ -70,12 +70,14 @@ cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: break return cost + cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: if arc_is_gold(gold, head, child): return 0 elif stcls.H(child) == gold.heads[child]: return 1 - elif gold.heads[child] >= stcls.B(0): + # Head in buffer + elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: return 1 else: return 0 @@ -110,13 +112,10 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: cdef class Shift: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return not st.eol() + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] @staticmethod cdef int transition(StateClass state, int label) nogil: - # Set the dep label, in case we need it after we reduce - if NON_MONOTONIC: - state._sent[state.B(0)].dep = label state.push() @staticmethod @@ -135,27 +134,25 @@ cdef class Shift: cdef class Reduce: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - if NON_MONOTONIC: - return st.stack_depth() >= 2 #and not missing_brackets(s) - else: - return st.stack_depth() >= 2 and st.has_head(st.S(0)) + return st.stack_depth() >= 2 @staticmethod cdef int transition(StateClass st, int label) nogil: - if NON_MONOTONIC and not st.has_head(st.S(0)) and st.stack_depth() >= 2: - st.add_arc(st.S(1), st.S(0), st.S_(0).dep) - st.pop() + if st.has_head(st.S(0)): + st.pop() + else: + st.unshift() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - if NON_MONOTONIC: - return pop_cost(s, gold, s.S(0)) + cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: + if st.shifted[st.S(0)]: + return pop_cost(st, gold, st.S(0)) else: - return children_in_buffer(s, s.S(0), gold.heads) + return 0 @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -166,18 +163,16 @@ cdef class LeftArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: if NON_MONOTONIC: - return st.stack_depth() >= 1 #and not missing_brackets(s) + return st.stack_depth() >= 1 and st.buffer_length() >= 1 #and not missing_brackets(s) else: - return st.stack_depth() >= 1 and not st.has_head(st.S(0)) + return st.stack_depth() >= 1 and st.buffer_length() >= 1 and not st.has_head(st.S(0)) @staticmethod cdef int transition(StateClass st, int label) nogil: - # Interpret left-arcs from EOL as attachment to root - if st.eol(): - st.add_arc(st.S(0), st.S(0), label) - else: - st.add_arc(st.B(0), st.S(0), label) + st.add_arc(st.B(0), st.S(0), label) st.pop() + if st.empty(): + st.push() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -198,7 +193,7 @@ cdef class LeftArc: cdef class RightArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 1 and not st.eol() + return st.stack_depth() >= 1 and st.buffer_length() >= 1 @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -213,6 +208,8 @@ cdef class RightArc: cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: if arc_is_gold(gold, s.S(0), s.B(0)): return 0 + elif s.shifted[s.B(0)]: + return push_cost(s, gold, s.B(0)) else: return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) @@ -231,30 +228,13 @@ cdef class Break: return False elif st.stack_depth() < 1: return False - elif NON_MONOTONIC: - return True else: - # In the Break transition paper, they have this constraint that prevents - # Break if stack is disconnected. But, if we're doing non-monotonic parsing, - # we prefer to relax this constraint. This is helpful in parsing whole - # documents, because then we don't get stuck with words on the stack. - seen_headless = False - for i in range(st.stack_depth()): - if not st.has_head(st.S(i)): - if seen_headless: - return False - else: - seen_headless = True - # TODO: Constituency constraints return True @staticmethod cdef int transition(StateClass st, int label) nogil: - st.set_sent_end(st.B(0)-1) - while not st.empty(): - if not st.has_head(st.S(0)): - st._sent[st.S(0)].dep = label - st.pop() + #st.set_sent_end() + pass @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -262,9 +242,9 @@ cdef class Break: @staticmethod cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - # When we break, we Reduce all of the words on the stack. + # When we break, we can't reach any arcs between stack and buffer + # So cost is number of deps between S0...Sn and B0...Nn cdef int cost = 0 - # Number of deps between S0...Sn and N0...Nn cdef int i, j, B_i, S_i for i in range(s.buffer_length()): B_i = s.B(i) @@ -432,7 +412,7 @@ cdef class ArcEager(TransitionSystem): best = self.c[i] score = scores[i] assert best.clas < self.n_moves - assert score > MIN_SCORE + assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length()) # Label Shift moves with the best Right-Arc label, for non-monotonic # actions if best.move == SHIFT: From bb09b5d91a7a6538cce37acea580d4e8b07b34b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 11:33:09 +0200 Subject: [PATCH 24/75] * Fix shifted bit vector in stateclass --- should reflect whether the word has been *unshifted*. --- spacy/syntax/stateclass.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 23098b70d..be3ccf5aa 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -134,7 +134,6 @@ cdef class StateClass: cdef void push(self) nogil: self._stack[self._s_i] = self.B(0) - self.shifted[self.B(0)] = True self._s_i += 1 self._b_i += 1 @@ -145,6 +144,7 @@ cdef class StateClass: self._b_i -= 1 self._buffer[self._b_i] = self.S(0) self._s_i -= 1 + self.shifted[self.B(0)] = True cdef void add_arc(self, int head, int child, int label) nogil: if self.has_head(child): From b7b18c279d2a594983319f16f92507643abdd34a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 11:33:39 +0200 Subject: [PATCH 25/75] * Fix Reduce oracle. Getting 86.35 --- spacy/syntax/arc_eager.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 99fe7f943..222388b69 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -149,10 +149,7 @@ cdef class Reduce: @staticmethod cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: - if st.shifted[st.S(0)]: - return pop_cost(st, gold, st.S(0)) - else: - return 0 + return pop_cost(st, gold, st.S(0)) @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -233,8 +230,9 @@ cdef class Break: @staticmethod cdef int transition(StateClass st, int label) nogil: - #st.set_sent_end() - pass + #st.set_sent_start() + while st.stack_depth() >= 2 and st.buffer_length() == 0: + Reduce.transition(st, -1) @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: From 495f528709f2988b8d62d634c3ba4f3a3f3f60cf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 12:33:55 +0200 Subject: [PATCH 26/75] * Add support for sentence breaks in stateclass --- spacy/syntax/stateclass.pxd | 4 ++-- spacy/syntax/stateclass.pyx | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index e94e74f0c..dcc57474c 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -16,10 +16,10 @@ cdef class StateClass: cdef Entity* _ents cdef TokenC _empty_token cdef int length - cdef bint at_sent_end cdef int _s_i cdef int _b_i cdef int _e_i + cdef int _break @staticmethod cdef inline StateClass init(const TokenC* sent, int length): @@ -93,6 +93,6 @@ cdef class StateClass: cdef void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil - cdef void set_sent_end(self, int i) nogil + cdef void set_break(self, int i) nogil cdef void clone(self, StateClass src) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index be3ccf5aa..1e4f3b3f0 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -14,6 +14,7 @@ cdef class StateClass: self._ents = mem.alloc(length, sizeof(Entity)) self.mem = mem self.length = length + self._break = length self._s_i = 0 self._b_i = 0 self._e_i = 0 @@ -104,10 +105,10 @@ cdef class StateClass: return self._s_i <= 0 cdef bint eol(self) nogil: - return self._b_i >= self.length or self.at_sent_end + return self._b_i >= self._break cdef bint is_final(self) nogil: - return self.stack_depth() <= 1 and self.buffer_length() == 0 + return self.stack_depth() <= 1 and self._b_i >= self.length cdef bint has_head(self, int i) nogil: return self.safe_get(i).head != 0 @@ -130,12 +131,14 @@ cdef class StateClass: return self._s_i cdef int buffer_length(self) nogil: - return self.length - self._b_i + return self._break - self._b_i cdef void push(self) nogil: self._stack[self._s_i] = self.B(0) self._s_i += 1 self._b_i += 1 + if self._b_i >= self._break: + self._break = self.length cdef void pop(self) nogil: self._s_i -= 1 @@ -182,9 +185,10 @@ cdef class StateClass: self._sent[i].ent_iob = ent_iob self._sent[i].ent_type = ent_type - cdef void set_sent_end(self, int i) nogil: + cdef void set_break(self, int i) nogil: if 0 <= i < self.length: self._sent[i].sent_end = True + self._break = i cdef void clone(self, StateClass src) nogil: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) From afd77a529baee6bee0ab61e5ffb405b7500fc43e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 14:08:30 +0200 Subject: [PATCH 27/75] * Prepare for break transition, with fast-forwarding. 86.5 on 1k nw gold preproc --- spacy/syntax/arc_eager.pyx | 35 +++++++++++++++++++---------------- spacy/syntax/stateclass.pxd | 4 ++++ spacy/syntax/stateclass.pyx | 33 ++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 21 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 222388b69..1bd7c00f5 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -112,11 +112,12 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: cdef class Shift: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return st.buffer_length() >= 2 and not st.shifted[st.B(0)] + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_end @staticmethod - cdef int transition(StateClass state, int label) nogil: - state.push() + cdef int transition(StateClass st, int label) nogil: + st.push() + st.fast_forward() @staticmethod cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: @@ -142,6 +143,7 @@ cdef class Reduce: st.pop() else: st.unshift() + st.fast_forward() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -159,17 +161,13 @@ cdef class Reduce: cdef class LeftArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - if NON_MONOTONIC: - return st.stack_depth() >= 1 and st.buffer_length() >= 1 #and not missing_brackets(s) - else: - return st.stack_depth() >= 1 and st.buffer_length() >= 1 and not st.has_head(st.S(0)) + return not st.B_(0).sent_end @staticmethod cdef int transition(StateClass st, int label) nogil: st.add_arc(st.B(0), st.S(0), label) st.pop() - if st.empty(): - st.push() + st.fast_forward() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -190,12 +188,13 @@ cdef class LeftArc: cdef class RightArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 1 and st.buffer_length() >= 1 + return not st.B_(0).sent_end @staticmethod cdef int transition(StateClass st, int label) nogil: st.add_arc(st.S(0), st.B(0), label) st.push() + st.fast_forward() @staticmethod cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -221,7 +220,7 @@ cdef class Break: cdef int i if not USE_BREAK: return False - elif st.eol(): + elif st.at_break(): return False elif st.stack_depth() < 1: return False @@ -230,9 +229,13 @@ cdef class Break: @staticmethod cdef int transition(StateClass st, int label) nogil: - #st.set_sent_start() - while st.stack_depth() >= 2 and st.buffer_length() == 0: - Reduce.transition(st, -1) + st.set_break(st.B(0)) + while st.stack_depth() >= 2 and st.has_head(st.S(0)): + st.pop() + if st.stack_depth() == 1: + st.pop() + else: + st.unshift() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -338,7 +341,7 @@ cdef class ArcEager(TransitionSystem): return t cdef int initialize_state(self, StateClass st) except -1: - st.push() + st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: cdef int root_label = self.strings['ROOT'] @@ -410,7 +413,7 @@ cdef class ArcEager(TransitionSystem): best = self.c[i] score = scores[i] assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length()) + assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) # Label Shift moves with the best Right-Arc label, for non-monotonic # actions if best.move == SHIFT: diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index dcc57474c..54b039208 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -62,6 +62,8 @@ cdef class StateClass: cdef bint entity_is_open(self) nogil cdef bint eol(self) nogil + + cdef bint at_break(self) nogil cdef bint is_final(self) nogil @@ -96,3 +98,5 @@ cdef class StateClass: cdef void set_break(self, int i) nogil cdef void clone(self, StateClass src) nogil + + cdef void fast_forward(self) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 1e4f3b3f0..8b6abfdab 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -14,7 +14,7 @@ cdef class StateClass: self._ents = mem.alloc(length, sizeof(Entity)) self.mem = mem self.length = length - self._break = length + self._break = -1 self._s_i = 0 self._b_i = 0 self._e_i = 0 @@ -105,10 +105,13 @@ cdef class StateClass: return self._s_i <= 0 cdef bint eol(self) nogil: - return self._b_i >= self._break + return self.buffer_length() == 0 + + cdef bint at_break(self) nogil: + return self._break != -1 cdef bint is_final(self) nogil: - return self.stack_depth() <= 1 and self._b_i >= self.length + return self.stack_depth() <= 0 and self._b_i >= self.length cdef bint has_head(self, int i) nogil: return self.safe_get(i).head != 0 @@ -131,14 +134,17 @@ cdef class StateClass: return self._s_i cdef int buffer_length(self) nogil: - return self._break - self._b_i + if self._break != -1: + return self._break - self._b_i + else: + return self.length - self._b_i cdef void push(self) nogil: self._stack[self._s_i] = self.B(0) self._s_i += 1 self._b_i += 1 if self._b_i >= self._break: - self._break = self.length + self._break = -1 cdef void pop(self) nogil: self._s_i -= 1 @@ -149,6 +155,23 @@ cdef class StateClass: self._s_i -= 1 self.shifted[self.B(0)] = True + cdef void fast_forward(self) nogil: + while self.buffer_length() == 0 or self.stack_depth() == 0: + if self.buffer_length() == 1 and self.stack_depth() == 0: + self.push() + self.pop() + elif self.buffer_length() == 0 and self.stack_depth() == 1: + self.pop() + elif self.buffer_length() == 0 and self.stack_depth() >= 2: + if self.has_head(self.S(0)): + self.pop() + else: + self.unshift() + elif self.buffer_length() >= 2 and self.stack_depth() == 0: + self.push() + else: + break + cdef void add_arc(self, int head, int child, int label) nogil: if self.has_head(child): self.del_arc(self.H(child), child) From 15e177d7a1ad18137b3ff8f62b4e55b759594899 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jun 2015 01:50:23 +0200 Subject: [PATCH 28/75] * Fixes to unshift/fast-forward strategy. Getting 91.55 greedy on NW dev, gold preproc --- spacy/syntax/arc_eager.pxd | 7 ++++++ spacy/syntax/arc_eager.pyx | 48 +++++++++++++++++++++---------------- spacy/syntax/ner.pyx | 1 - spacy/syntax/parser.pyx | 22 +++++++++++++++-- spacy/syntax/stateclass.pyx | 13 +++++----- 5 files changed, 61 insertions(+), 30 deletions(-) diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 1390d949c..5b7a6e3db 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -5,6 +5,13 @@ from thinc.typedefs cimport weight_t from .stateclass cimport StateClass from .transition_system cimport TransitionSystem, Transition +from ..gold cimport GoldParseC + cdef class ArcEager(TransitionSystem): pass + + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil + diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 1bd7c00f5..e5257b18a 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -222,20 +222,20 @@ cdef class Break: return False elif st.at_break(): return False + elif st.B(0) == 0: + return False elif st.stack_depth() < 1: return False + elif (st.S(0) + 1) != st.B(0): + # Must break at the token boundary + return False else: return True @staticmethod cdef int transition(StateClass st, int label) nogil: st.set_break(st.B(0)) - while st.stack_depth() >= 2 and st.has_head(st.S(0)): - st.pop() - if st.stack_depth() == 1: - st.pop() - else: - st.unshift() + st.fast_forward() @staticmethod cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: @@ -243,32 +243,37 @@ cdef class Break: @staticmethod cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - # When we break, we can't reach any arcs between stack and buffer - # So cost is number of deps between S0...Sn and B0...Nn - cdef int cost = 0 - cdef int i, j, B_i, S_i - for i in range(s.buffer_length()): - B_i = s.B(i) - for j in range(s.stack_depth()): - S_i = s.S(j) - cost += gold.heads[B_i] == S_i - cost += gold.heads[S_i] == B_i - return cost + # Check for sentence boundary --- if it's here, we can't have any deps + # between stack and buffer, so rest of action is irrelevant. + s0_root = _get_root(s.S(0), gold) + b0_root = _get_root(s.B(0), gold) + if s0_root == -1 or b0_root == -1 or s0_root != b0_root: + return 0 + else: + return 1 @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 +cdef int _get_root(int word, const GoldParseC* gold) nogil: + while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0: + word = gold.heads[word] + if gold.labels[word] == -1: + return -1 + else: + return word + cdef class ArcEager(TransitionSystem): @classmethod def get_labels(cls, gold_parses): - move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {}, - LEFT: {'ROOT': True}, BREAK: {'ROOT': True}} + move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'root': True}, + LEFT: {'root': True}, BREAK: {'root': True}} for raw_text, sents in gold_parses: for (ids, words, tags, heads, labels, iob), ctnts in sents: for child, head, label in zip(ids, heads, labels): - if label != 'ROOT': + if label != 'root': if head < child: move_labels[RIGHT][label] = True elif head > child: @@ -341,6 +346,9 @@ cdef class ArcEager(TransitionSystem): return t cdef int initialize_state(self, StateClass st) except -1: + # Ensure sent_end is set to 0 throughout + for i in range(st.length): + st._sent[i].sent_end = False st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index c1005adae..c27bae1f2 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -322,7 +322,6 @@ cdef class Out: cdef int g_act = gold.ner[s.B(0)].move cdef int g_tag = gold.ner[s.B(0)].label - if g_act == MISSING: return 0 elif g_act == BEGIN: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4be1046bc..30d5b5f92 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -36,6 +36,8 @@ from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore from .arc_eager cimport TransitionSystem, Transition +from .arc_eager cimport push_cost, arc_cost + from .transition_system import OracleError from ..gold cimport GoldParse @@ -95,11 +97,12 @@ cdef class Parser: cdef Transition guess words = [w.orth_ for w in tokens] while not stcls.is_final(): - #print stcls.print_state(words) fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) + #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) guess.do(stcls, guess.label) + assert stcls._s_i >= 0 self.moves.finalize_state(stcls) tokens.set_parse(stcls._sent) @@ -128,12 +131,27 @@ cdef class Parser: cdef atom_t[CONTEXT_SIZE] context loss = 0 words = [w.orth_ for w in tokens] + history = [] while not stcls.is_final(): + assert stcls._s_i >= 0 fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) + try: + best = self.moves.best_gold(scores, stcls, gold) + except: + history.append((self.moves.move_name(guess.move, guess.label), '!', stcls.print_state(words))) + for i, word in enumerate(words): + print gold.orig_annot[i] + print '\n'.join('\t'.join(s) for s in history) + print words[gold.c.heads[stcls.S(0)]] + print words[gold.c.heads[stcls.B(0)]] + print push_cost(stcls, &gold.c, stcls.B(0)) + print arc_cost(stcls, &gold.c, stcls.S(0), stcls.B(0)) + self.moves.set_valid(self.moves._is_valid, stcls) + raise cost = guess.get_cost(stcls, &gold.c, guess.label) + history.append((self.moves.move_name(guess.move, guess.label), str(cost), stcls.print_state(words))) self.model.update(context, guess.clas, best.clas, cost) guess.do(stcls, guess.label) loss += cost diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 8b6abfdab..5c1895a1e 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -143,7 +143,7 @@ cdef class StateClass: self._stack[self._s_i] = self.B(0) self._s_i += 1 self._b_i += 1 - if self._b_i >= self._break: + if self._b_i > self._break: self._break = -1 cdef void pop(self) nogil: @@ -167,7 +167,7 @@ cdef class StateClass: self.pop() else: self.unshift() - elif self.buffer_length() >= 2 and self.stack_depth() == 0: + elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0: self.push() else: break @@ -208,10 +208,9 @@ cdef class StateClass: self._sent[i].ent_iob = ent_iob self._sent[i].ent_type = ent_type - cdef void set_break(self, int i) nogil: - if 0 <= i < self.length: - self._sent[i].sent_end = True - self._break = i + cdef void set_break(self, int _) nogil: + self._sent[self.B(0)].sent_end = True + self._break = self._b_i cdef void clone(self, StateClass src) nogil: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) @@ -229,7 +228,7 @@ cdef class StateClass: third = words[self.S(2)] + '_%d' % self.S_(2).head n0 = words[self.B(0)] n1 = words[self.B(1)] - return ' '.join((str(self.buffer_length()), str(self.stack_depth()), third, second, top, '|', n0, n1)) + return ' '.join((str(self.buffer_length()), str(self.B_(0).sent_end), str(self._b_i), str(self._break), str(self.length), str(self.stack_depth()), third, second, top, '|', n0, n1)) # From https://en.wikipedia.org/wiki/Hamming_weight From b643cb3d5c777a02343bc115b2cfc2e74fe2f68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jun 2015 02:42:08 +0200 Subject: [PATCH 29/75] * Allow training documents to be filtered in gold.pyx --- spacy/gold.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index fe53fdb8a..f3ed33d10 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -121,7 +121,7 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] -def read_json_file(loc): +def read_json_file(loc, docs_filter=None): print loc if path.isdir(loc): for filename in os.listdir(loc): @@ -130,6 +130,8 @@ def read_json_file(loc): with open(loc) as file_: docs = ujson.load(file_) for doc in docs: + if docs_filter is not None and not docs_filter(doc): + continue paragraphs = [] for paragraph in doc['paragraphs']: sents = [] From 75289b4761fa880b45f94c52c43e2f1da623e89d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2015 22:55:55 +0200 Subject: [PATCH 30/75] * Don't refuse to parse single token sentences, incase some transition system needs them, e.g. single word entity. Instead fix error in _init_state. --- spacy/syntax/parser.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 30d5b5f92..2a86c87f8 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -74,8 +74,6 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if tokens.length == 0: - return 0 if self.cfg.get('beam_width', 1) < 1: self._greedy_parse(tokens) else: @@ -108,9 +106,9 @@ cdef class Parser: cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) + words = [w.orth_ for w in tokens] beam.initialize(_init_state, tokens.length, tokens.data) beam.check_done(_check_final_state, NULL) - words = [w.orth_ for w in tokens] while not beam.is_done: self._advance_beam(beam, None, False, words) state = beam.at(0) @@ -231,7 +229,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef StateClass st = StateClass.init(tokens, length) - st.push() + st.fast_forward() Py_INCREF(st) return st From 399f15fbdf2d04bd96f1ce90f557ccda644dd563 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 00:28:37 +0200 Subject: [PATCH 31/75] * Add flag to toggle handling of multi-root inputs without the Break transition. Clear up now unused best_valid stuff. --- spacy/syntax/arc_eager.pyx | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index e5257b18a..b0c3819b6 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -20,6 +20,7 @@ from .stateclass cimport StateClass DEF NON_MONOTONIC = True DEF USE_BREAK = False +DEF USE_ROOT_ARC_SEGMENT = True cdef weight_t MIN_SCORE = -90000 @@ -86,7 +87,7 @@ cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: if gold.labels[child] == -1: return True - elif _is_gold_root(gold, head) and _is_gold_root(gold, child): + elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child): return True elif gold.heads[child] == head: return True @@ -352,10 +353,14 @@ cdef class ArcEager(TransitionSystem): st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: - cdef int root_label = self.strings['ROOT'] + cdef int root_label = self.strings['root'] for i in range(st.length): if st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = root_label + # If we're not using the Break transition, we segment via root-labelled + # arcs between the root words. + elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label: + st._sent[i].head = 0 cdef int set_valid(self, bint* output, StateClass stcls) except -1: cdef bint[N_MOVES] is_valid @@ -422,12 +427,4 @@ cdef class ArcEager(TransitionSystem): score = scores[i] assert best.clas < self.n_moves assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - # Label Shift moves with the best Right-Arc label, for non-monotonic - # actions - if best.move == SHIFT: - score = MIN_SCORE - for i in range(self.n_moves): - if self.c[i].move == RIGHT and scores[i] > score: - best.label = self.c[i].label - score = scores[i] return best From bdd07bf00075c7b5e2ae0c06aca10ffc67476980 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 17:44:03 +0200 Subject: [PATCH 32/75] * Fix Break oracle, but disable the Break transition for now, while we finalize the gold-standard experiments --- spacy/syntax/arc_eager.pyx | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b0c3819b6..7ac11fd46 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -20,7 +20,7 @@ from .stateclass cimport StateClass DEF NON_MONOTONIC = True DEF USE_BREAK = False -DEF USE_ROOT_ARC_SEGMENT = True +DEF USE_ROOT_ARC_SEGMENT = False cdef weight_t MIN_SCORE = -90000 @@ -69,6 +69,7 @@ cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: cost += gold.heads[target] == B_i if gold.heads[B_i] == B_i or gold.heads[B_i] < target: break + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 return cost @@ -244,14 +245,22 @@ cdef class Break: @staticmethod cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + cdef int S_i, B_i + for i in range(s.stack_depth()): + S_i = s.S(i) + for j in range(s.buffer_length()): + B_i = s.B(j) + cost += gold.heads[S_i] == B_i + cost += gold.heads[B_i] == S_i # Check for sentence boundary --- if it's here, we can't have any deps # between stack and buffer, so rest of action is irrelevant. s0_root = _get_root(s.S(0), gold) b0_root = _get_root(s.B(0), gold) - if s0_root == -1 or b0_root == -1 or s0_root != b0_root: - return 0 + if s0_root != b0_root or s0_root == -1 or b0_root == -1: + return cost else: - return 1 + return cost + 1 @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: From 763cbd23d5556fe6324dcdc4e45c859f7f0d1422 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 17:44:29 +0200 Subject: [PATCH 33/75] * Upd stateclass.print_state --- spacy/syntax/stateclass.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 5c1895a1e..b03f6ed16 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -228,7 +228,7 @@ cdef class StateClass: third = words[self.S(2)] + '_%d' % self.S_(2).head n0 = words[self.B(0)] n1 = words[self.B(1)] - return ' '.join((str(self.buffer_length()), str(self.B_(0).sent_end), str(self._b_i), str(self._break), str(self.length), str(self.stack_depth()), third, second, top, '|', n0, n1)) + return ' '.join((third, second, top, '|', n0, n1)) # From https://en.wikipedia.org/wiki/Hamming_weight From c500d72dc2e25f79d7f90433435019f0278f7087 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 17:45:31 +0200 Subject: [PATCH 34/75] * Temporarily disable NER, and wire up the verbose flag during training --- bin/parser/train.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 489d9259c..be9e997d4 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -48,7 +48,7 @@ def add_noise(orig, noise_level): return ''.join(_corrupt(c, noise_level) for c in orig) -def score_model(scorer, nlp, raw_text, annot_tuples): +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): if raw_text is None: tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) else: @@ -57,7 +57,7 @@ def score_model(scorer, nlp, raw_text, annot_tuples): nlp.entity(tokens) nlp.parser(tokens) gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=False) + scorer.score(tokens, gold, verbose=verbose) def _merge_sents(sents): @@ -78,7 +78,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, - beam_width=1): + beam_width=1, verbose=False): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -118,7 +118,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', for annot_tuples, ctnt in sents: if len(annot_tuples[1]) == 1: continue - score_model(scorer, nlp, raw_text, annot_tuples) + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) if raw_text is None: words = add_noise(annot_tuples[1], corruption_level) tokens = nlp.tokenizer.tokens_from_list(words) @@ -129,7 +130,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold = GoldParse(tokens, annot_tuples, make_projective=True) loss += nlp.parser.train(tokens, gold) - nlp.entity.train(tokens, gold) + #nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) random.shuffle(gold_tuples) print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, @@ -156,7 +157,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False if raw_text is None: tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - nlp.entity(tokens) + #nlp.entity(tokens) nlp.parser(tokens) else: tokens = nlp(raw_text, merge_mwes=False) @@ -178,7 +179,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): if raw_text is None: tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - nlp.entity(tokens) + #nlp.entity(tokens) nlp.parser(tokens) else: tokens = nlp(raw_text, merge_mwes=False) @@ -214,9 +215,9 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos feat_set='basic' if not debug else 'debug', gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, - beam_width=beam_width) - if out_loc: - write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + beam_width=beam_width, verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) scorer = evaluate(English, list(read_json_file(dev_loc)), model_dir, gold_preproc=gold_preproc, verbose=verbose, beam_width=beam_width) From e50ac1a47f7f0dcf8a6d8181a0c04bee7526d385 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 17:45:50 +0200 Subject: [PATCH 35/75] * Add verbose printing to scorer --- spacy/scorer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index 4c210656b..8310cbd49 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -113,3 +113,12 @@ class Scorer(object): set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps), ) + if verbose: + gold_words = [item[1] for item in gold.orig_annot] + for w_id, h_id, dep in (cand_deps - gold_deps): + print 'F', gold_words[w_id], dep, gold_words[h_id] + for w_id, h_id, dep in (gold_deps - cand_deps): + print 'M', gold_words[w_id], dep, gold_words[h_id] + + + From e0984ca139e4e945e8cc465f497da014516aa8e6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 17:50:26 +0200 Subject: [PATCH 36/75] * Fix valency features in StateClass --- spacy/syntax/stateclass.pyx | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index b03f6ed16..1259b1354 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -117,10 +117,10 @@ cdef class StateClass: return self.safe_get(i).head != 0 cdef int n_L(self, int i) nogil: - return _popcount(self.safe_get(i).l_kids) + return self.safe_get(i).l_kids cdef int n_R(self, int i) nogil: - return _popcount(self.safe_get(i).r_kids) + return self.safe_get(i).r_kids cdef bint stack_is_connected(self) nogil: return False @@ -182,16 +182,16 @@ cdef class StateClass: # Keep a bit-vector tracking child dependencies. If a word has a child at # offset i from it, set that bit (tracking left and right separately) if child > head: - self._sent[head].r_kids |= 1 << (-dist) + self._sent[head].r_kids += 1 else: - self._sent[head].l_kids |= 1 << dist + self._sent[head].l_kids += 1 cdef void del_arc(self, int head, int child) nogil: cdef int dist = head - child if child > head: - self._sent[head].r_kids &= ~(1 << (-dist)) + self._sent[head].r_kids -= 1 else: - self._sent[head].l_kids &= ~(1 << dist) + self._sent[head].l_kids -= 1 cdef void open_ent(self, int label) nogil: self._ents[self._e_i].start = self.B(0) @@ -229,25 +229,3 @@ cdef class StateClass: n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((third, second, top, '|', n0, n1)) - - -# From https://en.wikipedia.org/wiki/Hamming_weight -cdef inline uint32_t _popcount(uint32_t x) nogil: - """Find number of non-zero bits.""" - cdef int count = 0 - while x != 0: - x &= x - 1 - count += 1 - return count - - -cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: - cdef int i - for i in range(32): - if bits & (1 << i): - if n < 1: - return i - n -= 1 - return 0 - - From ea8a1030078347b10d5877f49ad4358a2838e112 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 19:01:26 +0200 Subject: [PATCH 37/75] * Fix import of TransitionSystem in parser.pyx --- spacy/syntax/parser.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2a86c87f8..c5648cb06 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -35,10 +35,9 @@ from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore -from .arc_eager cimport TransitionSystem, Transition -from .arc_eager cimport push_cost, arc_cost from .transition_system import OracleError +from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParse @@ -144,8 +143,6 @@ cdef class Parser: print '\n'.join('\t'.join(s) for s in history) print words[gold.c.heads[stcls.S(0)]] print words[gold.c.heads[stcls.B(0)]] - print push_cost(stcls, &gold.c, stcls.B(0)) - print arc_cost(stcls, &gold.c, stcls.S(0), stcls.B(0)) self.moves.set_valid(self.moves._is_valid, stcls) raise cost = guess.get_cost(stcls, &gold.c, guess.label) From bcfdf126a4b7eea9ceb960efa4653b21b582871f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 20:28:14 +0200 Subject: [PATCH 38/75] * Add toggle for OrigArcEager system --- bin/parser/train.py | 15 +++++++++++---- setup.py | 4 +++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index be9e997d4..841ba2e6e 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -17,6 +17,7 @@ import spacy.util from spacy.en import English from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir +from spacy.syntax.orig_arc_eager import OrigArcEager from spacy.syntax.util import Config from spacy.gold import read_json_file from spacy.gold import GoldParse @@ -78,7 +79,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, - beam_width=1, verbose=False): + beam_width=1, verbose=False, + use_orig_arc_eager=False): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -92,6 +94,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) + if use_orig_arc_eager: + Language.ParserTransitionSystem = OrigArcEager + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, @@ -204,18 +209,20 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_iter=("Number of training iterations", "option", "i", int), beam_width=("Number of candidates to maintain in the beam", "option", "k", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool) + debug=("Debug mode", "flag", "d", bool), + use_orig_arc_eager=("Use the original, monotonic arc-eager system", "flag", "m", bool) ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, - eval_only=False): + eval_only=False, use_orig_arc_eager=False): if not eval_only: gold_train = list(read_json_file(train_loc)) train(English, gold_train, model_dir, feat_set='basic' if not debug else 'debug', gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, - beam_width=beam_width, verbose=verbose) + beam_width=beam_width, verbose=verbose, + use_orig_arc_eager=use_orig_arc_eager) #if out_loc: # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) scorer = evaluate(English, list(read_json_file(dev_loc)), diff --git a/setup.py b/setup.py index 194648f95..1baef0e85 100644 --- a/setup.py +++ b/setup.py @@ -154,7 +154,9 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', - 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', + 'spacy.syntax.arc_eager', + 'spacy.syntax.orig_arc_eager', + 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.orth', 'spacy.syntax.ner'] From 3da8e0f317ce14201c40d364caf79766110a9c79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 20:31:44 +0200 Subject: [PATCH 39/75] * Add orig_arc_eager --- spacy/syntax/orig_arc_eager.pxd | 17 ++ spacy/syntax/orig_arc_eager.pyx | 357 ++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 spacy/syntax/orig_arc_eager.pxd create mode 100644 spacy/syntax/orig_arc_eager.pyx diff --git a/spacy/syntax/orig_arc_eager.pxd b/spacy/syntax/orig_arc_eager.pxd new file mode 100644 index 000000000..82ec85f34 --- /dev/null +++ b/spacy/syntax/orig_arc_eager.pxd @@ -0,0 +1,17 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport weight_t + +from .stateclass cimport StateClass + +from .transition_system cimport TransitionSystem, Transition +from ..gold cimport GoldParseC + + +cdef class OrigArcEager(TransitionSystem): + pass + + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil + diff --git a/spacy/syntax/orig_arc_eager.pyx b/spacy/syntax/orig_arc_eager.pyx new file mode 100644 index 000000000..304c72c83 --- /dev/null +++ b/spacy/syntax/orig_arc_eager.pyx @@ -0,0 +1,357 @@ +# cython: profile=True +from __future__ import unicode_literals + +import ctypes +import os + +from ..structs cimport TokenC + +from .transition_system cimport do_func_t, get_cost_func_t +from .transition_system cimport move_cost_func_t, label_cost_func_t +from ..gold cimport GoldParse +from ..gold cimport GoldParseC + +from libc.stdint cimport uint32_t +from libc.string cimport memcpy + +from cymem.cymem cimport Pool +from .stateclass cimport StateClass + + +cdef weight_t MIN_SCORE = -90000 + +cdef enum: + SHIFT + REDUCE + LEFT + RIGHT + + N_MOVES + + +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[SHIFT] = 'S' +MOVE_NAMES[REDUCE] = 'D' +MOVE_NAMES[LEFT] = 'L' +MOVE_NAMES[RIGHT] = 'R' + + +# Helper functions for the arc-eager oracle + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, S_i + for i in range(stcls.stack_depth()): + S_i = stcls.S(i) + if gold.heads[target] == S_i: + cost += 1 + if gold.heads[S_i] == target and not stcls.has_head(S_i): + cost += 1 + return cost + + +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + if stcls.buffer_length() == 0: + return 0 + cdef int cost = 0 + cdef int i, B_i + for i in range(stcls.buffer_length()): + B_i = stcls.B(i) + cost += gold.heads[B_i] == target + if not stcls.has_head(target): + cost += gold.heads[target] == B_i + if gold.heads[B_i] == B_i or gold.heads[B_i] < target: + break + return cost + + +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: + if arc_is_gold(gold, head, child): + return 0 + elif stcls.H(child) == gold.heads[child]: + return 1 + # Head in buffer + elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: + return 1 + else: + return 0 + + +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: + if gold.labels[child] == -1: + return True + elif gold.heads[child] == head: + return True + else: + return False + + +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: + if gold.labels[child] == -1: + return True + elif label == -1: + return True + elif gold.labels[child] == label: + return True + else: + return False + + +cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: + return gold.labels[word] == -1 or gold.heads[word] == word + + +cdef class Shift: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.buffer_length() >= 1 + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.push() + + @staticmethod + cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: + return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + return push_cost(s, gold, s.B(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class Reduce: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 1 and st.has_head(st.S(0)) + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.pop() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: + return pop_cost(st, gold, st.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class LeftArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 1 and not st.has_head(st.S(0)) + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + if not st.buffer_length(): + st.add_arc(st.S(0), st.S(0), label) + else: + st.add_arc(st.B(0), st.S(0), label) + st.pop() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + if not s.buffer_length(): + return 0 + elif arc_is_gold(gold, s.B(0), s.S(0)): + return 0 + else: + return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + if not s.buffer_length(): + return 0 + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) + + +cdef class RightArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 1 and st.buffer_length() >= 1 + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.S(0), st.B(0), label) + st.push() + + @staticmethod + cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + if arc_is_gold(gold, s.S(0), s.B(0)): + return 0 + elif s.shifted[s.B(0)]: + return push_cost(s, gold, s.B(0)) + else: + return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + + @staticmethod + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + + +cdef class OrigArcEager(TransitionSystem): + @classmethod + def get_labels(cls, gold_parses): + move_labels = {SHIFT: {'': True}, RIGHT: {'': True}, + REDUCE: {'': True}, LEFT: {'root': True}} + for raw_text, sents in gold_parses: + for (ids, words, tags, heads, labels, iob), ctnts in sents: + for child, head, label in zip(ids, heads, labels): + if label != 'root': + if head < child: + move_labels[RIGHT][label] = True + elif head > child: + move_labels[LEFT][label] = True + return move_labels + + cdef int preprocess_gold(self, GoldParse gold) except -1: + for i in range(gold.length): + if gold.heads[i] is None: # Missing values + gold.c.heads[i] = i + gold.c.labels[i] = -1 + else: + gold.c.heads[i] = gold.heads[i] + gold.c.labels[i] = self.strings[gold.labels[i]] + for end, brackets in gold.brackets.items(): + for start, label_strs in brackets.items(): + gold.c.brackets[start][end] = 1 + for label_str in label_strs: + # Add the encoded label to the set + gold.brackets[end][start].add(self.strings[label_str]) + + cdef Transition lookup_transition(self, object name) except *: + if '-' in name: + move_str, label_str = name.split('-', 1) + label = self.label_ids[label_str] + else: + label = 0 + move = MOVE_NAMES.index(move_str) + for i in range(self.n_moves): + if self.c[i].move == move and self.c[i].label == label: + return self.c[i] + + def move_name(self, int move, int label): + label_str = self.strings[label] + if label_str: + return MOVE_NAMES[move] + '-' + label_str + else: + return MOVE_NAMES[move] + + cdef Transition init_transition(self, int clas, int move, int label) except *: + # TODO: Apparent Cython bug here when we try to use the Transition() + # constructor with the function pointers + cdef Transition t + t.score = 0 + t.clas = clas + t.move = move + t.label = label + if move == SHIFT: + t.is_valid = Shift.is_valid + t.do = Shift.transition + t.get_cost = Shift.cost + elif move == REDUCE: + t.is_valid = Reduce.is_valid + t.do = Reduce.transition + t.get_cost = Reduce.cost + elif move == LEFT: + t.is_valid = LeftArc.is_valid + t.do = LeftArc.transition + t.get_cost = LeftArc.cost + elif move == RIGHT: + t.is_valid = RightArc.is_valid + t.do = RightArc.transition + t.get_cost = RightArc.cost + else: + raise Exception(move) + return t + + cdef int initialize_state(self, StateClass st) except -1: + # Ensure sent_end is set to 0 throughout + for i in range(st.length): + st._sent[i].sent_end = False + st.push() + + cdef int finalize_state(self, StateClass st) except -1: + cdef int root_label = self.strings['root'] + for i in range(st.length): + if st._sent[i].head == 0 and st._sent[i].dep == 0: + st._sent[i].dep = root_label + + cdef int set_valid(self, bint* output, StateClass stcls) except -1: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + cdef int i + n_valid = 0 + for i in range(self.n_moves): + output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 + + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int i, move, label + cdef label_cost_func_t[N_MOVES] label_cost_funcs + cdef move_cost_func_t[N_MOVES] move_cost_funcs + cdef int[N_MOVES] move_costs + for i in range(N_MOVES): + move_costs[i] = -1 + move_cost_funcs[SHIFT] = Shift.move_cost + move_cost_funcs[REDUCE] = Reduce.move_cost + move_cost_funcs[LEFT] = LeftArc.move_cost + move_cost_funcs[RIGHT] = RightArc.move_cost + + label_cost_funcs[SHIFT] = Shift.label_cost + label_cost_funcs[REDUCE] = Reduce.label_cost + label_cost_funcs[LEFT] = LeftArc.label_cost + label_cost_funcs[RIGHT] = RightArc.label_cost + + cdef int* labels = gold.c.labels + cdef int* heads = gold.c.heads + + n_gold = 0 + for i in range(self.n_moves): + if self.c[i].is_valid(stcls, self.c[i].label): + move = self.c[i].move + label = self.c[i].label + if move_costs[move] == -1: + move_costs[move] = move_cost_funcs[move](stcls, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + else: + output[i] = 9000 + assert n_gold >= 1 + + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + cdef Transition best + cdef weight_t score = MIN_SCORE + cdef int i + for i in range(self.n_moves): + if scores[i] > score and is_valid[self.c[i].move]: + best = self.c[i] + score = scores[i] + assert score > MIN_SCORE, (self.n_moves, stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length, stcls.has_head(stcls.S(0)), LeftArc.is_valid(stcls, -1)) + return best From f66228f2533a7fb9813c9c223d21d68e13aae812 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 14 Jun 2015 21:17:39 +0200 Subject: [PATCH 40/75] * Add some more features, esp for labels --- spacy/syntax/_parse_features.pyx | 27 +++++++++++++++++++++++++++ spacy/syntax/parser.pyx | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 81c1b8dfc..9c7703074 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -264,6 +264,29 @@ s0_n0 = ( (S0p, S0rp, N0p), (S0p, N0lp, N0W), (S0p, N0lp, N0p), + (S0L, N0p), + (S0p, S0rL, N0p), + (S0p, N0lL, N0p), + (S0p, S0rv, N0p), + (S0p, N0lv, N0p), + (S0c6, S0rL, S0r2L, N0p), + (S0p, N0lL, N0l2L, N0p), +) + + +s1_s0 = ( + (S1p, S0p), + (S1p, S0p, S0_has_head), + (S1W, S0p), + (S1W, S0p, S0_has_head), + (S1c, S0p), + (S1c, S0p, S0_has_head), + (S1p, S1rL, S0p), + (S1p, S1rL, S0p, S0_has_head), + (S1p, S0lL, S0p), + (S1p, S0lL, S0p, S0_has_head), + (S1p, S0lL, S0l2L, S0p), + (S1p, S0lL, S0l2L, S0p, S0_has_head), ) @@ -275,6 +298,8 @@ s1_n0 = ( (S1W, S1p, N0p), (S1p, N0W, N0p), (S1c6, S1p, N0c6, N0p), + (S1L, N0p), + (S1p, S1rL, N0p), ) @@ -286,6 +311,8 @@ s0_n1 = ( (S0W, S0p, N1p), (S0p, N1W, N1p), (S0c6, S0p, N1c6, N1p), + (S0L, N1p), + (S0p, S0rL, N1p), ) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c5648cb06..061738fe1 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -60,7 +60,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ + return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From 38a6afa484fbce4354c2d4bc51d1c3c06e1585a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 02:50:00 +0200 Subject: [PATCH 41/75] * Make possibly dubious correction to the unshift oracle --- spacy/syntax/arc_eager.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 7ac11fd46..ba20f813c 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -177,9 +177,15 @@ cdef class LeftArc: @staticmethod cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 if arc_is_gold(gold, s.B(0), s.S(0)): return 0 else: + # Account for deps we might lose between S0 and stack + if not s.has_head(s.S(0)): + for i in range(1, s.stack_depth()): + cost += gold.heads[s.S(i)] == s.S(0) + cost += gold.heads[s.S(0)] == s.S(i) return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod From 4841f8ad5ecd469d2b33c8efaa7326ba8e0ebe3e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 02:54:12 +0200 Subject: [PATCH 42/75] * Set transition system early --- bin/parser/train.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 841ba2e6e..16fbb7fc4 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -94,9 +94,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) - if use_orig_arc_eager: - Language.ParserTransitionSystem = OrigArcEager - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, @@ -108,6 +105,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', if n_sents > 0: gold_tuples = gold_tuples[:n_sents] + if use_orig_arc_eager: + Language.ParserTransitionSystem = OrigArcEager nlp = Language(data_dir=model_dir) From 21930ede15c6b2d51b4c214fd2372d4e28665a0a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 02:54:32 +0200 Subject: [PATCH 43/75] * Switch toggle on USE_ROOT_ARC_SEGMENT --- spacy/syntax/arc_eager.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index ba20f813c..f7dc77724 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -20,7 +20,7 @@ from .stateclass cimport StateClass DEF NON_MONOTONIC = True DEF USE_BREAK = False -DEF USE_ROOT_ARC_SEGMENT = False +DEF USE_ROOT_ARC_SEGMENT = True cdef weight_t MIN_SCORE = -90000 From 8156a01bca0c106c6a136f27383f012c48f797da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 02:54:55 +0200 Subject: [PATCH 44/75] * Fix root label for orig_arc_eager --- spacy/syntax/orig_arc_eager.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/orig_arc_eager.pyx b/spacy/syntax/orig_arc_eager.pyx index 304c72c83..e0d73eeab 100644 --- a/spacy/syntax/orig_arc_eager.pyx +++ b/spacy/syntax/orig_arc_eager.pyx @@ -291,7 +291,7 @@ cdef class OrigArcEager(TransitionSystem): cdef int finalize_state(self, StateClass st) except -1: cdef int root_label = self.strings['root'] for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + if st._sent[i].head == 0: st._sent[i].dep = root_label cdef int set_valid(self, bint* output, StateClass stcls) except -1: From 5da5cf7084884054e41a7f879ecf7677e56f1c19 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 04:07:13 +0200 Subject: [PATCH 45/75] * Add some more features for S1/S0 --- spacy/syntax/_parse_features.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 9c7703074..efefc7273 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -287,6 +287,9 @@ s1_s0 = ( (S1p, S0lL, S0p, S0_has_head), (S1p, S0lL, S0l2L, S0p), (S1p, S0lL, S0l2L, S0p, S0_has_head), + (S1L, S0L, S0W), + (S1L, S0L, S0p), + (S1p, S1L, S0L, S0p), ) From a5ae98a5435cb3acf94b2a573178bb7f81ca8b0b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 08:22:59 +0200 Subject: [PATCH 46/75] * Add tree_arc_eager to setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1baef0e85..4ee33e255 100644 --- a/setup.py +++ b/setup.py @@ -156,6 +156,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax.orig_arc_eager', + 'spacy.syntax.tree_arc_eager', 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.orth', 'spacy.syntax.ner'] From c40a2c661cc0161ea25d046c0df4b416ea4d1d00 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 08:23:24 +0200 Subject: [PATCH 47/75] * Add tree_arc_eager --- spacy/syntax/tree_arc_eager.pxd | 17 ++ spacy/syntax/tree_arc_eager.pyx | 438 ++++++++++++++++++++++++++++++++ 2 files changed, 455 insertions(+) create mode 100644 spacy/syntax/tree_arc_eager.pxd create mode 100644 spacy/syntax/tree_arc_eager.pyx diff --git a/spacy/syntax/tree_arc_eager.pxd b/spacy/syntax/tree_arc_eager.pxd new file mode 100644 index 000000000..fab2c15fc --- /dev/null +++ b/spacy/syntax/tree_arc_eager.pxd @@ -0,0 +1,17 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport weight_t + +from .stateclass cimport StateClass + +from .transition_system cimport TransitionSystem, Transition +from ..gold cimport GoldParseC + + +cdef class TreeArcEager(TransitionSystem): + pass + + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil + diff --git a/spacy/syntax/tree_arc_eager.pyx b/spacy/syntax/tree_arc_eager.pyx new file mode 100644 index 000000000..38d437087 --- /dev/null +++ b/spacy/syntax/tree_arc_eager.pyx @@ -0,0 +1,438 @@ +# cython: profile=True +from __future__ import unicode_literals + +import ctypes +import os + +from ..structs cimport TokenC + +from .transition_system cimport do_func_t, get_cost_func_t +from .transition_system cimport move_cost_func_t, label_cost_func_t +from ..gold cimport GoldParse +from ..gold cimport GoldParseC + +from libc.stdint cimport uint32_t +from libc.string cimport memcpy + +from cymem.cymem cimport Pool +from .stateclass cimport StateClass + + +DEF NON_MONOTONIC = False +DEF USE_BREAK = False +DEF USE_ROOT_ARC_SEGMENT = False + +cdef weight_t MIN_SCORE = -90000 + +# Break transition from here +# http://www.aclweb.org/anthology/P13-1074 +cdef enum: + SHIFT + REDUCE + LEFT + RIGHT + + BREAK + + N_MOVES + + +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[SHIFT] = 'S' +MOVE_NAMES[REDUCE] = 'D' +MOVE_NAMES[LEFT] = 'L' +MOVE_NAMES[RIGHT] = 'R' +MOVE_NAMES[BREAK] = 'B' + + +# Helper functions for the arc-eager oracle + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, S_i + for i in range(stcls.stack_depth()): + S_i = stcls.S(i) + if gold.heads[target] == S_i: + cost += 1 + if gold.heads[S_i] == target and not stcls.has_head(S_i): + cost += 1 + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, B_i + for i in range(stcls.buffer_length()): + B_i = stcls.B(i) + cost += gold.heads[B_i] == target + if not stcls.has_head(target): + cost += gold.heads[target] == B_i + if gold.heads[B_i] == B_i or gold.heads[B_i] < target: + break + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: + if arc_is_gold(gold, head, child): + return 0 + elif stcls.H(child) == gold.heads[child]: + return 1 + # Head in buffer + elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: + return 1 + else: + return 0 + + +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: + if gold.labels[child] == -1: + return True + elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child): + return True + elif gold.heads[child] == head: + return True + else: + return False + + +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: + if gold.labels[child] == -1: + return True + elif label == -1: + return True + elif gold.labels[child] == label: + return True + else: + return False + + +cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: + return gold.labels[word] == -1 or gold.heads[word] == word + + +cdef class Shift: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_end + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.push() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: + return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + return push_cost(s, gold, s.B(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class Reduce: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 2 and st.has_head(st.S(0)) + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.pop() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: + return pop_cost(st, gold, st.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class LeftArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_end and not st.has_head(st.S(0)) + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.B(0), st.S(0), label) + st.pop() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + if arc_is_gold(gold, s.B(0), s.S(0)): + return 0 + else: + return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) + + +cdef class RightArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_end + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.S(0), st.B(0), label) + st.push() + st.fast_forward() + + @staticmethod + cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + if arc_is_gold(gold, s.S(0), s.B(0)): + return 0 + elif s.shifted[s.B(0)]: + return push_cost(s, gold, s.B(0)) + else: + return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + + @staticmethod + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + + +cdef class Break: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + cdef int i + if not USE_BREAK: + return False + elif st.at_break(): + return False + elif st.B(0) == 0: + return False + elif st.stack_depth() < 1: + return False + elif (st.S(0) + 1) != st.B(0): + # Must break at the token boundary + return False + else: + return True + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.set_break(st.B(0)) + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + cdef int S_i, B_i + for i in range(s.stack_depth()): + S_i = s.S(i) + for j in range(s.buffer_length()): + B_i = s.B(j) + cost += gold.heads[S_i] == B_i + cost += gold.heads[B_i] == S_i + # Check for sentence boundary --- if it's here, we can't have any deps + # between stack and buffer, so rest of action is irrelevant. + s0_root = _get_root(s.S(0), gold) + b0_root = _get_root(s.B(0), gold) + if s0_root != b0_root or s0_root == -1 or b0_root == -1: + return cost + else: + return cost + 1 + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + +cdef int _get_root(int word, const GoldParseC* gold) nogil: + while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0: + word = gold.heads[word] + if gold.labels[word] == -1: + return -1 + else: + return word + + +cdef class TreeArcEager(TransitionSystem): + @classmethod + def get_labels(cls, gold_parses): + move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'root': True}, + LEFT: {'root': True}, BREAK: {'root': True}} + for raw_text, sents in gold_parses: + for (ids, words, tags, heads, labels, iob), ctnts in sents: + for child, head, label in zip(ids, heads, labels): + if label != 'root': + if head < child: + move_labels[RIGHT][label] = True + elif head > child: + move_labels[LEFT][label] = True + return move_labels + + cdef int preprocess_gold(self, GoldParse gold) except -1: + for i in range(gold.length): + if gold.heads[i] is None: # Missing values + gold.c.heads[i] = i + gold.c.labels[i] = -1 + else: + gold.c.heads[i] = gold.heads[i] + gold.c.labels[i] = self.strings[gold.labels[i]] + for end, brackets in gold.brackets.items(): + for start, label_strs in brackets.items(): + gold.c.brackets[start][end] = 1 + for label_str in label_strs: + # Add the encoded label to the set + gold.brackets[end][start].add(self.strings[label_str]) + + cdef Transition lookup_transition(self, object name) except *: + if '-' in name: + move_str, label_str = name.split('-', 1) + label = self.label_ids[label_str] + else: + label = 0 + move = MOVE_NAMES.index(move_str) + for i in range(self.n_moves): + if self.c[i].move == move and self.c[i].label == label: + return self.c[i] + + def move_name(self, int move, int label): + label_str = self.strings[label] + if label_str: + return MOVE_NAMES[move] + '-' + label_str + else: + return MOVE_NAMES[move] + + cdef Transition init_transition(self, int clas, int move, int label) except *: + # TODO: Apparent Cython bug here when we try to use the Transition() + # constructor with the function pointers + cdef Transition t + t.score = 0 + t.clas = clas + t.move = move + t.label = label + if move == SHIFT: + t.is_valid = Shift.is_valid + t.do = Shift.transition + t.get_cost = Shift.cost + elif move == REDUCE: + t.is_valid = Reduce.is_valid + t.do = Reduce.transition + t.get_cost = Reduce.cost + elif move == LEFT: + t.is_valid = LeftArc.is_valid + t.do = LeftArc.transition + t.get_cost = LeftArc.cost + elif move == RIGHT: + t.is_valid = RightArc.is_valid + t.do = RightArc.transition + t.get_cost = RightArc.cost + elif move == BREAK: + t.is_valid = Break.is_valid + t.do = Break.transition + t.get_cost = Break.cost + else: + raise Exception(move) + return t + + cdef int initialize_state(self, StateClass st) except -1: + # Ensure sent_end is set to 0 throughout + for i in range(st.length): + st._sent[i].sent_end = False + st.fast_forward() + + cdef int finalize_state(self, StateClass st) except -1: + cdef int root_label = self.strings['root'] + for i in range(st.length): + if st._sent[i].head == 0 and st._sent[i].dep == 0: + st._sent[i].dep = root_label + # If we're not using the Break transition, we segment via root-labelled + # arcs between the root words. + elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label: + st._sent[i].head = 0 + + cdef int set_valid(self, bint* output, StateClass stcls) except -1: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef int i + n_valid = 0 + for i in range(self.n_moves): + output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 + + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int i, move, label + cdef label_cost_func_t[N_MOVES] label_cost_funcs + cdef move_cost_func_t[N_MOVES] move_cost_funcs + cdef int[N_MOVES] move_costs + for i in range(N_MOVES): + move_costs[i] = -1 + move_cost_funcs[SHIFT] = Shift.move_cost + move_cost_funcs[REDUCE] = Reduce.move_cost + move_cost_funcs[LEFT] = LeftArc.move_cost + move_cost_funcs[RIGHT] = RightArc.move_cost + move_cost_funcs[BREAK] = Break.move_cost + + label_cost_funcs[SHIFT] = Shift.label_cost + label_cost_funcs[REDUCE] = Reduce.label_cost + label_cost_funcs[LEFT] = LeftArc.label_cost + label_cost_funcs[RIGHT] = RightArc.label_cost + label_cost_funcs[BREAK] = Break.label_cost + + cdef int* labels = gold.c.labels + cdef int* heads = gold.c.heads + + n_gold = 0 + for i in range(self.n_moves): + if self.c[i].is_valid(stcls, self.c[i].label): + move = self.c[i].move + label = self.c[i].label + if move_costs[move] == -1: + move_costs[move] = move_cost_funcs[move](stcls, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + else: + output[i] = 9000 + assert n_gold >= 1 + + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef Transition best + cdef weight_t score = MIN_SCORE + cdef int i + for i in range(self.n_moves): + if scores[i] > score and is_valid[self.c[i].move]: + best = self.c[i] + score = scores[i] + assert best.clas < self.n_moves + assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) + return best From 569958527816fc240769c5a85f4561de98279c6b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2015 08:23:43 +0200 Subject: [PATCH 48/75] * Use tree_arc_eager system as baseline in experiments --- bin/parser/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 16fbb7fc4..39e9d2cc9 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -18,6 +18,7 @@ from spacy.en import English from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.orig_arc_eager import OrigArcEager +from spacy.syntax.tree_arc_eager import TreeArcEager from spacy.syntax.util import Config from spacy.gold import read_json_file from spacy.gold import GoldParse @@ -105,8 +106,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - if use_orig_arc_eager: - Language.ParserTransitionSystem = OrigArcEager nlp = Language(data_dir=model_dir) @@ -214,6 +213,8 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, eval_only=False, use_orig_arc_eager=False): + if use_orig_arc_eager: + English.ParserTransitionSystem = TreeArcEager if not eval_only: gold_train = list(read_json_file(train_loc)) train(English, gold_train, model_dir, From 9b13d11ab3e549574908007b05c33280ddc1b2d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 16 Jun 2015 23:35:21 +0200 Subject: [PATCH 49/75] * Fix handling of entities in StateClass --- spacy/syntax/stateclass.pyx | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 1259b1354..68e8a2198 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -12,13 +12,15 @@ cdef class StateClass: self.shifted = mem.alloc(length, sizeof(bint)) self._sent = mem.alloc(length, sizeof(TokenC)) self._ents = mem.alloc(length, sizeof(Entity)) + cdef int i + for i in range(length): + self._ents[i].end = -1 self.mem = mem self.length = length self._break = -1 self._s_i = 0 self._b_i = 0 self._e_i = 0 - cdef int i for i in range(length): self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME @@ -29,7 +31,7 @@ cdef class StateClass: return self._sent[i].head + i cdef int E(self, int i) nogil: - return -1 + return self._ents[self._e_i-1].start cdef int L(self, int i, int idx) nogil: if idx < 1: @@ -128,7 +130,7 @@ cdef class StateClass: cdef bint entity_is_open(self) nogil: if self._e_i < 1: return False - return self._ents[self._e_i-1].end != 0 + return self._ents[self._e_i-1].end == -1 cdef int stack_depth(self) nogil: return self._s_i @@ -196,11 +198,11 @@ cdef class StateClass: cdef void open_ent(self, int label) nogil: self._ents[self._e_i].start = self.B(0) self._ents[self._e_i].label = label - self._ents[self._e_i].end = 0 + self._ents[self._e_i].end = -1 self._e_i += 1 cdef void close_ent(self) nogil: - self._ents[self._e_i].end = self.B(0)+1 + self._ents[self._e_i-1].end = self.B(0)+1 self._sent[self.B(0)].ent_iob = 1 cdef void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: From 4dad4058c35993cd2322d84d22f245aa552a5f2c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 16 Jun 2015 23:36:54 +0200 Subject: [PATCH 50/75] * Uncomment NER training --- bin/parser/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 39e9d2cc9..c0834c45c 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -133,7 +133,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold = GoldParse(tokens, annot_tuples, make_projective=True) loss += nlp.parser.train(tokens, gold) - #nlp.entity.train(tokens, gold) + nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) random.shuffle(gold_tuples) print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, @@ -160,7 +160,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False if raw_text is None: tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - #nlp.entity(tokens) + nlp.entity(tokens) nlp.parser(tokens) else: tokens = nlp(raw_text, merge_mwes=False) @@ -182,7 +182,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): if raw_text is None: tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - #nlp.entity(tokens) + nlp.entity(tokens) nlp.parser(tokens) else: tokens = nlp(raw_text, merge_mwes=False) From ab110be125873339a55e568f2668f6f199d2fd7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 16 Jun 2015 23:37:25 +0200 Subject: [PATCH 51/75] * Remove debugging in parser.pyx --- spacy/syntax/parser.pyx | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 061738fe1..740e86025 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -130,23 +130,11 @@ cdef class Parser: words = [w.orth_ for w in tokens] history = [] while not stcls.is_final(): - assert stcls._s_i >= 0 fill_context(context, stcls) scores = self.model.score(context) guess = self.moves.best_valid(scores, stcls) - try: - best = self.moves.best_gold(scores, stcls, gold) - except: - history.append((self.moves.move_name(guess.move, guess.label), '!', stcls.print_state(words))) - for i, word in enumerate(words): - print gold.orig_annot[i] - print '\n'.join('\t'.join(s) for s in history) - print words[gold.c.heads[stcls.S(0)]] - print words[gold.c.heads[stcls.B(0)]] - self.moves.set_valid(self.moves._is_valid, stcls) - raise + best = self.moves.best_gold(scores, stcls, gold) cost = guess.get_cost(stcls, &gold.c, guess.label) - history.append((self.moves.move_name(guess.move, guess.label), str(cost), stcls.print_state(words))) self.model.update(context, guess.clas, best.clas, cost) guess.do(stcls, guess.label) loss += cost From f868175e43b5f0d72c4d8ac085de6bf3711747e1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 16 Jun 2015 23:37:46 +0200 Subject: [PATCH 52/75] * Whitespace --- spacy/scorer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 8310cbd49..509966308 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -119,6 +119,3 @@ class Scorer(object): print 'F', gold_words[w_id], dep, gold_words[h_id] for w_id, h_id, dep in (gold_deps - cand_deps): print 'M', gold_words[w_id], dep, gold_words[h_id] - - - From 60d26243e3a18982608c99d3b0d1dfa107e25ab1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 18 Jun 2015 16:35:27 +0200 Subject: [PATCH 53/75] * Fix head alignment in read_conll.parse, which was causing corrupt parses when strip_bad_periods=True. A similar problem may apply to other data readers. --- spacy/munge/read_conll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/munge/read_conll.py b/spacy/munge/read_conll.py index ed6037a4d..a120ea497 100644 --- a/spacy/munge/read_conll.py +++ b/spacy/munge/read_conll.py @@ -10,11 +10,12 @@ def parse(sent_text, strip_bad_periods=False): assert sent_text annot = [] words = [] - id_map = {} + id_map = {-1: -1} for i, line in enumerate(sent_text.split('\n')): word, tag, head, dep = _parse_line(line) if strip_bad_periods and words and _is_bad_period(words[-1], word): continue + id_map[i] = len(words) annot.append({ 'id': len(words), @@ -23,6 +24,8 @@ def parse(sent_text, strip_bad_periods=False): 'head': int(head) - 1, 'dep': dep}) words.append(word) + for entry in annot: + entry['head'] = id_map[entry['head']] return words, annot From fe9118a528ec38daf2b98d1be441747f5db081a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 18 Jun 2015 16:36:04 +0200 Subject: [PATCH 54/75] * Add test for strip_bad_periods reading in read_conll.parse --- tests/munge/test_bad_periods.py | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/munge/test_bad_periods.py diff --git a/tests/munge/test_bad_periods.py b/tests/munge/test_bad_periods.py new file mode 100644 index 000000000..fc448476a --- /dev/null +++ b/tests/munge/test_bad_periods.py @@ -0,0 +1,59 @@ +import spacy.munge.read_conll + +hongbin_example = """ +1 2. 0. LS _ 24 meta _ _ _ +2 . . . _ 1 punct _ _ _ +3 Wang wang NNP _ 4 compound _ _ _ +4 Hongbin hongbin NNP _ 16 nsubj _ _ _ +5 , , , _ 4 punct _ _ _ +6 the the DT _ 11 det _ _ _ +7 " " `` _ 11 punct _ _ _ +8 communist communist JJ _ 11 amod _ _ _ +9 trail trail NN _ 11 compound _ _ _ +10 - - HYPH _ 11 punct _ _ _ +11 blazer blazer NN _ 4 appos _ _ _ +12 , , , _ 16 punct _ _ _ +13 " " '' _ 16 punct _ _ _ +14 has have VBZ _ 16 aux _ _ _ +15 not not RB _ 16 neg _ _ _ +16 turned turn VBN _ 24 ccomp _ _ _ +17 into into IN syn=CLR 16 prep _ _ _ +18 a a DT _ 19 det _ _ _ +19 capitalist capitalist NN _ 17 pobj _ _ _ +20 ( ( -LRB- _ 24 punct _ _ _ +21 he he PRP _ 24 nsubj _ _ _ +22 does do VBZ _ 24 aux _ _ _ +23 n't not RB _ 24 neg _ _ _ +24 have have VB _ 0 root _ _ _ +25 any any DT _ 26 det _ _ _ +26 shares share NNS _ 24 dobj _ _ _ +27 , , , _ 24 punct _ _ _ +28 does do VBZ _ 30 aux _ _ _ +29 n't not RB _ 30 neg _ _ _ +30 have have VB _ 24 conj _ _ _ +31 any any DT _ 32 det _ _ _ +32 savings saving NNS _ 30 dobj _ _ _ +33 , , , _ 30 punct _ _ _ +34 does do VBZ _ 36 aux _ _ _ +35 n't not RB _ 36 neg _ _ _ +36 have have VB _ 30 conj _ _ _ +37 his his PRP$ _ 39 poss _ _ _ +38 own own JJ _ 39 amod _ _ _ +39 car car NN _ 36 dobj _ _ _ +40 , , , _ 36 punct _ _ _ +41 and and CC _ 36 cc _ _ _ +42 does do VBZ _ 44 aux _ _ _ +43 n't not RB _ 44 neg _ _ _ +44 have have VB _ 36 conj _ _ _ +45 a a DT _ 46 det _ _ _ +46 mansion mansion NN _ 44 dobj _ _ _ +47 ; ; . _ 24 punct _ _ _ +""".strip() + + +def test_hongbin(): + words, annot = spacy.munge.read_conll.parse(hongbin_example, strip_bad_periods=True) + assert words[annot[0]['head']] == 'have' + assert words[annot[1]['head']] == 'Hongbin' + + From 839e5038b70ea16e1d55bf3a02a58a832ab36ae9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 00:01:55 +0200 Subject: [PATCH 55/75] * Raise exception on non-projective input --- bin/parser/train.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index c0834c45c..e3de03dcf 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -130,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', raw_text = add_noise(raw_text, corruption_level) tokens = nlp.tokenizer(raw_text) nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) - loss += nlp.parser.train(tokens, gold) + gold = GoldParse(tokens, annot_tuples, make_projective=False) + if not gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) From 46fb24e9fd20baa6c21623a895f01c53cf6ac107 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 00:02:22 +0200 Subject: [PATCH 56/75] * Add cycle-checking code in gold.pyx --- spacy/gold.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f3ed33d10..489b8b124 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -242,6 +242,16 @@ cdef class GoldParse: self.heads[w2] = None self.labels[w2] = '' + # Check there are no cycles in the dependencies, i.e. we are a tree + for w in range(self.length): + seen = set([w]) + head = w + while self.heads[head] != head and self.heads[head] != None: + head = self.heads[head] + if head in seen: + raise Exception("Cycle found: %s" % seen) + seen.add(head) + self.brackets = {} for (gold_start, gold_end, label_str) in brackets: start = self.gold_to_cand[gold_start] From cc579ed429cd083106761b6695b5707d2bab6c69 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 00:02:50 +0200 Subject: [PATCH 57/75] * Add __len__ function to StringStore --- spacy/strings.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index e15f88837..56df4d2f1 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -61,6 +61,9 @@ cdef class StringStore: def __get__(self): return self.size-1 + def __len__(self): + return self.size + def __getitem__(self, object string_or_id): cdef bytes byte_string cdef const Utf8Str* utf8str From 69507bc7295711a7bc9544541070964a025fe1e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 00:03:30 +0200 Subject: [PATCH 58/75] * Re-enable Break transition in arc_eager.pyx --- spacy/syntax/arc_eager.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f7dc77724..4d89ad386 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -19,7 +19,7 @@ from .stateclass cimport StateClass DEF NON_MONOTONIC = True -DEF USE_BREAK = False +DEF USE_BREAK = True DEF USE_ROOT_ARC_SEGMENT = True cdef weight_t MIN_SCORE = -90000 @@ -252,7 +252,7 @@ cdef class Break: @staticmethod cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: cdef int cost = 0 - cdef int S_i, B_i + cdef int i, j, S_i, B_i for i in range(s.stack_depth()): S_i = s.S(i) for j in range(s.buffer_length()): From 5e94b5d58187d45818d5b52dda243aa6b69eadf4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 00:07:06 +0200 Subject: [PATCH 59/75] * Have Tokens return proper numpy arrays, not Cython views. --- spacy/tokens.pxd | 4 ++-- spacy/tokens.pyx | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 9ddd126a1..8b3ff9fe9 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,7 +1,7 @@ from libc.stdint cimport uint32_t from numpy cimport ndarray -cimport numpy +cimport numpy as np from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t @@ -47,7 +47,7 @@ cdef class Tokens: cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 - cpdef long[:,:] to_array(self, object features) + cpdef np.ndarray to_array(self, object features) cdef int set_parse(self, const TokenC* parsed) except -1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 3ee559dcf..3b132c4c9 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -18,7 +18,9 @@ from .structs cimport UniStr from unidecode import unidecode -cimport numpy +cimport numpy as np +np.import_array() + import numpy cimport cython @@ -207,7 +209,7 @@ cdef class Tokens: return idx + t.lex.length @cython.boundscheck(False) - cpdef long[:,:] to_array(self, object py_attr_ids): + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the length of the sentence. @@ -221,10 +223,10 @@ cdef class Tokens: """ cdef int i, j cdef attr_id_t feature - cdef numpy.ndarray[long, ndim=2] output + cdef np.ndarray[long, ndim=2] output # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. - cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids) + cdef np.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int) for i in range(self.length): for j, feature in enumerate(attr_ids): @@ -464,7 +466,9 @@ cdef class Token: property repvec: def __get__(self): - return numpy.asarray( self.c.lex.repvec) + cdef int length = self.vocab.repvec_length + repvec_view = self.c.lex.repvec + return numpy.asarray(repvec_view) property n_lefts: def __get__(self): From f01b3d043ea2b8a37f79afcf513936722c445a24 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 03:03:22 +0200 Subject: [PATCH 60/75] * Add padding to arrays in stateclass. May be papering over a deeper bug. --- spacy/syntax/stateclass.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 68e8a2198..725850850 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -7,14 +7,17 @@ from ..structs cimport Entity cdef class StateClass: def __init__(self, int length): cdef Pool mem = Pool() - self._buffer = mem.alloc(length, sizeof(int)) - self._stack = mem.alloc(length, sizeof(int)) - self.shifted = mem.alloc(length, sizeof(bint)) - self._sent = mem.alloc(length, sizeof(TokenC)) - self._ents = mem.alloc(length, sizeof(Entity)) + PADDING = 5 + self._buffer = mem.alloc(length + PADDING, sizeof(int)) + self._stack = mem.alloc(length + PADDING, sizeof(int)) + self.shifted = mem.alloc(length + PADDING, sizeof(bint)) + self._sent = mem.alloc(length + PADDING, sizeof(TokenC)) + self._ents = mem.alloc(length + PADDING, sizeof(Entity)) cdef int i for i in range(length): self._ents[i].end = -1 + for i in range(length, length + PADDING): + self._sent[i].lex = &EMPTY_LEXEME self.mem = mem self.length = length self._break = -1 @@ -181,8 +184,6 @@ cdef class StateClass: cdef int dist = head - child self._sent[child].head = dist self._sent[child].dep = label - # Keep a bit-vector tracking child dependencies. If a word has a child at - # offset i from it, set that bit (tracking left and right separately) if child > head: self._sent[head].r_kids += 1 else: From 89ae218b7586437714a4e367b6d85bef310e7f18 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 03:04:34 +0200 Subject: [PATCH 61/75] * Add import to tokens.pyx from weird Cython compiler issue with casting from memory views --- spacy/tokens.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 3b132c4c9..afda93b79 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -17,7 +17,8 @@ from .spans import Span from .structs cimport UniStr from unidecode import unidecode - +# Compiler crashes on memory view coercion without this. Should report bug. +from cython.view cimport array as cvarray cimport numpy as np np.import_array() From 065c2e1d2de8bc0d1906838df3e23f0d62a0cbab Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 04:13:09 +0200 Subject: [PATCH 62/75] * Add some bounds checking around state arrays --- spacy/syntax/stateclass.pxd | 2 ++ spacy/syntax/stateclass.pyx | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 54b039208..e3c36751e 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -28,6 +28,8 @@ cdef class StateClass: for i in range(length): self._sent[i] = sent[i] self._buffer[i] = i + for i in range(length, length + 5): + self._sent[i].lex = &EMPTY_LEXEME return self cdef inline int S(self, int i) nogil: diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 725850850..f143ca087 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -34,6 +34,10 @@ cdef class StateClass: return self._sent[i].head + i cdef int E(self, int i) nogil: + if self._e_i <= 0 or self._e_i >= self.length: + return -1 + if i <= 0 or i >= self.length: + return -1 return self._ents[self._e_i-1].start cdef int L(self, int i, int idx) nogil: @@ -145,14 +149,16 @@ cdef class StateClass: return self.length - self._b_i cdef void push(self) nogil: - self._stack[self._s_i] = self.B(0) + if self.B(0) != -1: + self._stack[self._s_i] = self.B(0) self._s_i += 1 self._b_i += 1 if self._b_i > self._break: self._break = -1 cdef void pop(self) nogil: - self._s_i -= 1 + if self._s_i >= 1: + self._s_i -= 1 cdef void unshift(self) nogil: self._b_i -= 1 @@ -197,10 +203,11 @@ cdef class StateClass: self._sent[head].l_kids -= 1 cdef void open_ent(self, int label) nogil: - self._ents[self._e_i].start = self.B(0) - self._ents[self._e_i].label = label - self._ents[self._e_i].end = -1 - self._e_i += 1 + if 0 <= self._e_i < self.length: + self._ents[self._e_i].start = self.B(0) + self._ents[self._e_i].label = label + self._ents[self._e_i].end = -1 + self._e_i += 1 cdef void close_ent(self) nogil: self._ents[self._e_i-1].end = self.B(0)+1 @@ -212,8 +219,9 @@ cdef class StateClass: self._sent[i].ent_type = ent_type cdef void set_break(self, int _) nogil: - self._sent[self.B(0)].sent_end = True - self._break = self._b_i + if 0 <= self.B(0) < self.length: + self._sent[self.B(0)].sent_end = True + self._break = self._b_i cdef void clone(self, StateClass src) nogil: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) @@ -223,6 +231,7 @@ cdef class StateClass: self._b_i = src._b_i self._s_i = src._s_i self._e_i = src._e_i + self._break = src._break def print_state(self, words): words = list(words) + ['_'] From 43ef5ddea51c41e8595ea42736d04946376a2876 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 04:14:03 +0200 Subject: [PATCH 63/75] * Ensure root albel is spelled ROOT, for backwards compatibility --- spacy/gold.pyx | 3 +++ spacy/syntax/arc_eager.pyx | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 489b8b124..9a2e51d84 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -148,6 +148,9 @@ def read_json_file(loc, docs_filter=None): tags.append(token['tag']) heads.append(token['head'] + i) labels.append(token['dep']) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == 'root': + labels[-1] = 'ROOT' ner.append(token.get('ner', '-')) sents.append(( (ids, words, tags, heads, labels, ner), diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 4d89ad386..663ffd2cb 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -284,12 +284,14 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil: cdef class ArcEager(TransitionSystem): @classmethod def get_labels(cls, gold_parses): - move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'root': True}, - LEFT: {'root': True}, BREAK: {'root': True}} + move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True}, + LEFT: {'ROOT': True}, BREAK: {'ROOT': True}} for raw_text, sents in gold_parses: for (ids, words, tags, heads, labels, iob), ctnts in sents: for child, head, label in zip(ids, heads, labels): - if label != 'root': + if label.upper() == 'ROOT': + label = 'ROOT' + if label != 'ROOT': if head < child: move_labels[RIGHT][label] = True elif head > child: @@ -302,8 +304,11 @@ cdef class ArcEager(TransitionSystem): gold.c.heads[i] = i gold.c.labels[i] = -1 else: + label = gold.labels[i] + if label.upper() == 'ROOT': + label = 'ROOT' gold.c.heads[i] = gold.heads[i] - gold.c.labels[i] = self.strings[gold.labels[i]] + gold.c.labels[i] = self.strings[label] for end, brackets in gold.brackets.items(): for start, label_strs in brackets.items(): gold.c.brackets[start][end] = 1 From ee3e56f27b157af4a5281a4ea0d91ff92aa847e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 04:35:08 +0200 Subject: [PATCH 64/75] * Fix bounds checking on entities --- spacy/syntax/stateclass.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index f143ca087..cbcebac11 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -35,9 +35,9 @@ cdef class StateClass: cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: - return -1 - if i <= 0 or i >= self.length: - return -1 + return 0 + if i < 0 or i >= self.length: + return 0 return self._ents[self._e_i-1].start cdef int L(self, int i, int idx) nogil: @@ -203,11 +203,10 @@ cdef class StateClass: self._sent[head].l_kids -= 1 cdef void open_ent(self, int label) nogil: - if 0 <= self._e_i < self.length: - self._ents[self._e_i].start = self.B(0) - self._ents[self._e_i].label = label - self._ents[self._e_i].end = -1 - self._e_i += 1 + self._ents[self._e_i].start = self.B(0) + self._ents[self._e_i].label = label + self._ents[self._e_i].end = -1 + self._e_i += 1 cdef void close_ent(self) nogil: self._ents[self._e_i-1].end = self.B(0)+1 From 59e9f9153cd635e94836f188e2b6388b2bced093 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 05:04:46 +0200 Subject: [PATCH 65/75] * Remove projectivity constraint in train.py, but raise Exception if non-projective sentence is encountered, since we've told GoldParse to projectivize --- bin/parser/train.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index e3de03dcf..0c9565f62 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -130,14 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', raw_text = add_noise(raw_text, corruption_level) tokens = nlp.tokenizer(raw_text) nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=False) + gold = GoldParse(tokens, annot_tuples, make_projective=True) if not gold.is_projective: raise Exception( "Non-projective sentence in training, after we should " "have enforced projectivity: %s" % annot_tuples ) - loss += nlp.parser.train(tokens, gold) - + loss += nlp.parser.train(tokens, gold) nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) random.shuffle(gold_tuples) From 34c0ef2ee8575b4a62d92b195982c270ec6b51a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 05:38:17 +0200 Subject: [PATCH 66/75] * Don't compile the orig_arc_eager and tree_arc_eager modules used for the EMNLP paper --- bin/parser/train.py | 2 -- setup.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 0c9565f62..79f665d01 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -17,8 +17,6 @@ import spacy.util from spacy.en import English from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir -from spacy.syntax.orig_arc_eager import OrigArcEager -from spacy.syntax.tree_arc_eager import TreeArcEager from spacy.syntax.util import Config from spacy.gold import read_json_file from spacy.gold import GoldParse diff --git a/setup.py b/setup.py index 4ee33e255..76615e141 100644 --- a/setup.py +++ b/setup.py @@ -155,8 +155,6 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', - 'spacy.syntax.orig_arc_eager', - 'spacy.syntax.tree_arc_eager', 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.orth', 'spacy.syntax.ner'] From a7bf7b062640250e37083a2d040094c580ea27af Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 05:39:23 +0200 Subject: [PATCH 67/75] * Rename sent_start to sent_end, to reflect its new usage in the Break transition --- spacy/structs.pxd | 2 +- spacy/syntax/arc_eager.pyx | 10 +++++----- spacy/syntax/stateclass.pyx | 2 +- spacy/tokens.pyx | 13 +++++-------- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 4f46ff1a2..a26c87e2f 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -68,7 +68,7 @@ cdef struct TokenC: int sense int head int dep - bint sent_end + bint sent_start uint32_t l_kids uint32_t r_kids diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 663ffd2cb..6808e8689 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -114,7 +114,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: cdef class Shift: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_end + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -163,7 +163,7 @@ cdef class Reduce: cdef class LeftArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return not st.B_(0).sent_end + return not st.B_(0).sent_start @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -196,7 +196,7 @@ cdef class LeftArc: cdef class RightArc: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return not st.B_(0).sent_end + return not st.B_(0).sent_start @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -367,9 +367,9 @@ cdef class ArcEager(TransitionSystem): return t cdef int initialize_state(self, StateClass st) except -1: - # Ensure sent_end is set to 0 throughout + # Ensure sent_start is set to 0 throughout for i in range(st.length): - st._sent[i].sent_end = False + st._sent[i].sent_start = False st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index cbcebac11..da37ae7ae 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -219,7 +219,7 @@ cdef class StateClass: cdef void set_break(self, int _) nogil: if 0 <= self.B(0) < self.length: - self._sent[self.B(0)].sent_end = True + self._sent[self.B(0)].sent_start = True self._break = self._b_i cdef void clone(self, StateClass src) nogil: diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index afda93b79..55389cdde 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -186,15 +186,12 @@ cdef class Tokens: """ cdef int i cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:]) - start = None - for i in range(self.length): - if start is None: + start = 0 + for i in range(1, self.length): + if self.data[i].sent_start: + yield Span(self, start, i) start = i - if self.data[i].sent_end: - yield Span(self, start, i+1) - start = None - if start is not None: - yield Span(self, start, self.length) + yield Span(self, start, self.length) cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: From 221e2e485f9a7e610a52570cc5c8c323ae8078bc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 15:08:36 +0200 Subject: [PATCH 68/75] * Assign 'ROOT' as label, not 'root' --- spacy/syntax/arc_eager.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 6808e8689..f5b2921ee 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -373,7 +373,7 @@ cdef class ArcEager(TransitionSystem): st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: - cdef int root_label = self.strings['root'] + cdef int root_label = self.strings['ROOT'] for i in range(st.length): if st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = root_label From 35c290bee4853089dae19d70002e7ae83cd245f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 15:50:56 +0200 Subject: [PATCH 69/75] * Fix edge features --- spacy/syntax/stateclass.pyx | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index da37ae7ae..c206ae992 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -190,17 +190,27 @@ cdef class StateClass: cdef int dist = head - child self._sent[child].head = dist self._sent[child].dep = label + cdef int i if child > head: self._sent[head].r_kids += 1 + i = 0 + while self.has_head(head) and i < self.length: + self._sent[head].r_edge = child + head = self.H(head) + i += 1 # Guard against infinite loops else: self._sent[head].l_kids += 1 + self._sent[head].l_edge = self._sent[child].l_edge - cdef void del_arc(self, int head, int child) nogil: - cdef int dist = head - child - if child > head: - self._sent[head].r_kids -= 1 + cdef void del_arc(self, int h_i, int c_i) nogil: + cdef int dist = h_i - c_i + cdef TokenC* h = &self._sent[h_i] + if c_i > h_i: + h.r_kids -= 1 + h.r_edge = self.R_(h_i, h.r_kids-1).r_edge if h.r_kids >= 1 else h_i else: - self._sent[head].l_kids -= 1 + h.l_kids -= 1 + h.l_edge = self.L_(h_i, h.l_kids-1).l_edge if h.l_kids >= 1 else h_i cdef void open_ent(self, int label) nogil: self._ents[self._e_i].start = self.B(0) From 8d4bbacfc57382f319d7616270762ba4285cb69f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 16:07:34 +0200 Subject: [PATCH 70/75] * Fix edge navigation in Token objects --- spacy/tokens.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 55389cdde..7efdc6913 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -548,13 +548,13 @@ cdef class Token: property left_edge: def __get__(self): return Token.cinit(self.vocab, self._string, - self.c + self.c.l_edge, self.i + self.c.l_edge, + (self.c - self.i) + self.c.l_edge, self.c.l_edge, self.array_len, self._seq) property right_edge: def __get__(self): return Token.cinit(self.vocab, self._string, - self.c + self.c.r_edge, self.i + self.c.r_edge, + (self.c - self.i) + self.c.r_edge, self.c.r_edge, self.array_len, self._seq) property head: From 7b125f5a869ccf75bd2c56bc56e7ba0274e8cbdf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 16:31:01 +0200 Subject: [PATCH 71/75] * Fixes to edge features --- spacy/syntax/arc_eager.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f5b2921ee..29e62cb4e 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -370,6 +370,8 @@ cdef class ArcEager(TransitionSystem): # Ensure sent_start is set to 0 throughout for i in range(st.length): st._sent[i].sent_start = False + st._sent[i].l_edge = i + st._sent[i].r_edge = i st.fast_forward() cdef int finalize_state(self, StateClass st) except -1: From 7ebfe4b983ce4e4f032d9006adea4e04b5878426 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 16:32:54 +0200 Subject: [PATCH 72/75] * Fixes to edge features --- spacy/syntax/stateclass.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index c206ae992..b2c789d02 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -193,6 +193,7 @@ cdef class StateClass: cdef int i if child > head: self._sent[head].r_kids += 1 + self._sent[head].r_edge = child i = 0 while self.has_head(head) and i < self.length: self._sent[head].r_edge = child From 9ab9dd2bf7ef3a7383bf1c57d94463fb8f0628c0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 17:17:33 +0200 Subject: [PATCH 73/75] * Clean up unused orig_arc_eager and tree_arc_eager modules, which were only added for EMNLP experiments --- spacy/syntax/orig_arc_eager.pxd | 17 -- spacy/syntax/orig_arc_eager.pyx | 357 -------------------------- spacy/syntax/tree_arc_eager.pxd | 17 -- spacy/syntax/tree_arc_eager.pyx | 438 -------------------------------- 4 files changed, 829 deletions(-) delete mode 100644 spacy/syntax/orig_arc_eager.pxd delete mode 100644 spacy/syntax/orig_arc_eager.pyx delete mode 100644 spacy/syntax/tree_arc_eager.pxd delete mode 100644 spacy/syntax/tree_arc_eager.pyx diff --git a/spacy/syntax/orig_arc_eager.pxd b/spacy/syntax/orig_arc_eager.pxd deleted file mode 100644 index 82ec85f34..000000000 --- a/spacy/syntax/orig_arc_eager.pxd +++ /dev/null @@ -1,17 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport weight_t - -from .stateclass cimport StateClass - -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC - - -cdef class OrigArcEager(TransitionSystem): - pass - - -cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil -cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil - diff --git a/spacy/syntax/orig_arc_eager.pyx b/spacy/syntax/orig_arc_eager.pyx deleted file mode 100644 index e0d73eeab..000000000 --- a/spacy/syntax/orig_arc_eager.pyx +++ /dev/null @@ -1,357 +0,0 @@ -# cython: profile=True -from __future__ import unicode_literals - -import ctypes -import os - -from ..structs cimport TokenC - -from .transition_system cimport do_func_t, get_cost_func_t -from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold cimport GoldParse -from ..gold cimport GoldParseC - -from libc.stdint cimport uint32_t -from libc.string cimport memcpy - -from cymem.cymem cimport Pool -from .stateclass cimport StateClass - - -cdef weight_t MIN_SCORE = -90000 - -cdef enum: - SHIFT - REDUCE - LEFT - RIGHT - - N_MOVES - - -MOVE_NAMES = [None] * N_MOVES -MOVE_NAMES[SHIFT] = 'S' -MOVE_NAMES[REDUCE] = 'D' -MOVE_NAMES[LEFT] = 'L' -MOVE_NAMES[RIGHT] = 'R' - - -# Helper functions for the arc-eager oracle - -cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: - cdef int cost = 0 - cdef int i, S_i - for i in range(stcls.stack_depth()): - S_i = stcls.S(i) - if gold.heads[target] == S_i: - cost += 1 - if gold.heads[S_i] == target and not stcls.has_head(S_i): - cost += 1 - return cost - - -cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: - if stcls.buffer_length() == 0: - return 0 - cdef int cost = 0 - cdef int i, B_i - for i in range(stcls.buffer_length()): - B_i = stcls.B(i) - cost += gold.heads[B_i] == target - if not stcls.has_head(target): - cost += gold.heads[target] == B_i - if gold.heads[B_i] == B_i or gold.heads[B_i] < target: - break - return cost - - -cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: - if arc_is_gold(gold, head, child): - return 0 - elif stcls.H(child) == gold.heads[child]: - return 1 - # Head in buffer - elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: - return 1 - else: - return 0 - - -cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: - if gold.labels[child] == -1: - return True - elif gold.heads[child] == head: - return True - else: - return False - - -cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: - if gold.labels[child] == -1: - return True - elif label == -1: - return True - elif gold.labels[child] == label: - return True - else: - return False - - -cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: - return gold.labels[word] == -1 or gold.heads[word] == word - - -cdef class Shift: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.buffer_length() >= 1 - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.push() - - @staticmethod - cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: - return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - return push_cost(s, gold, s.B(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return 0 - - -cdef class Reduce: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 1 and st.has_head(st.S(0)) - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.pop() - - @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: - return pop_cost(st, gold, st.S(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return 0 - - -cdef class LeftArc: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 1 and not st.has_head(st.S(0)) - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - if not st.buffer_length(): - st.add_arc(st.S(0), st.S(0), label) - else: - st.add_arc(st.B(0), st.S(0), label) - st.pop() - - @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - if not s.buffer_length(): - return 0 - elif arc_is_gold(gold, s.B(0), s.S(0)): - return 0 - else: - return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - if not s.buffer_length(): - return 0 - return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) - - -cdef class RightArc: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 1 and st.buffer_length() >= 1 - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.add_arc(st.S(0), st.B(0), label) - st.push() - - @staticmethod - cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - if arc_is_gold(gold, s.S(0), s.B(0)): - return 0 - elif s.shifted[s.B(0)]: - return push_cost(s, gold, s.B(0)) - else: - return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) - - @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) - - -cdef class OrigArcEager(TransitionSystem): - @classmethod - def get_labels(cls, gold_parses): - move_labels = {SHIFT: {'': True}, RIGHT: {'': True}, - REDUCE: {'': True}, LEFT: {'root': True}} - for raw_text, sents in gold_parses: - for (ids, words, tags, heads, labels, iob), ctnts in sents: - for child, head, label in zip(ids, heads, labels): - if label != 'root': - if head < child: - move_labels[RIGHT][label] = True - elif head > child: - move_labels[LEFT][label] = True - return move_labels - - cdef int preprocess_gold(self, GoldParse gold) except -1: - for i in range(gold.length): - if gold.heads[i] is None: # Missing values - gold.c.heads[i] = i - gold.c.labels[i] = -1 - else: - gold.c.heads[i] = gold.heads[i] - gold.c.labels[i] = self.strings[gold.labels[i]] - for end, brackets in gold.brackets.items(): - for start, label_strs in brackets.items(): - gold.c.brackets[start][end] = 1 - for label_str in label_strs: - # Add the encoded label to the set - gold.brackets[end][start].add(self.strings[label_str]) - - cdef Transition lookup_transition(self, object name) except *: - if '-' in name: - move_str, label_str = name.split('-', 1) - label = self.label_ids[label_str] - else: - label = 0 - move = MOVE_NAMES.index(move_str) - for i in range(self.n_moves): - if self.c[i].move == move and self.c[i].label == label: - return self.c[i] - - def move_name(self, int move, int label): - label_str = self.strings[label] - if label_str: - return MOVE_NAMES[move] + '-' + label_str - else: - return MOVE_NAMES[move] - - cdef Transition init_transition(self, int clas, int move, int label) except *: - # TODO: Apparent Cython bug here when we try to use the Transition() - # constructor with the function pointers - cdef Transition t - t.score = 0 - t.clas = clas - t.move = move - t.label = label - if move == SHIFT: - t.is_valid = Shift.is_valid - t.do = Shift.transition - t.get_cost = Shift.cost - elif move == REDUCE: - t.is_valid = Reduce.is_valid - t.do = Reduce.transition - t.get_cost = Reduce.cost - elif move == LEFT: - t.is_valid = LeftArc.is_valid - t.do = LeftArc.transition - t.get_cost = LeftArc.cost - elif move == RIGHT: - t.is_valid = RightArc.is_valid - t.do = RightArc.transition - t.get_cost = RightArc.cost - else: - raise Exception(move) - return t - - cdef int initialize_state(self, StateClass st) except -1: - # Ensure sent_end is set to 0 throughout - for i in range(st.length): - st._sent[i].sent_end = False - st.push() - - cdef int finalize_state(self, StateClass st) except -1: - cdef int root_label = self.strings['root'] - for i in range(st.length): - if st._sent[i].head == 0: - st._sent[i].dep = root_label - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - cdef int i - n_valid = 0 - for i in range(self.n_moves): - output[i] = is_valid[self.c[i].move] - n_valid += output[i] - assert n_valid >= 1 - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: - cdef int i, move, label - cdef label_cost_func_t[N_MOVES] label_cost_funcs - cdef move_cost_func_t[N_MOVES] move_cost_funcs - cdef int[N_MOVES] move_costs - for i in range(N_MOVES): - move_costs[i] = -1 - move_cost_funcs[SHIFT] = Shift.move_cost - move_cost_funcs[REDUCE] = Reduce.move_cost - move_cost_funcs[LEFT] = LeftArc.move_cost - move_cost_funcs[RIGHT] = RightArc.move_cost - - label_cost_funcs[SHIFT] = Shift.label_cost - label_cost_funcs[REDUCE] = Reduce.label_cost - label_cost_funcs[LEFT] = LeftArc.label_cost - label_cost_funcs[RIGHT] = RightArc.label_cost - - cdef int* labels = gold.c.labels - cdef int* heads = gold.c.heads - - n_gold = 0 - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == -1: - move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 - else: - output[i] = 9000 - assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE, (self.n_moves, stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length, stcls.has_head(stcls.S(0)), LeftArc.is_valid(stcls, -1)) - return best diff --git a/spacy/syntax/tree_arc_eager.pxd b/spacy/syntax/tree_arc_eager.pxd deleted file mode 100644 index fab2c15fc..000000000 --- a/spacy/syntax/tree_arc_eager.pxd +++ /dev/null @@ -1,17 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport weight_t - -from .stateclass cimport StateClass - -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC - - -cdef class TreeArcEager(TransitionSystem): - pass - - -cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil -cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil - diff --git a/spacy/syntax/tree_arc_eager.pyx b/spacy/syntax/tree_arc_eager.pyx deleted file mode 100644 index 38d437087..000000000 --- a/spacy/syntax/tree_arc_eager.pyx +++ /dev/null @@ -1,438 +0,0 @@ -# cython: profile=True -from __future__ import unicode_literals - -import ctypes -import os - -from ..structs cimport TokenC - -from .transition_system cimport do_func_t, get_cost_func_t -from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold cimport GoldParse -from ..gold cimport GoldParseC - -from libc.stdint cimport uint32_t -from libc.string cimport memcpy - -from cymem.cymem cimport Pool -from .stateclass cimport StateClass - - -DEF NON_MONOTONIC = False -DEF USE_BREAK = False -DEF USE_ROOT_ARC_SEGMENT = False - -cdef weight_t MIN_SCORE = -90000 - -# Break transition from here -# http://www.aclweb.org/anthology/P13-1074 -cdef enum: - SHIFT - REDUCE - LEFT - RIGHT - - BREAK - - N_MOVES - - -MOVE_NAMES = [None] * N_MOVES -MOVE_NAMES[SHIFT] = 'S' -MOVE_NAMES[REDUCE] = 'D' -MOVE_NAMES[LEFT] = 'L' -MOVE_NAMES[RIGHT] = 'R' -MOVE_NAMES[BREAK] = 'B' - - -# Helper functions for the arc-eager oracle - -cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: - cdef int cost = 0 - cdef int i, S_i - for i in range(stcls.stack_depth()): - S_i = stcls.S(i) - if gold.heads[target] == S_i: - cost += 1 - if gold.heads[S_i] == target and not stcls.has_head(S_i): - cost += 1 - cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 - return cost - - -cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: - cdef int cost = 0 - cdef int i, B_i - for i in range(stcls.buffer_length()): - B_i = stcls.B(i) - cost += gold.heads[B_i] == target - if not stcls.has_head(target): - cost += gold.heads[target] == B_i - if gold.heads[B_i] == B_i or gold.heads[B_i] < target: - break - cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 - return cost - - -cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: - if arc_is_gold(gold, head, child): - return 0 - elif stcls.H(child) == gold.heads[child]: - return 1 - # Head in buffer - elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: - return 1 - else: - return 0 - - -cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: - if gold.labels[child] == -1: - return True - elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child): - return True - elif gold.heads[child] == head: - return True - else: - return False - - -cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: - if gold.labels[child] == -1: - return True - elif label == -1: - return True - elif gold.labels[child] == label: - return True - else: - return False - - -cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: - return gold.labels[word] == -1 or gold.heads[word] == word - - -cdef class Shift: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_end - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.push() - st.fast_forward() - - @staticmethod - cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: - return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - return push_cost(s, gold, s.B(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return 0 - - -cdef class Reduce: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return st.stack_depth() >= 2 and st.has_head(st.S(0)) - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.pop() - st.fast_forward() - - @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: - return pop_cost(st, gold, st.S(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return 0 - - -cdef class LeftArc: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return not st.B_(0).sent_end and not st.has_head(st.S(0)) - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.add_arc(st.B(0), st.S(0), label) - st.pop() - st.fast_forward() - - @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - cdef int cost = 0 - if arc_is_gold(gold, s.B(0), s.S(0)): - return 0 - else: - return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) - - -cdef class RightArc: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - return not st.B_(0).sent_end - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.add_arc(st.S(0), st.B(0), label) - st.push() - st.fast_forward() - - @staticmethod - cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - if arc_is_gold(gold, s.S(0), s.B(0)): - return 0 - elif s.shifted[s.B(0)]: - return push_cost(s, gold, s.B(0)) - else: - return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) - - @staticmethod - cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) - - -cdef class Break: - @staticmethod - cdef bint is_valid(StateClass st, int label) nogil: - cdef int i - if not USE_BREAK: - return False - elif st.at_break(): - return False - elif st.B(0) == 0: - return False - elif st.stack_depth() < 1: - return False - elif (st.S(0) + 1) != st.B(0): - # Must break at the token boundary - return False - else: - return True - - @staticmethod - cdef int transition(StateClass st, int label) nogil: - st.set_break(st.B(0)) - st.fast_forward() - - @staticmethod - cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: - return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) - - @staticmethod - cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: - cdef int cost = 0 - cdef int S_i, B_i - for i in range(s.stack_depth()): - S_i = s.S(i) - for j in range(s.buffer_length()): - B_i = s.B(j) - cost += gold.heads[S_i] == B_i - cost += gold.heads[B_i] == S_i - # Check for sentence boundary --- if it's here, we can't have any deps - # between stack and buffer, so rest of action is irrelevant. - s0_root = _get_root(s.S(0), gold) - b0_root = _get_root(s.B(0), gold) - if s0_root != b0_root or s0_root == -1 or b0_root == -1: - return cost - else: - return cost + 1 - - @staticmethod - cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: - return 0 - -cdef int _get_root(int word, const GoldParseC* gold) nogil: - while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0: - word = gold.heads[word] - if gold.labels[word] == -1: - return -1 - else: - return word - - -cdef class TreeArcEager(TransitionSystem): - @classmethod - def get_labels(cls, gold_parses): - move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'root': True}, - LEFT: {'root': True}, BREAK: {'root': True}} - for raw_text, sents in gold_parses: - for (ids, words, tags, heads, labels, iob), ctnts in sents: - for child, head, label in zip(ids, heads, labels): - if label != 'root': - if head < child: - move_labels[RIGHT][label] = True - elif head > child: - move_labels[LEFT][label] = True - return move_labels - - cdef int preprocess_gold(self, GoldParse gold) except -1: - for i in range(gold.length): - if gold.heads[i] is None: # Missing values - gold.c.heads[i] = i - gold.c.labels[i] = -1 - else: - gold.c.heads[i] = gold.heads[i] - gold.c.labels[i] = self.strings[gold.labels[i]] - for end, brackets in gold.brackets.items(): - for start, label_strs in brackets.items(): - gold.c.brackets[start][end] = 1 - for label_str in label_strs: - # Add the encoded label to the set - gold.brackets[end][start].add(self.strings[label_str]) - - cdef Transition lookup_transition(self, object name) except *: - if '-' in name: - move_str, label_str = name.split('-', 1) - label = self.label_ids[label_str] - else: - label = 0 - move = MOVE_NAMES.index(move_str) - for i in range(self.n_moves): - if self.c[i].move == move and self.c[i].label == label: - return self.c[i] - - def move_name(self, int move, int label): - label_str = self.strings[label] - if label_str: - return MOVE_NAMES[move] + '-' + label_str - else: - return MOVE_NAMES[move] - - cdef Transition init_transition(self, int clas, int move, int label) except *: - # TODO: Apparent Cython bug here when we try to use the Transition() - # constructor with the function pointers - cdef Transition t - t.score = 0 - t.clas = clas - t.move = move - t.label = label - if move == SHIFT: - t.is_valid = Shift.is_valid - t.do = Shift.transition - t.get_cost = Shift.cost - elif move == REDUCE: - t.is_valid = Reduce.is_valid - t.do = Reduce.transition - t.get_cost = Reduce.cost - elif move == LEFT: - t.is_valid = LeftArc.is_valid - t.do = LeftArc.transition - t.get_cost = LeftArc.cost - elif move == RIGHT: - t.is_valid = RightArc.is_valid - t.do = RightArc.transition - t.get_cost = RightArc.cost - elif move == BREAK: - t.is_valid = Break.is_valid - t.do = Break.transition - t.get_cost = Break.cost - else: - raise Exception(move) - return t - - cdef int initialize_state(self, StateClass st) except -1: - # Ensure sent_end is set to 0 throughout - for i in range(st.length): - st._sent[i].sent_end = False - st.fast_forward() - - cdef int finalize_state(self, StateClass st) except -1: - cdef int root_label = self.strings['root'] - for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: - st._sent[i].dep = root_label - # If we're not using the Break transition, we segment via root-labelled - # arcs between the root words. - elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label: - st._sent[i].head = 0 - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef int i - n_valid = 0 - for i in range(self.n_moves): - output[i] = is_valid[self.c[i].move] - n_valid += output[i] - assert n_valid >= 1 - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: - cdef int i, move, label - cdef label_cost_func_t[N_MOVES] label_cost_funcs - cdef move_cost_func_t[N_MOVES] move_cost_funcs - cdef int[N_MOVES] move_costs - for i in range(N_MOVES): - move_costs[i] = -1 - move_cost_funcs[SHIFT] = Shift.move_cost - move_cost_funcs[REDUCE] = Reduce.move_cost - move_cost_funcs[LEFT] = LeftArc.move_cost - move_cost_funcs[RIGHT] = RightArc.move_cost - move_cost_funcs[BREAK] = Break.move_cost - - label_cost_funcs[SHIFT] = Shift.label_cost - label_cost_funcs[REDUCE] = Reduce.label_cost - label_cost_funcs[LEFT] = LeftArc.label_cost - label_cost_funcs[RIGHT] = RightArc.label_cost - label_cost_funcs[BREAK] = Break.label_cost - - cdef int* labels = gold.c.labels - cdef int* heads = gold.c.heads - - n_gold = 0 - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == -1: - move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 - else: - output[i] = 9000 - assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best From 88f55d136b6e27ecd94c5ad64ad11186352fba9e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 17:19:51 +0200 Subject: [PATCH 74/75] * Remove deprecated _state module --- spacy/syntax/_state.pxd | 114 ------------------ spacy/syntax/_state.pyx | 254 ---------------------------------------- 2 files changed, 368 deletions(-) delete mode 100644 spacy/syntax/_state.pxd delete mode 100644 spacy/syntax/_state.pyx diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd deleted file mode 100644 index fc4a3e58d..000000000 --- a/spacy/syntax/_state.pxd +++ /dev/null @@ -1,114 +0,0 @@ -from libc.stdint cimport uint32_t - -from cymem.cymem cimport Pool - -from ..structs cimport TokenC, Entity, Constituent - - - -cdef struct State: - TokenC* sent - int* stack - Entity* ent - int i - int sent_len - int stack_len - int ents_len - - -cdef int add_dep(State *s, const int head, const int child, const int label) except -1 - - -cdef int pop_stack(State *s) except -1 -cdef int push_stack(State *s) except -1 - - -cdef bint has_head(const TokenC* t) nogil - - -cdef inline int get_idx(const State* s, const TokenC* t) nogil: - return t - s.sent - - -cdef inline TokenC* get_n0(const State* s) nogil: - return &s.sent[s.i] - - -cdef inline TokenC* get_n1(const State* s) nogil: - if (s.i+1) >= s.sent_len: - return NULL - else: - return &s.sent[s.i+1] - - -cdef inline TokenC* get_p1(const State* s) nogil: - if s.i < 1: - return NULL - else: - return &s.sent[s.i-1] - - -cdef inline TokenC* get_p2(const State* s) nogil: - if s.i < 2: - return NULL - else: - return &s.sent[s.i-2] - - -cdef inline TokenC* get_e0(const State* s) nogil: - if s.ent.end != 0: - return NULL - else: - return &s.sent[s.ent.start] - - -cdef inline TokenC* get_e1(const State* s) nogil: - if s.ent.end != 0 or s.ent.start >= (s.i + 1): - return NULL - else: - return &s.sent[s.ent.start + 1] - - -cdef inline TokenC* get_n2(const State* s) nogil: - if (s.i + 2) >= s.sent_len: - return NULL - else: - return &s.sent[s.i+2] - - -cdef inline TokenC* get_s0(const State *s) nogil: - return &s.sent[s.stack[0]] - - -cdef inline TokenC* get_s1(const State *s) nogil: - # Rely on our padding to ensure we don't go out of bounds here - return &s.sent[s.stack[-1]] - - -cdef inline TokenC* get_s2(const State *s) nogil: - # Rely on our padding to ensure we don't go out of bounds here - return &s.sent[s.stack[-2]] - -cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil - -cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil - -cdef inline bint at_eol(const State *s) nogil: - return s.i >= s.sent_len - - -cdef inline bint is_final(const State *s) nogil: - return at_eol(s) and s.stack_len < 2 - - -cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1 -cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1 -cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 -cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 - -cdef State* new_state(Pool mem, const TokenC* sent, const int sent_length) except NULL -cdef int copy_state(State* dest, const State* src) except -1 - -cdef int count_left_kids(const TokenC* head) nogil - -cdef int count_right_kids(const TokenC* head) nogil diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx deleted file mode 100644 index e499b6461..000000000 --- a/spacy/syntax/_state.pyx +++ /dev/null @@ -1,254 +0,0 @@ -# cython: profile=True -from libc.string cimport memmove, memcpy -from cymem.cymem cimport Pool - -from ..lexeme cimport EMPTY_LEXEME -from ..structs cimport TokenC, Entity, Constituent - - -DEF PADDING = 5 -DEF NON_MONOTONIC = True - - -cdef int add_dep(State *s, int head, int child, int label) except -1: - if has_head(&s.sent[child]): - del_dep(s, child + s.sent[child].head, child) - cdef int dist = head - child - s.sent[child].head = dist - s.sent[child].dep = label - # Keep a bit-vector tracking child dependencies. If a word has a child at - # offset i from it, set that bit (tracking left and right separately) - if child > head: - s.sent[head].r_kids |= 1 << (-dist) - s.sent[head].r_edge = child - head - # Walk up the tree, setting right edge - n_iter = 0 - start = head - while s.sent[head].head != 0: - head += s.sent[head].head - s.sent[head].r_edge = child - head - n_iter += 1 - if n_iter >= s.sent_len: - tree = [(i + s.sent[i].head) for i in range(s.sent_len)] - msg = "Error adding dependency (%d, %d). Could not find root of tree: %s" - msg = msg % (start, child, tree) - raise Exception(msg) - else: - s.sent[head].l_kids |= 1 << dist - s.sent[head].l_edge = (child + s.sent[child].l_edge) - head - - -cdef int del_dep(State *s, int head, int child) except -1: - cdef const TokenC* next_child - cdef int dist = head - child - if child > head: - s.sent[head].r_kids &= ~(1 << (-dist)) - next_child = get_right(s, &s.sent[head], 1) - if next_child == NULL: - s.sent[head].r_edge = 0 - else: - s.sent[head].r_edge = next_child.r_edge - else: - s.sent[head].l_kids &= ~(1 << dist) - next_child = get_left(s, &s.sent[head], 1) - if next_child == NULL: - s.sent[head].l_edge = 0 - else: - s.sent[head].l_edge = next_child.l_edge - - -cdef int pop_stack(State *s) except -1: - assert s.stack_len >= 1 - s.stack_len -= 1 - s.stack -= 1 - #if s.stack_len == 0 and not at_eol(s): - # push_stack(s) - - -cdef int push_stack(State *s) except -1: - assert s.i < s.sent_len - s.stack += 1 - s.stack[0] = s.i - s.stack_len += 1 - s.i += 1 - - -cdef int children_in_buffer(const State *s, int head, const int* gold) except -1: - # Golds holds an array of head offsets --- the head of word i is i - golds[i] - # Iterate over the tokens of the queue, and check whether their gold head is - # our target - cdef int i - cdef int n = 0 - for i in range(s.i, s.sent_len): - if gold[i] == head: - n += 1 - elif gold[i] == i or gold[i] < head: - break - return n - - -cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1: - return gold[child] >= s.i - - -cdef int children_in_stack(const State *s, const int head, const int* gold) except -1: - cdef int i - cdef int n = 0 - for i in range(s.stack_len): - if gold[s.stack[-i]] == head: - if NON_MONOTONIC or not has_head(get_s0(s)): - n += 1 - return n - - -cdef int head_in_stack(const State *s, const int child, const int* gold) except -1: - cdef int i - for i in range(s.stack_len): - if gold[child] == s.stack[-i]: - return 1 - return 0 - - -cdef bint has_head(const TokenC* t) nogil: - return t.head != 0 - - -cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil: - return _new_get_left(s, head, idx) - -""" - cdef uint32_t kids = head.l_kids - if kids == 0: - return NULL - cdef int offset = _nth_significant_bit(kids, idx) - cdef const TokenC* child = head - offset - if child >= s.sent: - return child - else: - return NULL -""" - -cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: - return _new_get_right(s, head, idx) - -""" - cdef uint32_t kids = head.r_kids - if kids == 0: - return NULL - cdef int offset = _nth_significant_bit(kids, idx) - cdef const TokenC* child = head + offset - if child < (s.sent + s.sent_len): - return child - else: - return NULL -""" - -cdef int count_left_kids(const TokenC* head) nogil: - return _popcount(head.l_kids) - - -cdef int count_right_kids(const TokenC* head) nogil: - return _popcount(head.r_kids) - - -cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL: - cdef int padded_len = sent_len + PADDING + PADDING - cdef State* s = mem.alloc(1, sizeof(State)) - #s.ctnt = mem.alloc(padded_len, sizeof(Constituent)) - s.ent = mem.alloc(padded_len, sizeof(Entity)) - s.stack = mem.alloc(padded_len, sizeof(int)) - for i in range(PADDING): - s.stack[i] = -1 - #s.ctnt += (PADDING -1) - s.stack += (PADDING - 1) - s.ent += (PADDING - 1) - assert s.stack[0] == -1 - state_sent = mem.alloc(padded_len, sizeof(TokenC)) - memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC)) - s.sent = state_sent + PADDING - s.stack_len = 0 - s.i = 0 - s.sent_len = sent_len - return s - - -cdef int copy_state(State* dest, const State* src) except -1: - cdef int i - # Copy stack --- remember stack uses pointer arithmetic, so stack[-stack_len] - # is the last word of the stack. - dest.stack += (src.stack_len - dest.stack_len) - for i in range(src.stack_len): - dest.stack[-i] = src.stack[-i] - dest.stack_len = src.stack_len - # Copy sentence (i.e. the parse), up to and including word i. - if src.i > dest.i: - memcpy(dest.sent, src.sent, sizeof(TokenC) * (src.i+1)) - else: - memcpy(dest.sent, src.sent, sizeof(TokenC) * (dest.i+1)) - dest.i = src.i - # Copy assigned entities --- also pointer arithmetic - dest.ent += (src.ents_len - dest.ents_len) - for i in range(src.ents_len): - dest.ent[-i] = src.ent[-i] - dest.ents_len = src.ents_len - - -# From https://en.wikipedia.org/wiki/Hamming_weight -cdef inline uint32_t _popcount(uint32_t x) nogil: - """Find number of non-zero bits.""" - cdef uint32_t count = 0 - while x != 0: - x &= x - 1 - count += 1 - return count - - -cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: - cdef uint32_t i - for i in range(32): - if bits & (1 << i): - n -= 1 - if n < 1: - return i - return 0 - - -cdef const TokenC* _new_get_left(const State* s, const TokenC* target, int idx) nogil: - if idx < 1: - return NULL - cdef const TokenC* ptr = s.sent - while ptr < target: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. - if (ptr.head >= 1) and (ptr + ptr.head) < target: - ptr += ptr.head - - elif ptr + ptr.head == target: - idx -= 1 - if idx == 0: - return ptr - ptr += 1 - else: - ptr += 1 - return NULL - - -cdef const TokenC* _new_get_right(const State* s, const TokenC* target, int idx) nogil: - if idx < 1: - return NULL - cdef const TokenC* ptr = s.sent + (s.sent_len - 1) - while ptr > target: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. - if (ptr.head < 0) and ((ptr + ptr.head) > target): - ptr += ptr.head - elif ptr + ptr.head == target: - idx -= 1 - if idx == 0: - return ptr - ptr -= 1 - else: - ptr -= 1 - return NULL From 579735a09526c07c239cf68d90f227d0503b7ea0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 17:25:08 +0200 Subject: [PATCH 75/75] * Remove import of _state module --- spacy/syntax/ner.pxd | 1 - spacy/syntax/ner.pyx | 2 -- spacy/syntax/parser.pxd | 2 -- 3 files changed, 5 deletions(-) diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd index 3687bbb27..0e3403230 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/syntax/ner.pxd @@ -1,6 +1,5 @@ from .transition_system cimport TransitionSystem from .transition_system cimport Transition -from ._state cimport State from ..gold cimport GoldParseC diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index c27bae1f2..4a47a20a8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,7 +1,5 @@ from __future__ import unicode_literals -from ._state cimport State - from .transition_system cimport Transition from .transition_system cimport do_func_t diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 1b4bf15fd..103ff9c02 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -5,8 +5,6 @@ from .._ml cimport Model from .arc_eager cimport TransitionSystem from ..tokens cimport Tokens, TokenC -from ._state cimport State - cdef class Parser: