mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Everything compiling after reorg. About to start testing.
This commit is contained in:
		
							parent
							
								
									e1c1a4b868
								
							
						
					
					
						commit
						b34a1325d3
					
				|  | @ -20,8 +20,7 @@ cdef int pop_stack(State *s) except -1 | |||
| cdef int push_stack(State *s) except -1 | ||||
| 
 | ||||
| 
 | ||||
| cdef inline bint has_head(const TokenC* t) nogil: | ||||
|     return t.head != 0 | ||||
| cdef bint has_head(const TokenC* t) nogil | ||||
| 
 | ||||
| 
 | ||||
| cdef inline int get_idx(const State* s, const TokenC* t) nogil: | ||||
|  | @ -71,14 +70,29 @@ cdef inline bint is_final(const State *s) nogil: | |||
|     return at_eol(s) # The stack will be attached to root anyway | ||||
| 
 | ||||
| 
 | ||||
| cdef int children_in_buffer(const State *s, const int head, int* gold) except -1 | ||||
| cdef int head_in_buffer(const State *s, const int child, int* gold) except -1 | ||||
| cdef int children_in_stack(const State *s, const int head, int* gold) except -1 | ||||
| cdef int head_in_stack(const State *s, const int child, int* gold) except -1 | ||||
| cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1 | ||||
| cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1 | ||||
| cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 | ||||
| cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 | ||||
| 
 | ||||
| cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL | ||||
| 
 | ||||
| 
 | ||||
| cdef int count_left_kids(const TokenC* head) nogil | ||||
| 
 | ||||
| 
 | ||||
| cdef int count_right_kids(const TokenC* head) nogil | ||||
| 
 | ||||
| 
 | ||||
| # From https://en.wikipedia.org/wiki/Hamming_weight | ||||
| cdef inline uint32_t _popcount(uint32_t x) nogil: | ||||
|     """Find number of non-zero bits.""" | ||||
|     cdef int count = 0 | ||||
|     while x != 0: | ||||
|         x &= x - 1 | ||||
|         count += 1 | ||||
|     return count | ||||
| 
 | ||||
| 
 | ||||
| cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: | ||||
|     cdef int i | ||||
|  |  | |||
|  | @ -3,24 +3,32 @@ from libc.string cimport memmove | |||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from ..lexeme cimport EMPTY_LEXEME | ||||
| from ..structs cimport TokenC | ||||
| 
 | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
| DEF NON_MONOTONIC = True | ||||
| 
 | ||||
| 
 | ||||
| cdef int add_dep(State *s, int head, int child, int label) except -1: | ||||
|     s.sent[child].head = head - child | ||||
|     cdef int dist = head - child | ||||
|     s.sent[child].head = dist | ||||
|     s.sent[child].dep_tag = label | ||||
|     # Keep a bit-vector tracking child dependencies.  If a word has a child at | ||||
|     # offset i from it, set that bit (tracking left and right separately) | ||||
|     if child > head: | ||||
|         s.sent[head].r_kids |= 1 << (-s.sent[child].head) | ||||
|         s.sent[head].r_kids |= 1 << (-dist) | ||||
|     else: | ||||
|         s.sent[head].l_kids |= 1 << s.sent[child].head | ||||
|         s.sent[head].l_kids |= 1 << dist | ||||
| 
 | ||||
| 
 | ||||
| cdef int pop_stack(State *s) except -1: | ||||
|     assert s.stack_len >= 1 | ||||
|     s.stack_len -= 1 | ||||
|     s.stack -= 1 | ||||
| 
 | ||||
|     if s.stack_len == 0 and not at_eol(s): | ||||
|         push_stack(s) | ||||
|          | ||||
| 
 | ||||
| cdef int push_stack(State *s) except -1: | ||||
|     assert s.i < s.sent_len | ||||
|  | @ -28,9 +36,14 @@ cdef int push_stack(State *s) except -1: | |||
|     s.stack[0] = s.i | ||||
|     s.stack_len += 1 | ||||
|     s.i += 1 | ||||
|     if at_eol(s): | ||||
|         while s.stack_len != 0: | ||||
|             if not has_head(get_s0(s)): | ||||
|                 get_s0(s).dep_tag = 0 | ||||
|             pop_stack(s) | ||||
| 
 | ||||
| 
 | ||||
| cdef int children_in_buffer(const State *s, int head, int* gold) except -1: | ||||
| cdef int children_in_buffer(const State *s, int head, const int* gold) except -1: | ||||
|     # Golds holds an array of head offsets --- the head of word i is i - golds[i] | ||||
|     # Iterate over the tokens of the queue, and check whether their gold head is | ||||
|     # our target | ||||
|  | @ -42,20 +55,21 @@ cdef int children_in_buffer(const State *s, int head, int* gold) except -1: | |||
|     return n | ||||
| 
 | ||||
| 
 | ||||
| cdef int head_in_buffer(const State *s, const int child, int* gold) except -1: | ||||
| cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1: | ||||
|     return gold[child] >= s.i | ||||
| 
 | ||||
| 
 | ||||
| cdef int children_in_stack(const State *s, const int head, int* gold) except -1: | ||||
| cdef int children_in_stack(const State *s, const int head, const int* gold) except -1: | ||||
|     cdef int i | ||||
|     cdef int n = 0 | ||||
|     for i in range(s.stack_len): | ||||
|         if gold[s.stack[-i]] == head: | ||||
|             n += 1 | ||||
|             if NON_MONOTONIC or not has_head(get_s0(s)): | ||||
|                 n += 1 | ||||
|     return n | ||||
| 
 | ||||
| 
 | ||||
| cdef int head_in_stack(const State *s, const int child, int* gold) except -1: | ||||
| cdef int head_in_stack(const State *s, const int child, const int* gold) except -1: | ||||
|     cdef int i | ||||
|     for i in range(s.stack_len): | ||||
|         if gold[child] == s.stack[-i]: | ||||
|  | @ -72,7 +86,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n | |||
|     if child >= s.sent: | ||||
|         return child | ||||
|     else: | ||||
|         return s.sent - 1 | ||||
|         return NULL | ||||
| 
 | ||||
| 
 | ||||
| cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: | ||||
|  | @ -84,10 +98,20 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) | |||
|     if child < (s.sent + s.sent_len): | ||||
|         return child | ||||
|     else: | ||||
|         return s.sent - 1 | ||||
|         return NULL | ||||
| 
 | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
| cdef bint has_head(const TokenC* t) nogil: | ||||
|     return t.head != 0 | ||||
| 
 | ||||
| 
 | ||||
| cdef int count_left_kids(const TokenC* head) nogil: | ||||
|     return _popcount(head.l_kids) | ||||
| 
 | ||||
| 
 | ||||
| cdef int count_right_kids(const TokenC* head) nogil: | ||||
|     return _popcount(head.r_kids) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL: | ||||
|  | @ -102,4 +126,5 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL | |||
|     s.stack_len = 0 | ||||
|     s.i = 0 | ||||
|     s.sent_len = sent_length | ||||
|     push_stack(s) | ||||
|     return s | ||||
|  |  | |||
|  | @ -7,8 +7,11 @@ from ._state cimport State | |||
| 
 | ||||
| 
 | ||||
| cdef struct Transition: | ||||
|     int clas | ||||
|     int move | ||||
|     int label | ||||
|     int cost | ||||
|     weight_t score | ||||
| 
 | ||||
| 
 | ||||
| cdef class TransitionSystem: | ||||
|  | @ -18,7 +21,8 @@ cdef class TransitionSystem: | |||
| 
 | ||||
|     cdef const Transition* _moves | ||||
| 
 | ||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1 | ||||
|     cdef Transition best_gold(self, const weight_t* scores, const State* s, | ||||
|                               int* gold_heads, int* gold_labels) except -1 | ||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* s) except * | ||||
|     cdef Transition best_gold(self, Transition* guess, const weight_t* scores, | ||||
|                               const State* s, | ||||
|                               const int* gold_heads, const int* gold_labels) except * | ||||
|     cdef int transition(self, State *s, const Transition* t) except -1 | ||||
|  |  | |||
|  | @ -5,7 +5,9 @@ from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep | |||
| from ._state cimport head_in_buffer, children_in_buffer | ||||
| from ._state cimport head_in_stack, children_in_stack | ||||
| 
 | ||||
| from ..tokens cimport TokenC | ||||
| from ..structs cimport TokenC | ||||
| 
 | ||||
| DEF NON_MONOTONIC = True | ||||
| 
 | ||||
| 
 | ||||
| cdef enum: | ||||
|  | @ -25,22 +27,30 @@ cdef inline bint _can_right(const State* s) nogil: | |||
| 
 | ||||
| 
 | ||||
| cdef inline bint _can_left(const State* s) nogil: | ||||
|     return s.stack_len >= 1 and not has_head(get_s0(s)) | ||||
|     if NON_MONOTONIC: | ||||
|         return s.stack_len >= 1 | ||||
|     else: | ||||
|         return s.stack_len >= 1 and not has_head(get_s0(s)) | ||||
| 
 | ||||
| 
 | ||||
| cdef inline bint _can_reduce(const State* s) nogil: | ||||
|     return s.stack_len >= 2 and has_head(get_s0(s)) | ||||
|     if NON_MONOTONIC: | ||||
|         return s.stack_len >= 2 | ||||
|     else: | ||||
|         return s.stack_len >= 2 and has_head(get_s0(s)) | ||||
| 
 | ||||
| 
 | ||||
| cdef int _shift_cost(const State* s, int* gold) except -1: | ||||
| cdef int _shift_cost(const State* s, const int* gold) except -1: | ||||
|     assert not at_eol(s) | ||||
|     cost = 0 | ||||
|     cost += head_in_stack(s, s.i, gold) | ||||
|     cost += children_in_stack(s, s.i, gold) | ||||
|     if NON_MONOTONIC: | ||||
|         cost += gold[s.stack[0]] == s.i | ||||
|     return cost | ||||
| 
 | ||||
| 
 | ||||
| cdef int _right_cost(const State* s, int* gold) except -1: | ||||
| cdef int _right_cost(const State* s, const int* gold) except -1: | ||||
|     assert s.stack_len >= 1 | ||||
|     cost = 0 | ||||
|     if gold[s.i] == s.stack[0]: | ||||
|  | @ -48,10 +58,12 @@ cdef int _right_cost(const State* s, int* gold) except -1: | |||
|     cost += head_in_buffer(s, s.i, gold) | ||||
|     cost += children_in_stack(s, s.i, gold) | ||||
|     cost += head_in_stack(s, s.i, gold) | ||||
|     if NON_MONOTONIC: | ||||
|         cost += gold[s.stack[0]] == s.i | ||||
|     return cost | ||||
| 
 | ||||
| 
 | ||||
| cdef int _left_cost(const State* s, int* gold) except -1: | ||||
| cdef int _left_cost(const State* s, const int* gold) except -1: | ||||
|     assert s.stack_len >= 1 | ||||
|     cost = 0 | ||||
|     if gold[s.stack[0]] == s.i: | ||||
|  | @ -59,11 +71,17 @@ cdef int _left_cost(const State* s, int* gold) except -1: | |||
| 
 | ||||
|     cost += head_in_buffer(s, s.stack[0], gold) | ||||
|     cost += children_in_buffer(s, s.stack[0], gold) | ||||
|     if NON_MONOTONIC and s.stack_len >= 2: | ||||
|         cost += gold[s.stack[0]] == s.stack[-1] | ||||
|     return cost | ||||
| 
 | ||||
| 
 | ||||
| cdef int _reduce_cost(const State* s, int* gold) except -1: | ||||
|     return children_in_buffer(s, s.stack[0], gold) | ||||
| cdef int _reduce_cost(const State* s, const int* gold) except -1: | ||||
|     cdef int cost = 0 | ||||
|     cost += children_in_buffer(s, s.stack[0], gold) | ||||
|     if NON_MONOTONIC: | ||||
|         cost += head_in_buffer(s, s.stack[0], gold) | ||||
|     return cost | ||||
| 
 | ||||
| 
 | ||||
| cdef class TransitionSystem: | ||||
|  | @ -73,38 +91,40 @@ cdef class TransitionSystem: | |||
|         right_labels.sort() | ||||
|         if 'ROOT' in right_labels: | ||||
|             right_labels.pop(right_labels.index('ROOT')) | ||||
|         if 'dep' in right_labels: | ||||
|             right_labels.pop(right_labels.index('dep')) | ||||
|         if 'ROOT' in left_labels: | ||||
|             left_labels.pop(left_labels.index('ROOT')) | ||||
|         if 'dep' in left_labels: | ||||
|             left_labels.pop(left_labels.index('dep')) | ||||
|         self.n_moves = 2 + len(left_labels) + len(right_labels)  | ||||
|         moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition)) | ||||
|         cdef int i = 0 | ||||
|         moves[i].move = SHIFT | ||||
|         moves[i].label = 0 | ||||
|         moves[i].clas = i | ||||
|         i += 1 | ||||
|         moves[i].move = REDUCE | ||||
|         moves[i].label = 0 | ||||
|         moves[i].clas = i | ||||
|         i += 1 | ||||
|         self.label_ids = {'ROOT': 0, 'dep': -1} | ||||
|         self.label_ids = {'ROOT': 0} | ||||
|         cdef int label_id | ||||
|         for label_str in left_labels: | ||||
|             label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) | ||||
|             moves[i].move = LEFT | ||||
|             moves[i].label = label_id | ||||
|             moves[i].clas = i | ||||
|             i += 1 | ||||
|         for label_str in right_labels: | ||||
|             label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) | ||||
|             moves[i].move = RIGHT | ||||
|             moves[i].label = label_id | ||||
|             moves[i].clas = i | ||||
|             i += 1 | ||||
|         self._moves = moves | ||||
| 
 | ||||
|     cdef int transition(self, State *s, const int clas) except -1: | ||||
|         cdef const Transition* t = &self._moves[clas] | ||||
|     cdef int transition(self, State *s, const Transition* t) except -1: | ||||
|         if t.move == SHIFT: | ||||
|             # Set the dep label, in case we need it after we reduce | ||||
|             if NON_MONOTONIC: | ||||
|                 get_s0(s).dep_tag = t.label | ||||
|             push_stack(s) | ||||
|         elif t.move == LEFT: | ||||
|             add_dep(s, s.i, s.stack[0], t.label) | ||||
|  | @ -113,11 +133,12 @@ cdef class TransitionSystem: | |||
|             add_dep(s, s.stack[0], s.i, t.label) | ||||
|             push_stack(s) | ||||
|         elif t.move == REDUCE: | ||||
|             add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag) | ||||
|             pop_stack(s) | ||||
|         else: | ||||
|             raise StandardError(t.move) | ||||
| 
 | ||||
|     cdef int best_valid(self, const weight_t* scores, const State* s) except -1: | ||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: | ||||
|         cdef bint[N_MOVES] valid | ||||
|         valid[SHIFT] = _can_shift(s) | ||||
|         valid[LEFT] = _can_left(s) | ||||
|  | @ -126,59 +147,61 @@ cdef class TransitionSystem: | |||
| 
 | ||||
|         cdef int best = -1 | ||||
|         cdef weight_t score = 0 | ||||
|         cdef weight_t best_r_score = -9000 | ||||
|         cdef int best_r_label = -1 | ||||
|         cdef int i | ||||
|         for i in range(self.n_moves): | ||||
|             if valid[self._moves[i].move] and (best == -1 or scores[i] > score): | ||||
|                 best = i | ||||
|                 score = scores[i] | ||||
|             if self._moves[i].move == RIGHT and scores[i] > best_r_score: | ||||
|                 best_r_label = self._moves[i].label | ||||
|         assert best >= 0 | ||||
|         return best | ||||
|         cdef Transition t = self._moves[best] | ||||
|         t.score = score | ||||
|         if t.move == SHIFT: | ||||
|             t.label = best_r_label | ||||
|         return t | ||||
| 
 | ||||
|     cdef int best_gold(self, const weight_t* scores, const State* s, | ||||
|                        int* gold_heads, int* gold_labels) except -1: | ||||
|     cdef Transition best_gold(self, Transition* guess, const weight_t* scores, | ||||
|                               const State* s, | ||||
|                               const int* gold_heads, const int* gold_labels) except *: | ||||
|         # If we can create a gold dependency, only one action can be correct | ||||
|         cdef int[N_MOVES] unl_costs | ||||
|         unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1 | ||||
|         unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1 | ||||
|         unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1 | ||||
|         unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1 | ||||
| 
 | ||||
|         cdef int cost | ||||
|         cdef int move | ||||
|         cdef int label | ||||
|         guess.cost = unl_costs[guess.move] | ||||
|         cdef Transition t | ||||
|         cdef int target_label | ||||
|         cdef int i | ||||
|         if gold_heads[s.stack[0]] == s.i: | ||||
|             target_label = gold_labels[s.stack[0]] | ||||
|             if guess.move == LEFT: | ||||
|                 guess.cost += guess.label != target_label | ||||
|             for i in range(self.n_moves): | ||||
|                 t = self._moves[i] | ||||
|                 if t.move == LEFT and t.label == target_label: | ||||
|                     return t | ||||
|         elif gold_heads[s.i] == s.stack[0]: | ||||
|             target_label = gold_labels[s.i] | ||||
|             if guess.move == RIGHT: | ||||
|                 guess.cost += guess.label != target_label | ||||
|             for i in range(self.n_moves): | ||||
|                 t = self._moves[i] | ||||
|                 if t.move == RIGHT and t.label == target_label: | ||||
|                     return t | ||||
| 
 | ||||
|         cdef int best = -1 | ||||
|         cdef weight_t score = -9000 | ||||
|         cdef int i | ||||
|         for i in range(self.n_moves): | ||||
|             move = self._moves[i].move | ||||
|             label = self._moves[i].label | ||||
|             if unl_costs[move] == 0:  | ||||
|                 if move == SHIFT or move == REDUCE: | ||||
|                     cost = 0 | ||||
|                 elif move == LEFT: | ||||
|                     if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1: | ||||
|                         cost = label != gold_labels[s.stack[0]] | ||||
|                     else: | ||||
|                         cost = 0 | ||||
|                 elif move == RIGHT: | ||||
|                     if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1: | ||||
|                         cost = label != gold_labels[s.i] | ||||
|                     else: | ||||
|                         cost = 0 | ||||
|                 else: | ||||
|                     raise StandardError("Unknown Move") | ||||
|                 if cost == 0 and (best == -1 or scores[i] > score): | ||||
|                     best = i | ||||
|                     score = scores[i] | ||||
|   | ||||
|         if best < 0: | ||||
|             print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT] | ||||
|             print s.stack_len | ||||
|             print has_head(get_s0(s)) | ||||
|             print s.sent[s.stack[0]].head | ||||
|             print s.stack[0], s.i | ||||
|             print gold_heads[s.stack[0]], gold_heads[s.i] | ||||
|             print gold_labels[s.i] | ||||
|             print children_in_buffer(s, s.stack[0], gold_heads) | ||||
|             print head_in_buffer(s, s.stack[0], gold_heads) | ||||
|             raise StandardError  | ||||
|         return best | ||||
|             t = self._moves[i] | ||||
|             if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score): | ||||
|                 best = i | ||||
|                 score = scores[i] | ||||
|         t = self._moves[best] | ||||
|         t.score = score | ||||
|         assert best >= 0 | ||||
|         return t | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user