mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-22 15:24:11 +03:00
8656a08777
* Get basic beam tests working * Get basic beam tests working * Compile _beam_utils * Remove prints * Test beam density * Beam parser seems to train * Draft beam NER * Upd beam * Add hypothesis as dev dependency * Implement missing is-gold-parse method * Implement early update * Fix state hashing * Fix test * Fix test * Default to non-beam in parser constructor * Improve oracle for beam * Start refactoring beam * Update test * Refactor beam * Update nn * Refactor beam and weight by cost * Update ner beam settings * Update test * Add __init__.pxd * Upd test * Fix test * Upd test * Fix test * Remove ring buffer history from StateC * WIP change arc-eager transitions * Add state tests * Support ternary sent start values * Fix arc eager * Fix NER * Pass oracle cut size for beam * Fix ner test * Fix beam * Improve StateC.clone * Improve StateClass.borrow * Work directly with StateC, not StateClass * Remove print statements * Fix state copy * Improve state class * Refactor parser oracles * Fix arc eager oracle * Fix arc eager oracle * Use a vector to implement the stack * Refactor state data structure * Fix alignment of sent start * Add get_aligned_sent_starts method * Add test for ae oracle when bad sentence starts * Fix sentence segment handling * Avoid Reduce that inserts illegal sentence * Update preset SBD test * Fix test * Remove prints * Fix sent starts in Example * Improve python API of StateClass * Tweak comments and debug output of arc eager * Upd test * Fix state test * Fix state test
179 lines
4.5 KiB
Cython
179 lines
4.5 KiB
Cython
# cython: infer_types=True
|
|
import numpy
|
|
from libcpp.vector cimport vector
|
|
from ._state cimport ArcC
|
|
|
|
from ...tokens.doc cimport Doc
|
|
|
|
|
|
cdef class StateClass:
    """Python-accessible wrapper around a C++ ``StateC`` parser state.

    Almost every method forwards to the member of the same name on the
    wrapped ``StateC`` pointer (``self.c``). The variants with a trailing
    underscore (``S_``, ``B_``, ...) use the forwarded result as an index
    into the attached ``Doc`` and return that element instead.

    NOTE(review): ``self.c`` is only allocated by ``__init__`` when a ``Doc``
    is passed. Without one the pointer is presumably assigned by other code
    (see the ``_borrowed`` flag); none of the forwarding methods check it.
    """

    def __init__(self, Doc doc=None, int offset=0):
        """Create a state, allocating the C state when ``doc`` is given.

        doc (Doc): Document whose token array and length initialise the
            ``StateC``. If None, no ``StateC`` is allocated here.
        offset (int): Value stored in the ``offset`` field of the new
            ``StateC``. Ignored when ``doc`` is None.
        """
        self._borrowed = 0
        if doc is not None:
            self.c = new StateC(doc.c, doc.length)
            self.c.offset = offset
            self.doc = doc
        else:
            self.doc = None

    def __dealloc__(self):
        """Free the C state unless it is marked as borrowed."""
        # A borrowed pointer (``_borrowed == 1``) is not deleted here.
        if self._borrowed != 1:
            del self.c

    @property
    def stack(self):
        """List of ``S(i)`` for every ``i`` below ``stack_depth()``."""
        return [self.S(i) for i in range(self.c.stack_depth())]

    @property
    def queue(self):
        """List of ``B(i)`` for every ``i`` below ``buffer_length()``."""
        return [self.B(i) for i in range(self.c.buffer_length())]

    @property
    def token_vector_lenth(self):
        """Second dimension of ``self.doc.tensor``.

        The misspelled name is part of the public interface and is kept as is.
        """
        return self.doc.tensor.shape[1]

    @property
    def arcs(self):
        """Everything ``StateC.get_arcs`` reports, as a Python list.

        The entries are returned unfiltered.
        """
        cdef vector[ArcC] arcs
        self.c.get_arcs(&arcs)
        return list(arcs)

    # NOTE(review): add_arc, del_arc, H, L and R are each defined twice in
    # this class (here and further down, with different parameter names and,
    # for add_arc, a different label type). Both sets are kept verbatim and in
    # their original order because which definition ends up bound is not
    # established here -- TODO confirm and drop the redundant set.

    def add_arc(self, int head, int child, int label):
        """Forward to ``StateC.add_arc(head, child, label)``."""
        self.c.add_arc(head, child, label)

    def del_arc(self, int head, int child):
        """Forward to ``StateC.del_arc(head, child)``."""
        self.c.del_arc(head, child)

    def H(self, int child):
        """Forward to ``StateC.H(child)`` and return its result."""
        return self.c.H(child)

    def L(self, int head, int idx):
        """Forward to ``StateC.L(head, idx)`` and return its result."""
        return self.c.L(head, idx)

    def R(self, int head, int idx):
        """Forward to ``StateC.R(head, idx)`` and return its result."""
        return self.c.R(head, idx)

    @property
    def _b_i(self):
        """Raw ``_b_i`` field of the C state."""
        return self.c._b_i

    @property
    def length(self):
        """Raw ``length`` field of the C state."""
        return self.c.length

    def is_final(self):
        """Forward to ``StateC.is_final()`` and return its result."""
        return self.c.is_final()

    def copy(self):
        """Return a new StateClass over the same ``Doc`` with cloned C state.

        RETURNS (StateClass): Independent state built from ``self.doc`` and
            the current ``offset``, then filled via ``StateC.clone``.
        RAISES (ValueError): If this state has no ``Doc``. The constructor
            only allocates a ``StateC`` when given a ``Doc``, so cloning into
            the new object would otherwise dereference an unallocated pointer.
        """
        cdef StateClass new_state
        if self.doc is None:
            raise ValueError("Cannot copy a StateClass that has no Doc attached.")
        new_state = StateClass(doc=self.doc, offset=self.c.offset)
        new_state.c.clone(self.c)
        return new_state

    def print_state(self):
        """Return a one-line debug summary of the state.

        Layout: stack depth, buffer length, the cells for ``S(2)``, ``S(1)``
        and ``S(0)``, a ``|`` separator, then the cells for ``B(0)`` and
        ``B(1)``, all joined by single spaces.
        """
        # Each lookup table gets one trailing placeholder entry so that an
        # index of -1 resolves to it.
        # NOTE(review): presumably the C accessors return -1 for a missing
        # position -- confirm against StateC.
        words = [token.text for token in self.doc] + ['_']
        bools = ["F", "T"]
        sent_starts = [bools[self.c.is_sent_start(i)] for i in range(len(self.doc))]
        shifted = [1 if self.c.is_unshiftable(i) else 0 for i in range(self.c.length)]
        shifted.append("")
        sent_starts.append("")

        # Stack cells: index, word, word of the item's H(), unshiftable flag.
        stack_cells = []
        for depth in (0, 1, 2):
            s = self.S(depth)
            stack_cells.append(f"{s}{words[s]}_{words[self.H(s)]}_{shifted[s]}")
        # Built top-first, printed third/second/top.
        stack_cells.reverse()

        # Buffer cells: index, word, sentence-start flag, unshiftable flag.
        buffer_cells = []
        for pos in (0, 1):
            b = self.B(pos)
            buffer_cells.append(f"{b}{words[b]}_{sent_starts[b]}_{shifted[b]}")

        parts = [str(self.stack_depth()), str(self.buffer_length())]
        parts = parts + stack_cells + ['|'] + buffer_cells
        return ' '.join(parts)

    def S(self, int i):
        """Forward to ``StateC.S(i)`` and return its result."""
        return self.c.S(i)

    def B(self, int i):
        """Forward to ``StateC.B(i)`` and return its result."""
        return self.c.B(i)

    # NOTE(review): second definition of H, L and R -- see the note above the
    # first add_arc.

    def H(self, int i):
        """Forward to ``StateC.H(i)`` and return its result."""
        return self.c.H(i)

    def E(self, int i):
        """Forward to ``StateC.E(i)`` and return its result."""
        return self.c.E(i)

    def L(self, int i, int idx):
        """Forward to ``StateC.L(i, idx)`` and return its result."""
        return self.c.L(i, idx)

    def R(self, int i, int idx):
        """Forward to ``StateC.R(i, idx)`` and return its result."""
        return self.c.R(i, idx)

    # The accessors below index ``self.doc`` with the raw C result.
    # NOTE(review): a negative result would be handled by the Doc's own
    # indexing rather than rejected here -- confirm callers guard against it.

    def S_(self, int i):
        """Return ``self.doc[StateC.S(i)]``."""
        return self.doc[self.c.S(i)]

    def B_(self, int i):
        """Return ``self.doc[StateC.B(i)]``."""
        return self.doc[self.c.B(i)]

    def H_(self, int i):
        """Return ``self.doc[StateC.H(i)]``."""
        return self.doc[self.c.H(i)]

    def E_(self, int i):
        """Return ``self.doc[StateC.E(i)]``."""
        return self.doc[self.c.E(i)]

    def L_(self, int i, int idx):
        """Return ``self.doc[StateC.L(i, idx)]``."""
        return self.doc[self.c.L(i, idx)]

    def R_(self, int i, int idx):
        """Return ``self.doc[StateC.R(i, idx)]``."""
        return self.doc[self.c.R(i, idx)]

    def empty(self):
        """Forward to ``StateC.empty()`` and return its result."""
        return self.c.empty()

    def eol(self):
        """Forward to ``StateC.eol()`` and return its result."""
        return self.c.eol()

    def at_break(self):
        """Always return False.

        This method does not consult the C state; the forwarding call to
        ``StateC.at_break()`` was disabled in the original code.
        """
        return False

    def has_head(self, int i):
        """Forward to ``StateC.has_head(i)`` and return its result."""
        return self.c.has_head(i)

    def n_L(self, int i):
        """Forward to ``StateC.n_L(i)`` and return its result."""
        return self.c.n_L(i)

    def n_R(self, int i):
        """Forward to ``StateC.n_R(i)`` and return its result."""
        return self.c.n_R(i)

    def entity_is_open(self):
        """Forward to ``StateC.entity_is_open()`` and return its result."""
        return self.c.entity_is_open()

    def stack_depth(self):
        """Forward to ``StateC.stack_depth()`` and return its result."""
        return self.c.stack_depth()

    def buffer_length(self):
        """Forward to ``StateC.buffer_length()`` and return its result."""
        return self.c.buffer_length()

    def push(self):
        """Forward to ``StateC.push()``."""
        self.c.push()

    def pop(self):
        """Forward to ``StateC.pop()``."""
        self.c.pop()

    def unshift(self):
        """Forward to ``StateC.unshift()``."""
        self.c.unshift()

    # NOTE(review): second definition of add_arc and del_arc -- see the note
    # above the first add_arc. This add_arc takes ``attr_t label`` rather
    # than ``int label``.

    def add_arc(self, int head, int child, attr_t label):
        """Forward to ``StateC.add_arc(head, child, label)``."""
        self.c.add_arc(head, child, label)

    def del_arc(self, int head, int child):
        """Forward to ``StateC.del_arc(head, child)``."""
        self.c.del_arc(head, child)

    def open_ent(self, attr_t label):
        """Forward to ``StateC.open_ent(label)``."""
        self.c.open_ent(label)

    def close_ent(self):
        """Forward to ``StateC.close_ent()``."""
        self.c.close_ent()

    def clone(self, StateClass src):
        """Overwrite this state's C state via ``StateC.clone(src.c)``.

        src (StateClass): State whose C state is passed to ``clone``.
        """
        self.c.clone(src.c)