spaCy/spacy/pipeline/_parser_internals/stateclass.pyx
Matthew Honnibal 8656a08777
Add beam_parser and beam_ner components for v3 (#6369)
* Get basic beam tests working

* Get basic beam tests working

* Compile _beam_utils

* Remove prints

* Test beam density

* Beam parser seems to train

* Draft beam NER

* Upd beam

* Add hypothesis as dev dependency

* Implement missing is-gold-parse method

* Implement early update

* Fix state hashing

* Fix test

* Fix test

* Default to non-beam in parser constructor

* Improve oracle for beam

* Start refactoring beam

* Update test

* Refactor beam

* Update nn

* Refactor beam and weight by cost

* Update ner beam settings

* Update test

* Add __init__.pxd

* Upd test

* Fix test

* Upd test

* Fix test

* Remove ring buffer history from StateC

* WIP change arc-eager transitions

* Add state tests

* Support ternary sent start values

* Fix arc eager

* Fix NER

* Pass oracle cut size for beam

* Fix ner test

* Fix beam

* Improve StateC.clone

* Improve StateClass.borrow

* Work directly with StateC, not StateClass

* Remove print statements

* Fix state copy

* Improve state class

* Refactor parser oracles

* Fix arc eager oracle

* Fix arc eager oracle

* Use a vector to implement the stack

* Refactor state data structure

* Fix alignment of sent start

* Add get_aligned_sent_starts method

* Add test for ae oracle when bad sentence starts

* Fix sentence segment handling

* Avoid Reduce that inserts illegal sentence

* Update preset SBD test

* Fix test

* Remove prints

* Fix sent starts in Example

* Improve python API of StateClass

* Tweak comments and debug output of arc eager

* Upd test

* Fix state test

* Fix state test
2020-12-13 09:08:32 +08:00

179 lines
4.5 KiB
Cython

# cython: infer_types=True
import numpy
from libcpp.vector cimport vector
from ._state cimport ArcC
from ...tokens.doc cimport Doc
cdef class StateClass:
def __init__(self, Doc doc=None, int offset=0):
self._borrowed = 0
if doc is not None:
self.c = new StateC(doc.c, doc.length)
self.c.offset = offset
self.doc = doc
else:
self.doc = None
def __dealloc__(self):
if self._borrowed != 1:
del self.c
@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]
@property
def queue(self):
return [self.B(i) for i in range(self.c.buffer_length())]
@property
def token_vector_lenth(self):
return self.doc.tensor.shape[1]
@property
def arcs(self):
cdef vector[ArcC] arcs
self.c.get_arcs(&arcs)
return list(arcs)
#py_arcs = []
#for arc in arcs:
# if arc.head != -1 and arc.child != -1:
# py_arcs.append((arc.head, arc.child, arc.label))
#return arcs
def add_arc(self, int head, int child, int label):
self.c.add_arc(head, child, label)
def del_arc(self, int head, int child):
self.c.del_arc(head, child)
def H(self, int child):
return self.c.H(child)
def L(self, int head, int idx):
return self.c.L(head, idx)
def R(self, int head, int idx):
return self.c.R(head, idx)
@property
def _b_i(self):
return self.c._b_i
@property
def length(self):
return self.c.length
def is_final(self):
return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass(doc=self.doc, offset=self.c.offset)
new_state.c.clone(self.c)
return new_state
def print_state(self):
words = [token.text for token in self.doc]
words = list(words) + ['_']
bools = ["F", "T"]
sent_starts = [bools[self.c.is_sent_start(i)] for i in range(len(self.doc))]
shifted = [1 if self.c.is_unshiftable(i) else 0 for i in range(self.c.length)]
shifted.append("")
sent_starts.append("")
top = f"{self.S(0)}{words[self.S(0)]}_{words[self.H(self.S(0))]}_{shifted[self.S(0)]}"
second = f"{self.S(1)}{words[self.S(1)]}_{words[self.H(self.S(1))]}_{shifted[self.S(1)]}"
third = f"{self.S(2)}{words[self.S(2)]}_{words[self.H(self.S(2))]}_{shifted[self.S(2)]}"
n0 = f"{self.B(0)}{words[self.B(0)]}_{sent_starts[self.B(0)]}_{shifted[self.B(0)]}"
n1 = f"{self.B(1)}{words[self.B(1)]}_{sent_starts[self.B(1)]}_{shifted[self.B(1)]}"
return ' '.join((str(self.stack_depth()), str(self.buffer_length()), third, second, top, '|', n0, n1))
def S(self, int i):
return self.c.S(i)
def B(self, int i):
return self.c.B(i)
def H(self, int i):
return self.c.H(i)
def E(self, int i):
return self.c.E(i)
def L(self, int i, int idx):
return self.c.L(i, idx)
def R(self, int i, int idx):
return self.c.R(i, idx)
def S_(self, int i):
return self.doc[self.c.S(i)]
def B_(self, int i):
return self.doc[self.c.B(i)]
def H_(self, int i):
return self.doc[self.c.H(i)]
def E_(self, int i):
return self.doc[self.c.E(i)]
def L_(self, int i, int idx):
return self.doc[self.c.L(i, idx)]
def R_(self, int i, int idx):
return self.doc[self.c.R(i, idx)]
def empty(self):
return self.c.empty()
def eol(self):
return self.c.eol()
def at_break(self):
return False
#return self.c.at_break()
def has_head(self, int i):
return self.c.has_head(i)
def n_L(self, int i):
return self.c.n_L(i)
def n_R(self, int i):
return self.c.n_R(i)
def entity_is_open(self):
return self.c.entity_is_open()
def stack_depth(self):
return self.c.stack_depth()
def buffer_length(self):
return self.c.buffer_length()
def push(self):
self.c.push()
def pop(self):
self.c.pop()
def unshift(self):
self.c.unshift()
def add_arc(self, int head, int child, attr_t label):
self.c.add_arc(head, child, label)
def del_arc(self, int head, int child):
self.c.del_arc(head, child)
def open_ent(self, attr_t label):
self.c.open_ent(label)
def close_ent(self):
self.c.close_ent()
def clone(self, StateClass src):
self.c.clone(src.c)