diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 3c60cd87f..cfcadc3d0 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -61,13 +61,13 @@ cdef struct TokenC:
     attr_t sense
     int head
     attr_t dep
-    bint sent_start
 
     uint32_t l_kids
     uint32_t r_kids
     uint32_t l_edge
     uint32_t r_edge
+    int sent_start
 
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 1864b22b3..4675d887e 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -307,6 +307,8 @@ cdef cppclass StateC:
             this._stack[this._s_i] = this.B(0)
         this._s_i += 1
         this._b_i += 1
+        if this.B_(0).sent_start == 1:
+            this.set_break(this.B(0))
         if this._b_i > this._break:
             this._break = -1
 
@@ -383,7 +385,7 @@ cdef cppclass StateC:
 
     void set_break(int i) nogil:
         if 0 <= i < this.length:
-            this._sent[i].sent_start = True
+            this._sent[i].sent_start = 1
             this._break = this._b_i
 
     void clone(const StateC* src) nogil:
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index d1e1987d7..9770383d1 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -118,7 +118,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 cdef class Shift:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
+        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
 
     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
@@ -178,7 +178,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1
 
     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
@@ -212,7 +212,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1
 
     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
@@ -248,6 +248,10 @@ cdef class Break:
             return False
         elif st.stack_depth() < 1:
             return False
+        elif st.B_(0).l_edge < 0:
+            return False
+        elif st._sent[st.B_(0).l_edge].sent_start < 0:
+            return False
         else:
             return True
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 153f7a484..619431766 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -244,8 +244,8 @@ cdef class Parser:
         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
-        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
-        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
+        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
+        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
         if hist_size >= 1 and depth == 0:
             raise ValueError("Inconsistent hyper-params: "
                              "history_feats >= 1 but parser_hidden_depth==0")
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
new file mode 100644
index 000000000..77326f797
--- /dev/null
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -0,0 +1,73 @@
+'''Test that the parser respects preset sentence boundaries.'''
+from __future__ import unicode_literals
+import pytest
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps
+
+from ...attrs import NORM
+from ...gold import GoldParse
+from ...vocab import Vocab
+from ...tokens import Doc
+from ...pipeline import NeuralDependencyParser
+
+@pytest.fixture
+def vocab():
+    return Vocab(lex_attr_getters={NORM: lambda s: s})
+
+@pytest.fixture
+def parser(vocab):
+    parser = NeuralDependencyParser(vocab)
+    parser.cfg['token_vector_width'] = 4
+    parser.cfg['hidden_width'] = 32
+    #parser.add_label('right')
+    parser.add_label('left')
+    parser.begin_training([], **parser.cfg)
+    sgd = Adam(NumpyOps(), 0.001)
+
+    for i in range(10):
+        losses = {}
+        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
+        gold = GoldParse(doc, heads=[1, 1, 3, 3],
+                         deps=['left', 'ROOT', 'left', 'ROOT'])
+        parser.update([doc], [gold], sgd=sgd, losses=losses)
+    return parser
+
+def test_no_sentences(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 2
+
+
+def test_sents_1(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[2].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) >= 2
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = False
+    doc[2].sent_start = True
+    doc[3].sent_start = False
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 2
+
+
+def test_sents_1_2(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[2].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 3
+
+
+def test_sents_1_3(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[3].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 4
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[2].sent_start = False
+    doc[3].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 3
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index fcb5a16fa..df75ab3ec 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -485,7 +485,7 @@ cdef class Doc:
             cdef int i
             start = 0
            for i in range(1, self.length):
-                if self.c[i].sent_start:
+                if self.c[i].sent_start == 1:
                     yield Span(self, start, i)
                     start = i
             if start != self.length:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7e29cccf4..c6bb1a0bb 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -111,6 +111,30 @@ cdef class Span:
         for i in range(self.start, self.end):
             yield self.doc[i]
 
+    def as_doc(self):
+        '''Create a Doc object view of the Span's data.
+
+        This is mostly useful for C-typed interfaces.
+        '''
+        cdef Doc doc = Doc(self.doc.vocab)
+        doc.length = self.end-self.start
+        doc.c = &self.doc.c[self.start]
+        doc.mem = self.doc.mem
+        doc.is_parsed = self.doc.is_parsed
+        doc.is_tagged = self.doc.is_tagged
+        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
+        doc.user_hooks = self.doc.user_hooks
+        doc.user_span_hooks = self.doc.user_span_hooks
+        doc.user_token_hooks = self.doc.user_token_hooks
+        doc.vector = self.vector
+        doc.vector_norm = self.vector_norm
+        for key, value in self.doc.cats.items():
+            if hasattr(key, '__len__') and len(key) == 3:
+                cat_start, cat_end, cat_label = key
+                if cat_start == self.start_char and cat_end == self.end_char:
+                    doc.cats[cat_label] = value
+        return doc
+
     def merge(self, *args, **attributes):
         """Retokenize the document, such that the span is merged into a single
         token.
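A minimal usage sketch of the new Span.as_doc() view added above (not part of the patch; the vocab and words are illustrative, and it assumes a build with this change applied):

```python
# Sketch only: as_doc() returns a Doc that points into the parent's
# TokenC buffer (doc.c = &parent.c[span.start]), so it is a view of the
# span's tokens rather than a copy.
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['a', 'b', 'c', 'd'])
span = doc[1:3]
span_doc = span.as_doc()
assert len(span_doc) == len(span) == 2
```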
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 7b11d6efa..78ba920dd 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -281,13 +281,21 @@ cdef class Token:
         def __get__(self):
             return self.c.sent_start
 
-        def __set__(self, bint value):
+        def __set__(self, value):
             if self.doc.is_parsed:
                 raise ValueError(
                     'Refusing to write to token.sent_start if its document is parsed, '
                     'because this may cause inconsistent state. '
                     'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
-            self.c.sent_start = value
+            if value is None:
+                self.c.sent_start = 0
+            elif value is True:
+                self.c.sent_start = 1
+            elif value is False:
+                self.c.sent_start = -1
+            else:
+                raise ValueError("Invalid value for token.sent_start -- must be one of "
+                                 "None, True, False")
 
     property lefts:
         def __get__(self):
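For reference, a minimal sketch of the tri-state Token.sent_start semantics introduced above (not part of the patch; the vocab and words are illustrative):

```python
# Sketch only: the setter maps True -> 1 (force a boundary), False -> -1
# (forbid a boundary) and None -> 0 (let the parser decide); the getter
# returns the raw int. Writing is only allowed on an unparsed Doc.
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['a', 'b', 'c', 'd'])
doc[2].sent_start = True    # boundary required before 'c'
doc[1].sent_start = False   # boundary forbidden before 'b'
doc[3].sent_start = None    # undecided, parser may choose
assert [t.sent_start for t in doc[1:]] == [-1, 1, 0]
```

With these values, Shift, LeftArc and RightArc above stay invalid while the buffer head has sent_start == 1, and a preset -1 on the buffer head's left edge rules out the Break transition, which is how the parser ends up respecting preset boundaries.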