diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 401be9bf6..d7a24dbd1 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -8,6 +8,10 @@ from ..symbols cimport punct from ..attrs cimport IS_SPACE +cdef inline bint is_space_token(const TokenC* token) nogil: + return Lexeme.c_check_flag(token.lex, IS_SPACE) + + cdef cppclass StateC: int* _stack int* _buffer @@ -292,23 +296,67 @@ cdef cppclass StateC: this._break = src._break void fast_forward() nogil: - while this.buffer_length() == 0 \ - or this.stack_depth() == 0 \ - or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE): - if this.buffer_length() == 1 and this.stack_depth() == 0: - this.push() - this.pop() - elif this.buffer_length() == 0 and this.stack_depth() == 1: - this.pop() - elif this.buffer_length() == 0 and this.stack_depth() >= 2: - if this.has_head(this.S(0)): + # space token attachement policy: + # - attach space tokens always to the last preceding real token + # - except if it's the beginning of a sentence, then attach to the first following + # - boundary case: a document containing multiple space tokens but nothing else, + # then make the last space token the head of all others + + while is_space_token(this.B_(0)) \ + or this.buffer_length() == 0 \ + or this.stack_depth() == 0: + if this.buffer_length() == 0: + # remove the last sentence's root from the stack + if this.stack_depth() == 1: this.pop() - else: - this.unshift() - elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0: - this.push() - elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE): - this.add_arc(this.B(0), this.S(0), 0) - this.pop() - else: + # parser got stuck: reduce stack or unshift + elif this.stack_depth() > 1: + if this.has_head(this.S(0)): + this.pop() + else: + this.unshift() + # stack is empty but there is another sentence on the buffer + elif (this.length - this._b_i) >= 1: + this.push() + else: # stack empty and nothing else coming + break + + elif is_space_token(this.B_(0)): + # the normal case: we're somewhere inside a sentence + if this.stack_depth() > 0: + # assert not is_space_token(this.S_(0)) + # attach all coming space tokens to their last preceding + # real token (which should be on the top of the stack) + while is_space_token(this.B_(0)): + this.add_arc(this.S(0),this.B(0),0) + this.push() + this.pop() + # the rare case: we're at the beginning of a document: + # space tokens are attached to the first real token on the buffer + elif this.stack_depth() == 0: + # store all space tokens on the stack until a real token shows up + # or the last token on the buffer is reached + while is_space_token(this.B_(0)) and this.buffer_length() > 1: + this.push() + # empty the stack by attaching all space tokens to the + # first token on the buffer + # boundary case: if all tokens are space tokens, the last one + # becomes the head of all others + while this.stack_depth() > 0: + this.add_arc(this.B(0),this.S(0),0) + this.pop() + # move the first token onto the stack + this.push() + + elif this.stack_depth() == 0: + # for one token sentences (?) + if this.buffer_length() == 1: + this.push() + this.pop() + # with an empty stack and a non-empty buffer + # only shift is valid anyway + elif (this.length - this._b_i) >= 1: + this.push() + + else: # can this even happen? break diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b92b66230..48614b591 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE from ..lexeme cimport Lexeme from libc.stdint cimport uint32_t @@ -17,7 +17,7 @@ from libc.string cimport memcpy from cymem.cymem cimport Pool from .stateclass cimport StateClass -from ._state cimport StateC +from ._state cimport StateC, is_space_token DEF NON_MONOTONIC = True @@ -233,11 +233,19 @@ cdef class Break: return False elif st.at_break(): return False - elif st.B(0) == 0: - return False elif st.stack_depth() < 1: return False - elif (st.S(0) + 1) != st.B(0): + # It is okay to predict a sentence boundary if the top item on the stack + # and the first item on the buffer are adjacent tokens. If this is not the + # case, it is still okay if there are only space tokens in between. + # This is checked by testing whether the head of a space token immediately + # preceding the first item in the buffer is the top item on the stack. + # Intervening space tokens must be attached to the previous non-space token. + # Therefore, if the head of a space token that immediately precedes the first + # item on the buffer is the top item on the stack, a sentence boundary can be + # predicted. + elif (st.S(0) + 1) != st.B(0) \ + and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)): # Must break at the token boundary return False else: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index ba1f4f1b8..20cce7bb6 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -188,9 +188,11 @@ cdef class Parser: action = self.moves.c[guess] if not eg.is_valid[guess]: - with gil: - move_name = self.moves.move_name(action.move, action.label) - return 1 + # with gil: + # move_name = self.moves.move_name(action.move, action.label) + # print 'invalid action:', move_name + return 1 + action.do(state, action.label) memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) for i in range(eg.nr_class): @@ -275,12 +277,12 @@ cdef class StepwiseState: @property def heads(self): - return [self.stcls.H(i) for i in range(self.stcls.length)] + return [self.stcls.H(i) for i in range(self.stcls.c.length)] @property def deps(self): return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] - for i in range(self.stcls.length)] + for i in range(self.stcls.c.length)] def predict(self): self.eg.reset() diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 775e613cd..a18cc284a 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -17,11 +17,11 @@ cdef class StateClass: @property def stack(self): - return {self.S(i) for i in range(self._s_i)} + return {self.S(i) for i in range(self.c._s_i)} @property def queue(self): - return {self.B(i) for i in range(self._b_i)} + return {self.B(i) for i in range(self.c._b_i)} def print_state(self, words): words = list(words) + ['_'] diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index e99148933..d4b633d0d 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import pytest @@ -6,3 +8,14 @@ def test_root(EN): tokens = EN(u"i don't have other assistance") for t in tokens: assert t.dep != 0, t.orth_ + + +@pytest.mark.models +def test_one_word_sentence(EN): + # one word sentence + doc = EN.tokenizer.tokens_from_list(['Hello']) + EN.tagger(doc) + assert len(doc) == 1 + with EN.parser.step_through(doc) as _: + pass + assert doc[0].dep != 0 diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py index 57a79525f..771e2401f 100644 --- a/spacy/tests/parser/test_sbd.py +++ b/spacy/tests/parser/test_sbd.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import pytest + @pytest.mark.models def test_single_period(EN): string = 'A test sentence.' @@ -37,3 +38,85 @@ def test_single_question(EN): assert len(words) == 4 assert len(list(words.sents)) == 1 assert sum(len(sent) for sent in words.sents) == len(words) + + +@pytest.mark.models +def test_sentence_breaks_no_space(EN): + doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' ')) + EN.tagger(doc) + with EN.parser.step_through(doc) as stepwise: + # stack empty, automatic Shift (This) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-nsubj') # attach This + # stack empty, automatic Shift (is) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('S') # shift a + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-det') # attach a + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-attr') # attach sentence + stepwise.transition('D') # remove sentence + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-punct') # attach . + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('B-ROOT') # set sentence start on This + # automatic reduction of the stack, automatic Shift to start second sentence + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-nsubj') # attach This + # stack empty, automatic Shift (is) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('S') # shift another + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-attr') # attach another + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-attr') # attach one + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('D') # remove one + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-punct') # attach . + # buffer empty, automatic cleanup + assert len(list(doc.sents)) == 2 + for tok in doc: + assert tok.dep != 0 or tok.is_space + assert [ tok.head.i for tok in doc ] == [1,1,3,1,1,6,6,8,6,6] + + +@pytest.mark.models +def test_sentence_breaks_with_space(EN): + doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' ')) + EN.tagger(doc) + with EN.parser.step_through(doc) as stepwise: + # stack empty, automatic Shift (This) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-nsubj') # attach This + # stack empty, automatic Shift (is) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('S') # shift a + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-det') # attach a + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-attr') # attach sentence + stepwise.transition('D') # remove sentence + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-punct') # attach . + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('B-ROOT') # set sentence start on This + # automatic reduction of the stack, automatic Shift to start second sentence + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-nsubj') # attach This + # stack empty, automatic Shift (is) + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('S') # shift another + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('L-attr') # attach another + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-attr') # attach one + assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('D') # remove one + assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT') + stepwise.transition('R-punct') # attach . + # buffer empty, automatic cleanup + assert len(list(doc.sents)) == 2 + for tok in doc: + assert tok.dep != 0 or tok.is_space + assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13] diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index ca533e3ef..102618446 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -4,6 +4,12 @@ import pytest import numpy from spacy.attrs import HEAD +def make_doc(EN, sentstr): + sent = sentstr.split(' ') + doc = EN.tokenizer.tokens_from_list(sent) + EN.tagger(doc) + return doc + @pytest.mark.models def test_space_attachment(EN): @@ -22,3 +28,63 @@ def test_sentence_space(EN): doc = EN(text) assert len(list(doc.sents)) == 2 + +@pytest.mark.models +def test_space_attachment_leading_space(EN): + # leading space token + doc = make_doc(EN, '\t \n This is a sentence .') + assert doc[0].is_space + assert doc[1].is_space + assert doc[2].orth_ == 'This' + with EN.parser.step_through(doc) as stepwise: + pass + assert doc[0].head.i == 2 + assert doc[1].head.i == 2 + assert stepwise.stack == set([2]) + + +@pytest.mark.models +def test_space_attachment_intermediate_and_trailing_space(EN): + # intermediate and trailing space tokens + doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n') + assert doc[2].is_space + assert doc[4].is_space + assert doc[5].is_space + assert doc[8].is_space + assert doc[9].is_space + with EN.parser.step_through(doc) as stepwise: + stepwise.transition('L-nsubj') + stepwise.transition('S') + stepwise.transition('L-det') + stepwise.transition('R-attr') + stepwise.transition('D') + stepwise.transition('R-punct') + assert stepwise.stack == set([]) + for tok in doc: + assert tok.dep != 0 or tok.is_space + assert [ tok.head.i for tok in doc ] == [1,1,1,6,3,3,1,1,7,7] + + +@pytest.mark.models +def test_space_attachment_one_space_sentence(EN): + # one space token sentence + doc = make_doc(EN, '\n') + assert len(doc) == 1 + with EN.parser.step_through(doc) as _: + pass + assert doc[0].is_space + assert doc[0].head.i == 0 + + +@pytest.mark.models +def test_space_attachment_only_space_sentence(EN): + # space-exclusive sentence + doc = make_doc(EN, '\n \t \n\n \t') + assert len(doc) == 4 + for tok in doc: + assert tok.is_space + with EN.parser.step_through(doc) as _: + pass + # all tokens are attached to the last one + for tok in doc: + assert tok.head.i == 3