Merge pull request #1400 from explosion/feature/sentence-parsing

💫 Force parser to respect preset sentence boundaries
2025-08-31 09:25:01 +03:00 · 2017-10-09 04:31:43 +02:00 · 2017-10-09 04:31:43 +02:00 · 689349e32f
commit 689349e32f
parent e79fc41ff8 81a64119db
8 changed files with 121 additions and 10 deletions
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -61,13 +61,13 @@ cdef struct TokenC:
    attr_t sense
    int head
    attr_t dep
-    bint sent_start

    uint32_t l_kids
    uint32_t r_kids
    uint32_t l_edge
    uint32_t r_edge

+    int sent_start
    int ent_iob
    attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
    hash_t ent_id
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -307,6 +307,8 @@ cdef cppclass StateC:
            this._stack[this._s_i] = this.B(0)
        this._s_i += 1
        this._b_i += 1
+        if this.B_(0).sent_start == 1:
+            this.set_break(this.B(0))
        if this._b_i > this._break:
            this._break = -1

@@ -383,7 +385,7 @@ cdef cppclass StateC:

    void set_break(int i) nogil:
        if 0 <= i < this.length:
-            this._sent[i].sent_start = True
+            this._sent[i].sent_start = 1
            this._break = this._b_i

    void clone(const StateC* src) nogil:
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -118,7 +118,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 cdef class Shift:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
+        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@@ -178,7 +178,7 @@ cdef class Reduce:
 cdef class LeftArc:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@@ -212,7 +212,7 @@ cdef class LeftArc:
 cdef class RightArc:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@@ -248,6 +248,10 @@ cdef class Break:
            return False
        elif st.stack_depth() < 1:
            return False
+        elif st.B_(0).l_edge < 0:
+            return False
+        elif st._sent[st.B_(0).l_edge].sent_start < 0:
+            return False
        else:
            return True

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -244,8 +244,8 @@ cdef class Parser:
        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
-        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
-        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
+        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
+        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
        if hist_size >= 1 and depth == 0:
            raise ValueError("Inconsistent hyper-params: "
                "history_feats >= 1 but parser_hidden_depth==0")
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -0,0 +1,73 @@
+'''Test that the parser respects preset sentence boundaries.'''
+from __future__ import unicode_literals
+import pytest
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps
+
+from ...attrs import NORM
+from ...gold import GoldParse
+from ...vocab import Vocab
+from ...tokens import Doc
+from ...pipeline import NeuralDependencyParser
+
+@pytest.fixture
+def vocab():
+    return Vocab(lex_attr_getters={NORM: lambda s: s})
+
+@pytest.fixture
+def parser(vocab):
+    parser = NeuralDependencyParser(vocab)
+    parser.cfg['token_vector_width'] = 4
+    parser.cfg['hidden_width'] = 32
+    #parser.add_label('right')
+    parser.add_label('left')
+    parser.begin_training([], **parser.cfg)
+    sgd = Adam(NumpyOps(), 0.001)
+
+    for i in range(10):
+        losses = {}
+        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
+        gold = GoldParse(doc, heads=[1, 1, 3, 3],
+                deps=['left', 'ROOT', 'left', 'ROOT'])
+        parser.update([doc], [gold], sgd=sgd, losses=losses)
+    return parser
+
+def test_no_sentences(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 2
+
+
+def test_sents_1(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[2].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) >= 2
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = False
+    doc[2].sent_start = True
+    doc[3].sent_start = False
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 2
+
+
+def test_sents_1_2(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[2].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 3
+
+
+def test_sents_1_3(parser):
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[3].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 4
+    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
+    doc[1].sent_start = True
+    doc[2].sent_start = False
+    doc[3].sent_start = True
+    doc = parser(doc)
+    assert len(list(doc.sents)) == 3
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -485,7 +485,7 @@ cdef class Doc:
            cdef int i
            start = 0
            for i in range(1, self.length):
-                if self.c[i].sent_start:
+                if self.c[i].sent_start == 1:
                    yield Span(self, start, i)
                    start = i
            if start != self.length:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -111,6 +111,30 @@ cdef class Span:
        for i in range(self.start, self.end):
            yield self.doc[i]

+    def as_doc(self):
+        '''Create a Doc object view of the Span's data.
+
+        This is mostly useful for C-typed interfaces. 
+        '''
+        cdef Doc doc = Doc(self.doc.vocab)
+        doc.length = self.end-self.start
+        doc.c = &self.doc.c[self.start]
+        doc.mem = self.doc.mem
+        doc.is_parsed = self.doc.is_parsed
+        doc.is_tagged = self.doc.is_tagged
+        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
+        doc.user_hooks = self.doc.user_hooks
+        doc.user_span_hooks = self.doc.user_span_hooks
+        doc.user_token_hooks = self.doc.user_token_hooks
+        doc.vector = self.vector
+        doc.vector_norm = self.vector_norm
+        for key, value in self.doc.cats.items():
+            if hasattr(key, '__len__') and len(key) == 3:
+                cat_start, cat_end, cat_label = key
+                if cat_start == self.start_char and cat_end == self.end_char:
+                    doc.cats[cat_label] = value
+        return doc
+
    def merge(self, *args, **attributes):
        """Retokenize the document, such that the span is merged into a single
        token.
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -281,13 +281,21 @@ cdef class Token:
        def __get__(self):
            return self.c.sent_start

-        def __set__(self, bint value):
+        def __set__(self, value):
            if self.doc.is_parsed:
                raise ValueError(
                    'Refusing to write to token.sent_start if its document is parsed, '
                    'because this may cause inconsistent state. '
                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
-            self.c.sent_start = value
+            if value is None:
+                self.c.sent_start = 0
+            elif value is True:
+                self.c.sent_start = 1
+            elif value is False:
+                self.c.sent_start = -1
+            else:
+                raise ValueError("Invalid value for token.sent_start -- must be one of "
+                                 "None, True, False")

    property lefts:
        def __get__(self):