The parser now introduces sentence boundaries properly when predicting dependents with root labels

Wolfgang Seeker 2016-04-21 16:50:53 +02:00
parent 12024b0b0a
commit 6c7301cc6d
2 changed files with 72 additions and 3 deletions


@@ -447,6 +447,7 @@ cdef class ArcEager(TransitionSystem):
                 # note that this can create non-projective trees if there are arcs
                 # between nodes on both sides of the new root node
                 st._sent[i].head = 0
+                st._sent[st._sent[i].l_edge].sent_start = True

     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid
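
For context: this hunk lands in ArcEager's finalize step, which runs after parsing and turns dependents carrying the root label into roots of their own sentences. A rough pure-Python sketch of that logic, using the same head/l_edge conventions as the TokenC struct (the helper name finalize_sketch is illustrative, not spaCy's API):

    # Sketch of root-label segmentation: heads are stored as offsets, so
    # head == 0 means the token is its own head, i.e. a sentence root.
    def finalize_sketch(tokens, root_label):
        for tok in tokens:
            if tok.dep == root_label:
                tok.head = 0  # promote the root-labelled dependent to a root
                # the fix: also open a sentence at the subtree's left edge,
                # so doc.sents reflects the new boundary
                tokens[tok.l_edge].sent_start = True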


@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

 import pytest
+from spacy.tokens import Doc

 @pytest.mark.models
@@ -42,7 +42,7 @@ def test_single_question(EN):
 @pytest.mark.models
 def test_sentence_breaks_no_space(EN):
-    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -83,7 +83,7 @@ def test_sentence_breaks_no_space(EN):
 @pytest.mark.models
 def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -120,3 +120,71 @@ def test_sentence_breaks_with_space(EN):
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
+
+
+@pytest.fixture
+@pytest.mark.models
+def example(EN):
+    def apply_transition_sequence(model, doc, sequence):
+        with model.parser.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+    doc = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
+    EN.tagger(doc)
+    apply_transition_sequence(EN, doc, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    return doc
+
+
+def test_sbd_for_root_label_dependents(example):
+    """
+    Make sure that the parser properly introduces a sentence boundary, even
+    without the Break transition, when it predicts dependents with the root label.
+    """
+    assert example[1].head.i == 1
+    assert example[7].head.i == 7
+
+    sents = list(example.sents)
+    assert len(sents) == 2
+    assert sents[1][0].orth_ == u'It'
+
+
+@pytest.mark.models
+def test_sbd_serialization(EN, example):
+    """
+    Test that the sentence boundaries are the same before and after serialization,
+    even if the parser predicted two roots for the sentence that were made into
+    two sentences after parsing by arc_eager.finalize().
+
+    This is actually an interaction between sentence boundary prediction and
+    doc.from_array(). The process is the following: if the parser doesn't predict
+    a sentence boundary but attaches a word with the ROOT label, the second root
+    node is made the root of its own sentence after parsing. During serialization,
+    sentence boundary information is lost and is reintroduced when the doc is
+    deserialized, by setting a sentence start at the left edge of every root node.
+
+    BUG tested here: so far, the parser wasn't introducing a sentence start when
+    it introduced the second root node.
+    """
+    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
+    assert example.to_bytes() == example_serialized.to_bytes()
+    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
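
The recovery step that docstring describes can be pictured with a minimal sketch, assuming the same conventions as the first sketch above (heads stored as offsets, l_edge precomputed); recover_sent_starts is a hypothetical name, the real logic lives inside deserialization via doc.from_array():

    # Sketch: serialization only keeps heads and deps, so on load the
    # sentence starts are recomputed from the tree by flagging the left
    # edge of every root token. Before this commit the parser-side code
    # (first hunk above) never set sent_start for the second root, so
    # the boundaries disagreed before and after a round-trip.
    def recover_sent_starts(tokens):
        for tok in tokens:
            if tok.head == 0:  # head offset 0: the token heads itself
                tokens[tok.l_edge].sent_start = True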