The parser now introduces sentence boundaries properly when predicting dependents with root labels

Wolfgang Seeker 2016-04-21 16:50:53 +02:00
parent 12024b0b0a
commit 6c7301cc6d
2 changed files with 72 additions and 3 deletions


@@ -447,6 +447,7 @@ cdef class ArcEager(TransitionSystem):
                 # note that this can create non-projective trees if there are arcs
                 # between nodes on both sides of the new root node
                 st._sent[i].head = 0
+                st._sent[st._sent[i].l_edge].sent_start = True

     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid
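
For context: this hunk lands in ArcEager's finalize step, which runs after parsing and turns dependents carrying the root label into roots of their own sentences. A rough pure-Python sketch of that logic, using the same head/l_edge conventions as the TokenC struct (the helper name finalize_sketch is illustrative, not spaCy's API):

    # Sketch of root-label segmentation: heads are stored as offsets, so
    # head == 0 means the token is its own head, i.e. a sentence root.
    def finalize_sketch(tokens, root_label):
        for tok in tokens:
            if tok.dep == root_label:
                tok.head = 0  # promote the root-labelled dependent to a root
                # the fix: also open a sentence at the subtree's left edge,
                # so doc.sents reflects the new boundary
                tokens[tok.l_edge].sent_start = True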


@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

 import pytest
+from spacy.tokens import Doc

 @pytest.mark.models
@@ -42,7 +42,7 @@ def test_single_question(EN):
 @pytest.mark.models
 def test_sentence_breaks_no_space(EN):
-    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -83,7 +83,7 @@ def test_sentence_breaks_no_space(EN):
 @pytest.mark.models
 def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -120,3 +120,71 @@ def test_sentence_breaks_with_space(EN):
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
+
+
+@pytest.fixture
+@pytest.mark.models
+def example(EN):
+    def apply_transition_sequence(model, doc, sequence):
+        with model.parser.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+    doc = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
+    EN.tagger(doc)
+    apply_transition_sequence(EN, doc, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    return doc
+
+
+def test_sbd_for_root_label_dependents(example):
+    """
+    Make sure that the parser properly introduces a sentence boundary, even
+    without the Break transition, when it predicts dependents with the root label.
+    """
+    assert example[1].head.i == 1
+    assert example[7].head.i == 7
+
+    sents = list(example.sents)
+    assert len(sents) == 2
+    assert sents[1][0].orth_ == u'It'
+
+
+@pytest.mark.models
+def test_sbd_serialization(EN, example):
+    """
+    Test that the sentence boundaries are the same before and after serialization,
+    even if the parser predicted two roots for the sentence that were made into
+    two sentences after parsing by arc_eager.finalize().
+
+    This is actually an interaction between sentence boundary prediction and
+    doc.from_array(). The process is the following: if the parser doesn't predict
+    a sentence boundary but attaches a word with the ROOT label, the second root
+    node is made the root of its own sentence after parsing. During serialization,
+    sentence boundary information is lost and is reintroduced when the doc is
+    deserialized, by setting a sentence start at the left edge of every root node.
+
+    BUG tested here: so far, the parser wasn't introducing a sentence start when
+    it introduced the second root node.
+    """
+    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
+    assert example.to_bytes() == example_serialized.to_bytes()
+    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
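
The recovery step that docstring describes can be pictured with a minimal sketch, assuming the same conventions as the first sketch above (heads stored as offsets, l_edge precomputed); recover_sent_starts is a hypothetical name, the real logic lives inside deserialization via doc.from_array():

    # Sketch: serialization only keeps heads and deps, so on load the
    # sentence starts are recomputed from the tree by flagging the left
    # edge of every root token. Before this commit the parser-side code
    # (first hunk above) never set sent_start for the second root, so
    # the boundaries disagreed before and after a round-trip.
    def recover_sent_starts(tokens):
        for tok in tokens:
            if tok.head == 0:  # head offset 0: the token heads itself
                tokens[tok.l_edge].sent_start = True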