diff --git a/spacy/errors.py b/spacy/errors.py
index 3e62b5a3e..c44c72117 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -101,6 +101,7 @@ class Warnings(object):
             "the Knowledge Base.")
     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
             "previous components in the pipeline declare that they assign it.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 
 
 @add_codes
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index c140cb485..384f14dad 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     assert tokens[4].left_edge.i == 0
     assert tokens[4].right_edge.i == 4
     assert tokens[4].head.i == 4
+
+
+def test_parser_set_sent_starts(en_vocab):
+    words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
+    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
+    deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
+    doc = get_doc(
+        en_vocab, words=words, deps=deps, heads=heads
+    )
+    for i in range(len(words)):
+        if i == 0 or i == 3:
+            assert doc[i].is_sent_start == True
+        else:
+            assert doc[i].is_sent_start == None
+    for sent in doc.sents:
+        for token in sent:
+            assert token.head in sent
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 62665fcc5..7f231887f 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -21,6 +21,9 @@ ctypedef fused LexemeOrToken:
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1
 
 
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
+
+
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
 
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 6afe89e05..d4ba3803b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1153,35 +1153,70 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
         tokens[i].r_edge = i
-    # Three times, for non-projectivity. See issue #3170. This isn't a very
-    # satisfying fix, but I think it's sufficient.
-    for loop_count in range(3):
-        # Set left edges
-        for i in range(length):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child < head and loop_count == 0:
-                head.l_kids += 1
-            if child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
-            if child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
-        # Set right edges - same as above, but iterate in reverse
-        for i in range(length-1, -1, -1):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child > head and loop_count == 0:
-                head.r_kids += 1
-            if child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
-            if child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
+    cdef int loop_count = 0
+    cdef bint heads_within_sents = False
+    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
+    # handle non-projective dependency parses, stopping when all heads are
+    # within their respective sentence boundaries. We have documented cases
+    # that need at least 4 iterations, so this is to be on the safe side
+    # without risking getting stuck in an infinite loop if something is
+    # terribly malformed.
+    while not heads_within_sents:
+        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        if loop_count > 10:
+            user_warning(Warnings.W026)
+            break
+        loop_count += 1
     # Set sentence starts
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
 
 
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+    # May be called multiple times due to non-projectivity. See issues #3170
+    # and #4688.
+    cdef TokenC* head
+    cdef TokenC* child
+    cdef int i, j
+    # Set left edges
+    for i in range(length):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child < head and loop_count == 0:
+            head.l_kids += 1
+        if child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+        if child.r_edge > head.r_edge:
+            head.r_edge = child.r_edge
+    # Set right edges - same as above, but iterate in reverse
+    for i in range(length-1, -1, -1):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child > head and loop_count == 0:
+            head.r_kids += 1
+        if child.r_edge > head.r_edge:
+            head.r_edge = child.r_edge
+        if child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+    # Get sentence start positions according to current state
+    sent_starts = set()
+    for i in range(length):
+        if tokens[i].head == 0 and tokens[i].dep != 0:
+            sent_starts.add(tokens[i].l_edge)
+    cdef int curr_sent_start = 0
+    cdef int curr_sent_end = 0
+    # Check whether any heads are not within the current sentence
+    for i in range(length):
+        if (i > 0 and i in sent_starts) or i == length - 1:
+            curr_sent_end = i
+            for j in range(curr_sent_start, curr_sent_end):
+                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
+                    return False
+            curr_sent_start = i
+    return True
+
+
 cdef int _get_tokens_lca(Token token_j, Token token_k):
     """Given two tokens, returns the index of the lowest common ancestor
     (LCA) among the two. If they have no common ancestor, -1 is returned.
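
A note on the test data: `get_doc` takes `heads` as relative offsets, so token
i's head sits at absolute index `i + heads[i]`, and an offset of 0 marks a
sentence root. A quick plain-Python sketch (an illustration, not part of the
patch) that recovers the roots from the test's `heads` list:

    # Illustration only: recover absolute head indices and sentence roots
    # from the relative offsets used by the test helpers.
    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11,
             -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17,
             -31, -32, -1]
    abs_heads = [i + offset for i, offset in enumerate(heads)]
    roots = [i for i, offset in enumerate(heads) if offset == 0]
    assert roots == [1, 4]  # matches the two 'ROOT' entries in `deps`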
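
The stopping condition in `_set_lr_kids_and_edges` boils down to: given the
sentence starts implied by the current left edges, does every token's head land
inside that token's own sentence? A simplified pure-Python equivalent of that
check (a hypothetical helper using absolute head indices, ignoring the TokenC
offset encoding and the patch's handling of the final token):

    # Hypothetical, simplified equivalent of the heads-within-sentence check.
    # `abs_heads` are absolute head indices; `sent_starts` are the token
    # indices where sentences currently begin.
    def heads_within_sents(abs_heads, sent_starts):
        starts = sorted(sent_starts) + [len(abs_heads)]
        for start, end in zip(starts, starts[1:]):
            # Each sentence spans the half-open interval [start, end).
            for j in range(start, end):
                if not (start <= abs_heads[j] < end):
                    return False  # a head escapes its sentence
        return True

When this returns False, `set_children_from_heads` runs another pass so the
l/r edges (and with them the implied sentence starts) can keep expanding; the
non-projective German parse in the test is the kind of documented case the
comment mentions, needing at least four passes.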