Iterate over lr_edges until sents are correct (#4702)

Iterate over lr_edges until all heads are within the current sentence.
Instead of looping a fixed number of times, check after each pass whether
every head falls within the current sentence boundaries and stop as soon
as they all do. Cap the loop at a maximum of 10 iterations and warn in
that case, since the sentence boundaries may not be correct.
adrianeboyd 2019-11-25 13:06:36 +01:00, committed by Matthew Honnibal
parent cbacb0f1a4
commit 2d8c6e1124
4 changed files with 78 additions and 23 deletions
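
A note on why a fixed pass count was fragile: l_edge and r_edge values propagate only one arc per sweep, so deeply nested non-projective attachments can need more than the old three passes. The plain-Python sketch below shows the same fixed-point idea (illustrative only; propagate_edges is not a spaCy function; heads are relative offsets as in spaCy's TokenC):

    def propagate_edges(heads):
        # heads[i] is a relative offset: token i's head is token i + heads[i],
        # and an offset of 0 marks a root. Each sweep widens a head's edges
        # from its children's current edges, so edge information travels one
        # arc per sweep and nested non-projective arcs need several sweeps.
        n = len(heads)
        l_edge = list(range(n))
        r_edge = list(range(n))
        sweeps = 0
        changed = True
        while changed:
            changed = False
            sweeps += 1
            for i in range(n):
                h = i + heads[i]
                if h == i:
                    continue
                if l_edge[i] < l_edge[h]:
                    l_edge[h] = l_edge[i]
                    changed = True
                if r_edge[i] > r_edge[h]:
                    r_edge[h] = r_edge[i]
                    changed = True
        return l_edge, r_edge, sweeps

A projective parse such as propagate_edges([1, 0, -1]) settles in two sweeps (the second only confirms convergence); the "documented cases" needing at least 4 iterations mentioned below are the non-projective ones the old fixed limit of 3 could miss.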

spacy/errors.py

@@ -101,6 +101,7 @@ class Warnings(object):
             "the Knowledge Base.")
     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
             "previous components in the pipeline declare that they assign it.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 
 
 @add_codes
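
In spaCy 2.x these warnings are emitted through Python's standard warnings machinery, so W026 can be filtered like any other warning. A hedged sketch (the "[W026]" message prefix is an assumption based on spaCy's usual "[CODE] text" formatting, not a documented API):

    import warnings

    # Suppress only W026, e.g. when deliberately processing parses known to
    # be heavily non-projective. The message pattern is an assumption about
    # the "[W026] ..." warning text.
    warnings.filterwarnings("ignore", message=r"\[W026\]")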

spacy/tests/parser/test_parse.py

@@ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     assert tokens[4].left_edge.i == 0
     assert tokens[4].right_edge.i == 4
     assert tokens[4].head.i == 4
+
+
+def test_parser_set_sent_starts(en_vocab):
+    words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
+    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
+    deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
+    doc = get_doc(
+        en_vocab, words=words, deps=deps, heads=heads
+    )
+    for i in range(len(words)):
+        if i == 0 or i == 3:
+            assert doc[i].is_sent_start == True
+        else:
+            assert doc[i].is_sent_start == None
+    for sent in doc.sents:
+        for token in sent:
+            assert token.head in sent
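
The heads list in this test uses the relative-offset convention of spaCy's get_doc test helper: each entry is the head's index minus the token's own index, with 0 marking a root. A quick sanity check on the first sentence ('Ein', 'Satz', '.'):

    words = ['Ein', 'Satz', '.']
    heads = [1, 0, -1]
    # Convert relative offsets to absolute head indices.
    absolute_heads = [i + h for i, h in enumerate(heads)]
    assert absolute_heads == [1, 1, 1]  # 'Satz' is the root of the sentence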

spacy/tokens/doc.pxd

@@ -21,6 +21,9 @@ ctypedef fused LexemeOrToken:
 
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1
 
 
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
+
+
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
 

spacy/tokens/doc.pyx

@@ -1153,35 +1153,69 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
         tokens[i].r_edge = i
-    # Three times, for non-projectivity. See issue #3170. This isn't a very
-    # satisfying fix, but I think it's sufficient.
-    for loop_count in range(3):
-        # Set left edges
-        for i in range(length):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child < head and loop_count == 0:
-                head.l_kids += 1
-            if child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
-            if child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
-        # Set right edges - same as above, but iterate in reverse
-        for i in range(length-1, -1, -1):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child > head and loop_count == 0:
-                head.r_kids += 1
-            if child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
-            if child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
+    cdef int loop_count = 0
+    cdef bint heads_within_sents = False
+    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
+    # handle non-projective dependency parses, stopping when all heads are
+    # within their respective sentence boundaries. We have documented cases
+    # that need at least 4 iterations, so this is to be on the safe side
+    # without risking getting stuck in an infinite loop if something is
+    # terribly malformed.
+    while not heads_within_sents:
+        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        if loop_count > 10:
+            user_warning(Warnings.W026)
+        loop_count += 1
     # Set sentence starts
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
 
+
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+    # May be called multiple times due to non-projectivity. See issues #3170
+    # and #4688.
+    cdef TokenC* head
+    cdef TokenC* child
+    cdef int i, j
+    # Set left edges
+    for i in range(length):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child < head and loop_count == 0:
+            head.l_kids += 1
+        if child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+        if child.r_edge > head.r_edge:
+            head.r_edge = child.r_edge
+    # Set right edges - same as above, but iterate in reverse
+    for i in range(length-1, -1, -1):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child > head and loop_count == 0:
+            head.r_kids += 1
+        if child.r_edge > head.r_edge:
+            head.r_edge = child.r_edge
+        if child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+    # Get sentence start positions according to current state
+    sent_starts = set()
+    for i in range(length):
+        if tokens[i].head == 0 and tokens[i].dep != 0:
+            sent_starts.add(tokens[i].l_edge)
+    cdef int curr_sent_start = 0
+    cdef int curr_sent_end = 0
+    # Check whether any heads are not within the current sentence
+    for i in range(length):
+        if (i > 0 and i in sent_starts) or i == length - 1:
+            curr_sent_end = i
+            for j in range(curr_sent_start, curr_sent_end):
+                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
+                    return False
+            curr_sent_start = i
+    return True
+
 
 cdef int _get_tokens_lca(Token token_j, Token token_k):
     """Given two tokens, returns the index of the lowest common ancestor
     (LCA) among the two. If they have no common ancestor, -1 is returned.
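
For readability outside Cython, here is a plain-Python transcription of the boundary check at the end of _set_lr_kids_and_edges (an illustrative sketch, not spaCy API; heads are relative offsets, and sent_starts holds the l_edge indices of the root tokens, as collected above):

    def heads_within_sents(heads, sent_starts):
        # Mirrors the Cython loop: every token in [curr_start, curr_end)
        # must have its head inside [curr_start, curr_end].
        n = len(heads)
        curr_start = 0
        for i in range(n):
            if (i > 0 and i in sent_starts) or i == n - 1:
                curr_end = i
                for j in range(curr_start, curr_end):
                    if not (curr_start <= heads[j] + j <= curr_end):
                        return False
                curr_start = i
        return True

For example, heads_within_sents([1, 0, -1, 1, 0], {0, 3}) is True for two well-formed sentences, while heads_within_sents([1, 0, -1, -2, 0], {0, 3}) is False because token 3's head (index 1) lies in the previous sentence.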