Mirror of https://github.com/explosion/spaCy.git
Iterate over lr_edges until sents are correct (#4702)
Iterate over lr_edges until all heads are within the current sentence. Instead of iterating a fixed number of times, check after each pass whether the sentence boundaries are correct for all heads and stop as soon as they are. Give up after a maximum of 10 iterations, emitting a warning in that case since the sentence boundaries may still be incorrect.
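The control flow this describes is a bounded fixpoint iteration: run a repair pass, re-check, and bail out with a warning once the cap is hit. As a rough pure-Python sketch of that pattern (the helper name `repair_pass` and the cap constant are illustrative, not spaCy's actual API):

    import warnings

    MAX_PASSES = 10  # safety cap, mirroring the commit

    def iterate_until_consistent(state, repair_pass):
        """Re-run `repair_pass` until it reports a consistent state.

        `repair_pass(state, loop_count)` is assumed to adjust `state` in
        place and return True once every head lies within its sentence.
        """
        loop_count = 0
        consistent = False
        while not consistent:
            consistent = repair_pass(state, loop_count)
            if loop_count > MAX_PASSES:
                # Boundaries may still be wrong; warn rather than loop forever.
                warnings.warn("Unable to set all sentence boundaries.")
                break
            loop_count += 1
        return state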
This commit is contained in:
  parent cbacb0f1a4
  commit 2d8c6e1124
@@ -101,6 +101,7 @@ class Warnings(object):
             "the Knowledge Base.")
     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
             "previous components in the pipeline declare that they assign it.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 
 
 @add_codes
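W026 joins the Warnings table above; like spaCy 2.x's other W-codes, it should surface through Python's standard warnings machinery, so callers can escalate or silence it. A hedged sketch of doing so (the model name and text are placeholders, and the regex match on the message text is an assumption about the emitted wording):

    import warnings

    import spacy

    nlp = spacy.load("en_core_web_sm")  # any pipeline with a parser
    with warnings.catch_warnings():
        # Turn the boundary warning into an error so bad parses are loud.
        warnings.filterwarnings("error", message=r".*sentence boundaries.*")
        try:
            doc = nlp("Some text with an unusually tangled parse.")
        except UserWarning as warning:
            print(f"Suspect sentence boundaries: {warning}")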
@@ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     assert tokens[4].left_edge.i == 0
     assert tokens[4].right_edge.i == 4
     assert tokens[4].head.i == 4
+
+
+def test_parser_set_sent_starts(en_vocab):
+    words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
+    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
+    deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
+    doc = get_doc(
+        en_vocab, words=words, deps=deps, heads=heads
+    )
+    for i in range(len(words)):
+        if i == 0 or i == 3:
+            assert doc[i].is_sent_start == True
+        else:
+            assert doc[i].is_sent_start == None
+    for sent in doc.sents:
+        for token in sent:
+            assert token.head in sent
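The heads in the test above follow the convention of the get_doc test helper: each entry is a relative offset from a token to its head, with 0 marking a sentence root. A minimal illustration of that convention in plain Python (a toy three-token fragment, no spaCy needed):

    heads = [1, 0, -1]  # "Ein Satz ." with "Satz" as the root
    absolute = [i + offset for i, offset in enumerate(heads)]
    assert absolute == [1, 1, 1]   # every token's head is token 1
    roots = [i for i, offset in enumerate(heads) if offset == 0]
    assert roots == [1]            # offset 0 == token is its own head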
@@ -21,6 +21,9 @@ ctypedef fused LexemeOrToken:
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1
 
 
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
+
+
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
 
 
@@ -1153,10 +1153,32 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
         tokens[i].r_edge = i
-    # Three times, for non-projectivity. See issue #3170. This isn't a very
-    # satisfying fix, but I think it's sufficient.
-    for loop_count in range(3):
-        # Set left edges
-        for i in range(length):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
+    cdef int loop_count = 0
+    cdef bint heads_within_sents = False
+    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
+    # handle non-projective dependency parses, stopping when all heads are
+    # within their respective sentence boundaries. We have documented cases
+    # that need at least 4 iterations, so this is to be on the safe side
+    # without risking getting stuck in an infinite loop if something is
+    # terribly malformed.
+    while not heads_within_sents:
+        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        if loop_count > 10:
+            user_warning(Warnings.W026)
+        loop_count += 1
+    # Set sentence starts
+    for i in range(length):
+        if tokens[i].head == 0 and tokens[i].dep != 0:
+            tokens[tokens[i].l_edge].sent_start = True
+
+
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+    # May be called multiple times due to non-projectivity. See issues #3170
+    # and #4688.
+    # Set left edges
+    cdef TokenC* head
+    cdef TokenC* child
+    cdef int i, j
+    for i in range(length):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
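The reason repeated passes are needed at all is that a single left-to-right (or right-to-left) sweep only propagates edges along head chains that point in the sweep's direction; a non-projective arc can leave an edge stale until a later pass. A pure-Python miniature of the two propagation sweeps (plain lists instead of TokenC structs, an illustration rather than the real Cython code); the check that decides whether another pass is needed follows in the next hunk:

    def sweep(heads, l_edge, r_edge):
        """One propagation pass; returns True once nothing changes."""
        changed = False
        n = len(heads)
        for i in range(n):                      # left-to-right
            h = i + heads[i]                    # heads are relative offsets
            if l_edge[i] < l_edge[h]:
                l_edge[h] = l_edge[i]
                changed = True
            if r_edge[i] > r_edge[h]:
                r_edge[h] = r_edge[i]
                changed = True
        for i in range(n - 1, -1, -1):          # right-to-left
            h = i + heads[i]
            if r_edge[i] > r_edge[h]:
                r_edge[h] = r_edge[i]
                changed = True
            if l_edge[i] < l_edge[h]:
                l_edge[h] = l_edge[i]
                changed = True
        return not changed

    # Non-projective toy tree: 1->0, 3->1, 2->3, 4->2 (arcs 3->1 and 4->2 cross).
    heads = [0, -1, 1, -2, -2]
    l_edge, r_edge = list(range(5)), list(range(5))
    n_sweeps = 0
    settled = False
    while not settled:
        settled = sweep(heads, l_edge, r_edge)
        n_sweeps += 1
    assert n_sweeps == 3            # one sweep leaves r_edge[1] stale at 3
    assert r_edge[0] == 4 and r_edge[1] == 4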
@@ -1176,10 +1198,22 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
-                head.r_edge = child.r_edge
-            if child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
-    # Set sentence starts
+            head.r_edge = child.r_edge
+        if child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+    # Get sentence start positions according to current state
+    sent_starts = set()
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
-            tokens[tokens[i].l_edge].sent_start = True
+            sent_starts.add(tokens[i].l_edge)
+    cdef int curr_sent_start = 0
+    cdef int curr_sent_end = 0
+    # Check whether any heads are not within the current sentence
+    for i in range(length):
+        if (i > 0 and i in sent_starts) or i == length - 1:
+            curr_sent_end = i
+            for j in range(curr_sent_start, curr_sent_end):
+                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
+                    return False
+            curr_sent_start = i
+    return True
 
 
 cdef int _get_tokens_lca(Token token_j, Token token_k):
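The user-visible effect of the fix is that, after parsing, iterating doc.sents yields sentences that contain each of their tokens' heads, exactly as the new test asserts. A quick end-to-end check (the model name is just an example; any pipeline with a parser works):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is one sentence. This is another one.")
    for sent in doc.sents:
        for token in sent:
            # No head may point outside the token's own sentence.
            assert token.head in sent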