mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Fix bug in sentence starts for non-projective parses
The set_children_from_heads function assumed parse trees were projective. However, non-projective parses may be passed in during deserialization or after deprojectivising. This caused incorrect sentence boundaries to be set for non-projective parses. The fix runs the edge-setting passes twice and updates both l_edge and r_edge in each pass. Closes #2772.
This commit is contained in:
parent
48fd36bf05
commit
1759abf1e5
|
@ -2,7 +2,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue2772(en_vocab):
|
def test_issue2772(en_vocab):
|
||||||
words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
|
words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
|
||||||
# A tree with a non-projective (i.e. crossing) arc
|
# A tree with a non-projective (i.e. crossing) arc
|
||||||
|
|
|
@ -993,6 +993,8 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||||
tokens[i].r_kids = 0
|
tokens[i].r_kids = 0
|
||||||
tokens[i].l_edge = i
|
tokens[i].l_edge = i
|
||||||
tokens[i].r_edge = i
|
tokens[i].r_edge = i
|
||||||
|
# Twice, for non-projectivity
|
||||||
|
for _ in range(2):
|
||||||
# Set left edges
|
# Set left edges
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
child = &tokens[i]
|
child = &tokens[i]
|
||||||
|
@ -1001,7 +1003,8 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||||
head.l_kids += 1
|
head.l_kids += 1
|
||||||
if child.l_edge < head.l_edge:
|
if child.l_edge < head.l_edge:
|
||||||
head.l_edge = child.l_edge
|
head.l_edge = child.l_edge
|
||||||
|
if child.r_edge > head.r_edge:
|
||||||
|
head.r_edge = child.r_edge
|
||||||
# Set right edges --- same as above, but iterate in reverse
|
# Set right edges --- same as above, but iterate in reverse
|
||||||
for i in range(length-1, -1, -1):
|
for i in range(length-1, -1, -1):
|
||||||
child = &tokens[i]
|
child = &tokens[i]
|
||||||
|
@ -1010,8 +1013,8 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||||
head.r_kids += 1
|
head.r_kids += 1
|
||||||
if child.r_edge > head.r_edge:
|
if child.r_edge > head.r_edge:
|
||||||
head.r_edge = child.r_edge
|
head.r_edge = child.r_edge
|
||||||
|
if child.l_edge < head.l_edge:
|
||||||
|
head.l_edge = child.l_edge
|
||||||
# Set sentence starts
|
# Set sentence starts
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
if tokens[i].head == 0 and tokens[i].dep != 0:
|
if tokens[i].head == 0 and tokens[i].dep != 0:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user