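* Fix whitespace attachment: attach space tokens via add_arc to the previous word, or to the next word when the space is the first token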

This commit is contained in:
Matthew Honnibal 2015-10-13 09:46:38 +02:00
parent 37919eac82
commit 329ae57520

View File

@@ -384,13 +384,16 @@ cdef class ArcEager(TransitionSystem):
for i in range(st.length): for i in range(st.length):
# Always attach spaces to the previous word # Always attach spaces to the previous word
if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
st._sent[i].head = -1 if (i >= 1) else 1
if st._sent[i].sent_start and st._sent[i].head == -1: if st._sent[i].sent_start and st._sent[i].head == -1:
st._sent[i].sent_start = False st._sent[i].sent_start = False
# If we had this space token as the start of a sentence, # If we had this space token as the start of a sentence,
# move that sentence start forward one # move that sentence start forward one
if (i + 1) < st.length and not st._sent[i+1].sent_start: if (i + 1) < st.length and not st._sent[i+1].sent_start:
st._sent[i+1].sent_start = True st._sent[i+1].sent_start = True
if i >= 1:
st.add_arc(i-1, i, st._sent[i].dep)
else:
st.add_arc(i+1, i, st._sent[i].dep)
elif st._sent[i].head == 0 and st._sent[i].dep == 0: elif st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = self.root_label st._sent[i].dep = self.root_label
# If we're not using the Break transition, we segment via root-labelled # If we're not using the Break transition, we segment via root-labelled