* Fix runtime error bug that arose from updated Span.root function.

This commit is contained in:
Matthew Honnibal 2016-01-25 15:22:42 +01:00
parent 2c8dd91785
commit 87172a15c6
2 changed files with 23 additions and 6 deletions

View File

@ -161,3 +161,21 @@ def test_merge_hang():
doc.from_array([HEAD], heads.T)
doc.merge(18, 32, '', '', 'ORG')
doc.merge(8, 32, '', '', 'ORG')
@pytest.mark.models
def test_runtime_error(EN):
    """Regression test: merging trimmed noun chunks must not raise.

    Parses a multi-paragraph text, trims each noun chunk from the left,
    then merges every remaining multi-token chunk back into the document.
    The test passes as long as no exception is raised; it makes no
    assertions about the resulting parse.
    """
    # Example that caused run-time error while parsing Reddit
    text = u'67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school'
    doc = EN(text)
    # Dependency labels at which left-trimming of a chunk stops.
    keep_leading = ('advmod', 'amod', 'compound')
    merge_args = []
    for chunk in doc.noun_chunks:
        # Drop leading tokens until the first one carries one of the
        # labels above, or only a single token is left.
        while len(chunk) > 1 and chunk[0].dep_ not in keep_leading:
            chunk = chunk[1:]
        if len(chunk) <= 1:
            # Single-token chunks are not merged.
            continue
        # Record plain values rather than the span itself, since the
        # document is modified by the merges below.
        merge_args.append((chunk.start_char, chunk.end_char, chunk.root.tag_,
                           chunk.text, chunk.root.ent_type_))
    for args in merge_args:
        print(args)
        # Dump the current tokens and their heads before each merge.
        for word in doc:
            print(word.idx, word.text, word.head.i, word.head.text)
        doc.merge(*args)

View File

@ -190,12 +190,6 @@ cdef class Span:
for i in range(self.start, self.end):
if self.start <= (i+self.doc.c[i].head) < self.end:
continue
# Don't allow punctuation or spaces to be the root, if there are
# better candidates
if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT):
continue
if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE):
continue
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
if words_to_root < current_best:
current_best = words_to_root
@ -244,6 +238,11 @@ cdef class Span:
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are
# better candidates
if Lexeme.c_check_flag(token.lex, IS_SPACE):
return sent_length-1
cdef int n = 0
while token.head != 0:
token += token.head