* Fix calculation of root token in Span. Now take root to be word with shortest tree path. Avoids parse trees ending up in inconsistent state, as had occurred in Issue #214.

This commit is contained in:
Matthew Honnibal 2016-01-16 15:38:50 +01:00
parent c1039fa4b4
commit 8cbcc3a799

View File

@ -125,8 +125,10 @@ cdef class Span:
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
property root: property root:
"""The first ancestor of the first word of the span that has its head """The word of the span that is highest in the parse tree, i.e. has the
outside the span. shortest path to the root of the sentence (or is the root itself).
If multiple words are equally high in the tree, the first word is taken.
For example: For example:
@ -149,45 +151,37 @@ cdef class Span:
>>> new_york.root.orth_ >>> new_york.root.orth_
'York' 'York'
When there are multiple words with external dependencies, we take the first: Here's a more complicated case, raised by Issue #214:
>>> toks[autumn].head.orth_, toks[dot].head.orth_ >>> toks = nlp(u'to, north and south carolina')
('in', 'like') >>> to, north, and_, south, carolina = toks
>>> autumn_dot = toks[autumn:] >>> south.head.text, carolina.head.text
>>> autumn_dot.root.orth_ ('north', 'to')
'Autumn'
Here 'south' is a child of 'north', which is a child of 'carolina'.
Carolina is the root of the span:
>>> south_carolina = toks[-2:]
>>> south_carolina.root.text
'carolina'
""" """
def __get__(self): def __get__(self):
self._recalculate_indices() self._recalculate_indices()
# This should probably be called 'head', and the other one called # This should probably be called 'head', and the other one called
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
cdef const TokenC* start = &self.doc.c[self.start] cdef int i
cdef const TokenC* end = &self.doc.c[self.end] cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
head = start self.doc.length)
cdef int nr_iter = 0 cdef int root = self.start
while start <= (head + head.head) < end and head.head != 0: for i in range(self.start, self.end):
head += head.head if current_best == 0:
# Guard against infinite loops break
if nr_iter >= (self.doc.length+1): words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
# Retrieve the words without getting the Python tokens, to if words_to_root < current_best:
# avoid potential problems current_best = words_to_root
try: root = i
words = [self.doc.vocab.strings[self.doc.c[i].lex.orth] for i return self.doc[root]
in range(self.doc.length)]
except:
words = '<Exception retrieving words!>'
try:
heads = [self.doc.c[i].head for i in range(self.doc.length)]
except:
heads = '<Exception retrieving heads!>'
raise RuntimeError(
"Invalid dependency parse, leading to potentially infinite loop. " +
"Please report this error on the issue tracker.\n" +
("Words: %s\n" % repr(words)) +
("Heads: %s\n" % repr(heads)))
nr_iter += 1
return self.doc[head - self.doc.c]
property lefts: property lefts:
"""Tokens that are to the left of the Span, whose head is within the Span.""" """Tokens that are to the left of the Span, whose head is within the Span."""
def __get__(self): def __get__(self):
@ -228,3 +222,16 @@ cdef class Span:
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    # Count how many head links must be followed from `token` before reaching
    # a token whose `head` field is 0.
    #
    # `token.head` is used as a relative offset: adding it to the pointer
    # moves to the next token in the chain, and an offset of 0 ends the walk.
    #
    # Returns the number of links followed (0 when `token.head` is already 0).
    # Raises RuntimeError if `sent_length` or more links are followed without
    # reaching a zero offset; the error is propagated to callers through the
    # `except -1` return convention.
    cdef int n = 0
    while token.head != 0:
        # Advance exactly one link per iteration so that `n` is the true
        # path length; stepping twice would undercount and could skip the
        # terminating token.
        token += token.head
        n += 1
        # Guard against walking forever (or off the array) on a broken tree.
        if n >= sent_length:
            raise RuntimeError(
                "Array bounds exceeded while searching for root word. This likely "
                "means the parse tree is in an invalid state. Please report this "
                "issue here: http://github.com/honnibal/spaCy/")
    return n