* Rename Span.head to Span.root.

This commit is contained in:
Matthew Honnibal 2015-07-09 17:30:58 +02:00
parent c0255ed7d8
commit 53d1f5b2eb
2 changed files with 62 additions and 21 deletions

View File

@ -5,6 +5,10 @@ from collections import defaultdict
cdef class Span: cdef class Span:
"""A slice from a Doc object.""" """A slice from a Doc object."""
def __cinit__(self, Doc tokens, int start, int end, int label=0): def __cinit__(self, Doc tokens, int start, int end, int label=0):
if start < 0:
start = tokens.length - start
if end < 0:
end = tokens.length - end
self._seq = tokens self._seq = tokens
self.start = start self.start = start
self.end = end self.end = end
@ -37,20 +41,48 @@ cdef class Span:
for i in range(self.start, self.end): for i in range(self.start, self.end):
yield self._seq[i] yield self._seq[i]
property head: property root:
"""The highest Token in the dependency tree in the Span, or None if """The first ancestor of the first word of the span that has its head
the Span is not internally connected (i.e. there are multiple heads). outside the span.
For example:
>>> toks = nlp(u'I like New York in Autumn.')
Let's name the indices --- easier than writing "toks[4]" etc.
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
The head of 'new' is 'York', and the head of 'York' is 'like'
>>> toks[new].head.orth_
'York'
>>> toks[york].head.orth_
'like'
Create a span for "New York". Its root is "York".
>>> new_york = toks[new:york+1]
>>> new_york.root.orth_
'York'
When there are multiple words with external dependencies, we take the first:
>>> toks[autumn].head.orth_, toks[dot].head.orth_
('in', 'like')
>>> autumn_dot = toks[autumn:]
>>> autumn_dot.root.orth_
'Autumn'
""" """
def __get__(self): def __get__(self):
heads = [] # This should probably be called 'head', and the other one called
for token in self: # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
head_i = token.head.i cdef const TokenC* start = &self._seq.data[self.start]
if token.head is token or head_i >= self.end or head_i < self.start: cdef const TokenC* end = &self._seq.data[self.end]
heads.append(token) head = start
if len(heads) != 1: while start <= (head + head.head) < end and head.head != 0:
return None head += head.head
else: return self[head - self._seq.data]
return heads[0]
property lefts: property lefts:
"""Tokens that are to the left of the Span, whose head is within the Span.""" """Tokens that are to the left of the Span, whose head is within the Span."""
@ -68,6 +100,14 @@ cdef class Span:
if right.i >= self.end: if right.i >= self.end:
yield right yield right
property subtree:
"""Tokens of the span together with the subtrees of its outside dependents.

Yields, in order: the subtree of each token in self.lefts, then the
tokens of the span itself, then the subtree of each token in self.rights.
"""
def __get__(self):
for word in self.lefts:
yield from word.subtree
yield from self
for word in self.rights:
yield from word.subtree
property orth_: property orth_:
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ''.join([t.string for t in self]).strip()
@ -84,10 +124,3 @@ cdef class Span:
def __get__(self): def __get__(self):
return self._seq.vocab.strings[self.label] return self._seq.vocab.strings[self.label]
property subtree:
def __get__(self):
for word in self.lefts:
yield from word.subtree
yield from self
for word in self.rights:
yield from word.subtree

View File

@ -5,8 +5,8 @@ import pytest
@pytest.fixture @pytest.fixture
def doc(en_nlp): def doc(EN):
return en_nlp('This is a sentence. This is another sentence. And a third.') return EN('This is a sentence. This is another sentence. And a third.')
def test_sent_spans(doc): def test_sent_spans(doc):
@ -15,3 +15,11 @@ def test_sent_spans(doc):
assert sents[0].end == 5 assert sents[0].end == 5
assert len(sents) == 3 assert len(sents) == 3
assert sum(len(sent) for sent in sents) == len(doc) assert sum(len(sent) for sent in sents) == len(doc)
def test_root(doc):
    """Check Span.root on the noun phrase "a sentence" of the first sentence."""
    # Tokens 2 and 3 of the fixture text are "a" and "sentence".
    np = doc[2:4]
    assert len(np) == 2
    assert np.orth_ == 'a sentence'
    assert np.root.orth_ == 'sentence'
    # The root's own head lies outside the span: the verb "is".
    assert np.root.head.orth_ == 'is'