mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Rename Span.head to Span.root.
This commit is contained in:
parent
c0255ed7d8
commit
53d1f5b2eb
|
@ -5,6 +5,10 @@ from collections import defaultdict
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
"""A slice from a Doc object."""
|
"""A slice from a Doc object."""
|
||||||
def __cinit__(self, Doc tokens, int start, int end, int label=0):
|
def __cinit__(self, Doc tokens, int start, int end, int label=0):
|
||||||
|
if start < 0:
|
||||||
|
start = tokens.length - start
|
||||||
|
if end < 0:
|
||||||
|
end = tokens.length - end
|
||||||
self._seq = tokens
|
self._seq = tokens
|
||||||
self.start = start
|
self.start = start
|
||||||
self.end = end
|
self.end = end
|
||||||
|
@ -37,20 +41,48 @@ cdef class Span:
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.start, self.end):
|
||||||
yield self._seq[i]
|
yield self._seq[i]
|
||||||
|
|
||||||
property head:
|
property root:
|
||||||
"""The highest Token in the dependency tree in the Span, or None if
|
"""The first ancestor of the first word of the span that has its head
|
||||||
the Span is not internally connected (i.e. there are multiple heads).
|
outside the span.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
>>> toks = nlp(u'I like New York in Autumn.')
|
||||||
|
|
||||||
|
Let's name the indices --- easier than writing "toks[4]" etc.
|
||||||
|
|
||||||
|
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||||
|
|
||||||
|
The head of 'new' is 'York', and the head of 'York' is 'like'
|
||||||
|
|
||||||
|
>>> toks[new].head.orth_
|
||||||
|
'York'
|
||||||
|
>>> toks[york].head.orth_
|
||||||
|
'like'
|
||||||
|
|
||||||
|
Create a span for "New York". Its root is "York".
|
||||||
|
|
||||||
|
>>> new_york = toks[new:york+1]
|
||||||
|
>>> new_york.root.orth_
|
||||||
|
'York'
|
||||||
|
|
||||||
|
When there are multiple words with external dependencies, we take the first:
|
||||||
|
|
||||||
|
>>> toks[autumn].head.orth_, toks[dot].head.orth_
|
||||||
|
('in', 'like')
|
||||||
|
>>> autumn_dot = toks[autumn:]
|
||||||
|
>>> autumn_dot.root.orth_
|
||||||
|
'Autumn'
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
heads = []
|
# This should probably be called 'head', and the other one called
|
||||||
for token in self:
|
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
||||||
head_i = token.head.i
|
cdef const TokenC* start = &self._seq.data[self.start]
|
||||||
if token.head is token or head_i >= self.end or head_i < self.start:
|
cdef const TokenC* end = &self._seq.data[self.end]
|
||||||
heads.append(token)
|
head = start
|
||||||
if len(heads) != 1:
|
while start <= (head + head.head) < end and head.head != 0:
|
||||||
return None
|
head += head.head
|
||||||
else:
|
return self[head - self._seq.data]
|
||||||
return heads[0]
|
|
||||||
|
|
||||||
property lefts:
|
property lefts:
|
||||||
"""Tokens that are to the left of the Span, whose head is within the Span."""
|
"""Tokens that are to the left of the Span, whose head is within the Span."""
|
||||||
|
@ -68,6 +100,14 @@ cdef class Span:
|
||||||
if right.i >= self.end:
|
if right.i >= self.end:
|
||||||
yield right
|
yield right
|
||||||
|
|
||||||
|
property subtree:
|
||||||
|
def __get__(self):
|
||||||
|
for word in self.lefts:
|
||||||
|
yield from word.subtree
|
||||||
|
yield from self
|
||||||
|
for word in self.rights:
|
||||||
|
yield from word.subtree
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ''.join([t.string for t in self]).strip()
|
return ''.join([t.string for t in self]).strip()
|
||||||
|
@ -84,10 +124,3 @@ cdef class Span:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.vocab.strings[self.label]
|
return self._seq.vocab.strings[self.label]
|
||||||
|
|
||||||
property subtree:
|
|
||||||
def __get__(self):
|
|
||||||
for word in self.lefts:
|
|
||||||
yield from word.subtree
|
|
||||||
yield from self
|
|
||||||
for word in self.rights:
|
|
||||||
yield from word.subtree
|
|
||||||
|
|
|
@ -5,8 +5,8 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(en_nlp):
|
def doc(EN):
|
||||||
return en_nlp('This is a sentence. This is another sentence. And a third.')
|
return EN('This is a sentence. This is another sentence. And a third.')
|
||||||
|
|
||||||
|
|
||||||
def test_sent_spans(doc):
|
def test_sent_spans(doc):
|
||||||
|
@ -15,3 +15,11 @@ def test_sent_spans(doc):
|
||||||
assert sents[0].end == 5
|
assert sents[0].end == 5
|
||||||
assert len(sents) == 3
|
assert len(sents) == 3
|
||||||
assert sum(len(sent) for sent in sents) == len(doc)
|
assert sum(len(sent) for sent in sents) == len(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_root(doc):
    """Check ``Span.root`` on the two-token span ``doc[2:4]`` ("a sentence").

    The span's root is expected to be "sentence", and that root token's
    head is expected to be "is".
    """
    np = doc[2:4]
    assert len(np) == 2
    assert np.orth_ == 'a sentence'
    assert np.root.orth_ == 'sentence'
    # Inspect the head of the span's root token via the local span `np`.
    assert np.root.head.orth_ == 'is'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user