mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions
This commit is contained in:
parent
1508528c8c
commit
46e3f979f1
|
@ -118,15 +118,18 @@ class PseudoProjectivity:
|
|||
# reattach arcs with decorated labels (following HEAD scheme)
|
||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
||||
# breadth-first until hitting a Y then make this the new head
|
||||
parse = tokens.to_array([HEAD, DEP])
|
||||
labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
|
||||
#parse = tokens.to_array([HEAD, DEP])
|
||||
for token in tokens:
|
||||
if cls.is_decorated(token.dep_):
|
||||
newlabel,headlabel = cls.decompose(token.dep_)
|
||||
newhead = cls._find_new_head(token,headlabel)
|
||||
parse[token.i,1] = tokens.vocab.strings[newlabel]
|
||||
parse[token.i,0] = newhead.i - token.i
|
||||
tokens.from_array([HEAD, DEP],parse)
|
||||
token.head = newhead
|
||||
token.dep_ = newlabel
|
||||
|
||||
# tokens.attach(token,newhead,newlabel)
|
||||
#parse[token.i,1] = tokens.vocab.strings[newlabel]
|
||||
#parse[token.i,0] = newhead.i - token.i
|
||||
#tokens.from_array([HEAD, DEP],parse)
|
||||
|
||||
|
||||
@classmethod
|
||||
|
@ -168,7 +171,7 @@ class PseudoProjectivity:
|
|||
|
||||
@classmethod
|
||||
def _find_new_head(cls, token, headlabel):
|
||||
# search through the tree starting from root
|
||||
# search through the tree starting from the head of the given token
|
||||
# returns the id of the first descendant with the given label
|
||||
# if there is none, return the current head (no change)
|
||||
queue = [token.head]
|
||||
|
@ -176,8 +179,8 @@ class PseudoProjectivity:
|
|||
next_queue = []
|
||||
for qtoken in queue:
|
||||
for child in qtoken.children:
|
||||
if child == token:
|
||||
continue
|
||||
if child.is_space: continue
|
||||
if child == token: continue
|
||||
if child.dep_ == headlabel:
|
||||
return child
|
||||
next_queue.append(child)
|
||||
|
|
|
@ -62,3 +62,67 @@ def test_vectors(EN):
|
|||
assert sum(apples.vector) != sum(oranges.vector)
|
||||
assert apples.vector_norm != oranges.vector_norm
|
||||
|
||||
@pytest.mark.models
|
||||
def test_ancestors(EN):
|
||||
# the structure of this sentence depends on the English annotation scheme
|
||||
tokens = EN(u'Yesterday I saw a dog that barked loudly.')
|
||||
ancestors = [ t.orth_ for t in tokens[6].ancestors ]
|
||||
assert ancestors == ['dog','saw']
|
||||
ancestors = [ t.orth_ for t in tokens[1].ancestors ]
|
||||
assert ancestors == ['saw']
|
||||
ancestors = [ t.orth_ for t in tokens[2].ancestors ]
|
||||
assert ancestors == []
|
||||
|
||||
assert tokens[2].is_ancestor_of(tokens[7])
|
||||
assert not tokens[6].is_ancestor_of(tokens[2])
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_head_setter(EN):
|
||||
# the structure of this sentence depends on the English annotation scheme
|
||||
yesterday, i, saw, a, dog, that, barked, loudly, dot = EN(u'Yesterday I saw a dog that barked loudly.')
|
||||
assert barked.n_lefts == 1
|
||||
assert barked.n_rights == 1
|
||||
assert barked.left_edge == that
|
||||
assert barked.right_edge == loudly
|
||||
|
||||
assert dog.n_lefts == 1
|
||||
assert dog.n_rights == 1
|
||||
assert dog.left_edge == a
|
||||
assert dog.right_edge == loudly
|
||||
|
||||
assert a.n_lefts == 0
|
||||
assert a.n_rights == 0
|
||||
assert a.left_edge == a
|
||||
assert a.right_edge == a
|
||||
|
||||
assert saw.left_edge == yesterday
|
||||
assert saw.right_edge == dot
|
||||
|
||||
barked.head = a
|
||||
|
||||
assert barked.n_lefts == 1
|
||||
assert barked.n_rights == 1
|
||||
assert barked.left_edge == that
|
||||
assert barked.right_edge == loudly
|
||||
|
||||
assert a.n_lefts == 0
|
||||
assert a.n_rights == 1
|
||||
assert a.left_edge == a
|
||||
assert a.right_edge == loudly
|
||||
|
||||
assert dog.n_lefts == 1
|
||||
assert dog.n_rights == 0
|
||||
assert dog.left_edge == a
|
||||
assert dog.right_edge == loudly
|
||||
|
||||
assert saw.left_edge == yesterday
|
||||
assert saw.right_edge == dot
|
||||
|
||||
yesterday.head = that
|
||||
|
||||
assert that.left_edge == yesterday
|
||||
assert barked.left_edge == yesterday
|
||||
assert a.left_edge == yesterday
|
||||
assert dog.left_edge == yesterday
|
||||
assert saw.left_edge == yesterday
|
||||
|
|
|
@ -6,7 +6,7 @@ from .doc cimport Doc
|
|||
|
||||
cdef class Token:
|
||||
cdef Vocab vocab
|
||||
cdef const TokenC* c
|
||||
cdef TokenC* c
|
||||
cdef readonly int i
|
||||
cdef int array_len
|
||||
cdef readonly Doc doc
|
||||
|
|
|
@ -142,6 +142,8 @@ cdef class Token:
|
|||
property dep:
|
||||
def __get__(self):
|
||||
return self.c.dep
|
||||
def __set__(self, int label):
|
||||
self.c.dep = label
|
||||
|
||||
property has_vector:
|
||||
def __get__(self):
|
||||
|
@ -250,10 +252,122 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.doc[self.c.r_edge]
|
||||
|
||||
property ancestors:
|
||||
def __get__(self):
|
||||
cdef const TokenC* head_ptr = self.c
|
||||
# guard against infinite loop, no token can have
|
||||
# more ancestors than tokens in the tree
|
||||
cdef int i = 0
|
||||
while head_ptr.head != 0 and i < self.doc.length:
|
||||
head_ptr += head_ptr.head
|
||||
yield self.doc[head_ptr - (self.c - self.i)]
|
||||
i += 1
|
||||
|
||||
def is_ancestor_of(self, descendant):
|
||||
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
||||
|
||||
property head:
|
||||
def __get__(self):
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
return self.doc[self.i + self.c.head]
|
||||
def __set__(self, Token new_head):
|
||||
# this function sets the head of self to new_head
|
||||
# and updates the counters for left/right dependents
|
||||
# and left/right corner for the new and the old head
|
||||
|
||||
# do nothing if old head is new head
|
||||
if self.i + self.c.head == new_head.i:
|
||||
return
|
||||
|
||||
cdef Token old_head = self.head
|
||||
cdef int rel_newhead_i = new_head.i - self.i
|
||||
|
||||
# is the new head a descendant of the old head
|
||||
cdef bint is_desc = old_head.is_ancestor_of(new_head)
|
||||
|
||||
cdef int token_i
|
||||
cdef int new_edge
|
||||
cdef Token anc
|
||||
|
||||
# update number of deps of old head
|
||||
if self.c.head > 0: # left dependent
|
||||
old_head.c.l_kids -= 1
|
||||
if self.c.l_edge == old_head.c.l_edge:
|
||||
# the token dominates the left edge so the left edge of the head
|
||||
# may change when the token is reattached
|
||||
# it may not change if the new head is a descendant of the current head
|
||||
|
||||
# find new l_edge if new head is not a descendant of old head
|
||||
# a new l_edge is any token between l_edge and old_head
|
||||
# that is a descendant of old_head but not of self
|
||||
new_edge = self.c.l_edge
|
||||
if not is_desc:
|
||||
for token_i in range(old_head.l_edge+1,old_head.i):
|
||||
if self.doc.c[token_i].l_kids == 0: # only a token without left deps can be a left edge
|
||||
if self.is_ancestor_of(self.doc[token_i]):
|
||||
continue
|
||||
if old_head.is_ancestor_of(self.doc[token_i]):
|
||||
new_edge = token_i
|
||||
break
|
||||
else: # set the new l_edge to old_head if no other was found
|
||||
new_edge = old_head.i
|
||||
|
||||
# assign new l_edge to old_head
|
||||
old_head.c.l_edge = new_edge
|
||||
# walk up the tree from old_head and assign new l_edge to ancestors
|
||||
# until an ancestor already has an l_edge that's further left
|
||||
for anc in old_head.ancestors:
|
||||
if anc.c.l_edge <= new_edge:
|
||||
break
|
||||
anc.c.l_edge = new_edge
|
||||
|
||||
elif self.c.head < 0: # right dependent
|
||||
old_head.c.r_kids -= 1
|
||||
# do the same thing as for l_edge
|
||||
if self.c.r_edge == old_head.c.r_edge:
|
||||
new_edge = self.c.r_edge
|
||||
if not is_desc:
|
||||
for token_i in range(old_head.r_edge-1,old_head.i,-1):
|
||||
if self.doc.c[token_i].r_kids == 0:
|
||||
if self.is_ancestor_of(self.doc[token_i]):
|
||||
continue
|
||||
if old_head.is_ancestor_of(self.doc[token_i]):
|
||||
new_edge = token_i
|
||||
break
|
||||
else:
|
||||
new_edge = old_head.i
|
||||
old_head.c.r_edge = new_edge
|
||||
for anc in old_head.ancestors:
|
||||
if anc.c.r_edge >= new_edge:
|
||||
break
|
||||
anc.c.r_edge = new_edge
|
||||
|
||||
# update number of deps of new head
|
||||
if rel_newhead_i > 0: # left dependent
|
||||
new_head.c.l_kids += 1
|
||||
# walk up the tree from new head and set l_edge to self.l_edge
|
||||
# until you hit a token with an l_edge further to the left
|
||||
if self.c.l_edge < new_head.c.l_edge:
|
||||
new_edge = self.c.l_edge
|
||||
new_head.c.l_edge = new_edge
|
||||
for anc in new_head.ancestors:
|
||||
if anc.c.l_edge <= new_edge:
|
||||
break
|
||||
anc.c.l_edge = new_edge
|
||||
|
||||
elif rel_newhead_i < 0: # right dependent
|
||||
new_head.c.r_kids += 1
|
||||
# do the same as for l_edge
|
||||
if self.c.r_edge > new_head.c.r_edge:
|
||||
new_edge = self.c.r_edge
|
||||
new_head.c.r_edge = new_edge
|
||||
for anc in new_head.ancestors:
|
||||
if anc.c.r_edge >= new_edge:
|
||||
break
|
||||
anc.c.r_edge = new_edge
|
||||
|
||||
# set new head
|
||||
self.c.head = rel_newhead_i
|
||||
|
||||
property conjuncts:
|
||||
def __get__(self):
|
||||
|
@ -325,6 +439,8 @@ cdef class Token:
|
|||
property dep_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
def __set__(self, unicode label):
|
||||
self.c.dep = self.vocab.strings[label]
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||
|
|
Loading…
Reference in New Issue
Block a user