Mirror of https://github.com/explosion/spaCy.git
add class PseudoProjective for pseudo-projective parsing
PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme.
Parent: 8d531c958b
Commit: 4b2297d5d4
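For orientation, a minimal usage sketch of the new class (not part of the commit; it simply replays calls that the test file further down exercises, and assumes nonproj.py is importable as spacy.nonproj):

    from spacy.nonproj import PseudoProjective

    pp = PseudoProjective()
    # a non-projective tree and its labels, taken from the tests below
    heads = [1,2,2,4,5,2,7,4,2]
    labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']
    # lift non-projective arcs and decorate the affected labels (HEAD scheme)
    proj_heads, deco_labels = pp.projectivize(heads, labels)
    # proj_heads == [1,2,2,4,5,2,7,5,2]; deco_labels[7] == 'RC||OA'
    # after parsing, recover the original attachments from the decorated labels
    deproj_heads, undeco_labels = pp.deprojectivize(proj_heads, deco_labels)
    # deproj_heads == heads; undeco_labels == labels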
@@ -247,7 +247,7 @@ cdef class GoldParse:
         # projectivity here means non-proj arcs are being disconnected
         np_arcs = []
         for word in range(self.length):
-            if nonproj.is_non_projective_arc(word,self.heads):
+            if nonproj.is_nonproj_arc(word,self.heads):
                 np_arcs.append(word)
         for np_arc in np_arcs:
             self.heads[np_arc] = None
@@ -266,7 +266,7 @@ cdef class GoldParse:

     @property
     def is_projective(self):
-        return not nonproj.is_non_projective_tree(self.heads)
+        return not nonproj.is_nonproj_tree(self.heads)


 def is_punct_label(label):
spacy/nonproj.py (152 changed lines)
@@ -1,11 +1,12 @@
 from copy import copy
+from collections import Counter


-def ancestors(word, heads):
+def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
     # this function ends after at most len(heads) steps
     # because it would otherwise loop indefinitely on cycles
-    head = word
+    head = tokenid
     cnt = 0
     while heads[head] != head and cnt < len(heads):
         head = heads[head]
@@ -18,26 +19,26 @@ def ancestors(word, heads):
 def contains_cycle(heads):
     # in an acyclic tree, the path from each word following
     # the head relation upwards always ends at the root node
-    for word in range(len(heads)):
-        seen = set([word])
-        for ancestor in ancestors(word,heads):
+    for tokenid in range(len(heads)):
+        seen = set([tokenid])
+        for ancestor in ancestors(tokenid,heads):
             if ancestor in seen:
                 return seen
             seen.add(ancestor)
     return None


-def is_non_projective_arc(word, heads):
+def is_nonproj_arc(tokenid, heads):
     # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
-    # if there is a word k, h < k < d such that h is not
+    # if there is a token k, h < k < d such that h is not
     # an ancestor of k. Same for h -> d, h > d
-    head = heads[word]
-    if head == word: # root arcs cannot be non-projective
+    head = heads[tokenid]
+    if head == tokenid: # root arcs cannot be non-projective
         return False
     elif head == None: # unattached tokens cannot be non-projective
         return False

-    start, end = (head+1, word) if head < word else (word+1, head)
+    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
     for k in range(start,end):
         for ancestor in ancestors(k,heads):
             if ancestor == None: # for unattached tokens/subtrees
@@ -49,7 +50,132 @@ def is_non_projective_arc(word, heads):
     return False


-def is_non_projective_tree(heads):
+def is_nonproj_tree(heads):
     # a tree is non-projective if at least one arc is non-projective
-    return any( is_non_projective_arc(word,heads) for word in range(len(heads)) )
+    return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
+
+
+class PseudoProjective:
+    # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
+    # for doing pseudo-projective parsing
+    # implementation uses the HEAD decoration scheme
+
+    def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30):
+        # expects a sequence of pairs of head arrays and labels
+        preprocessed = []
+        for heads,labels in labeled_trees:
+            proj_heads,deco_labels = self.projectivize(heads,labels)
+            # set the label to ROOT for each root dependent
+            deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
+            preprocessed.append((proj_heads,deco_labels))
+
+        if label_freq_cutoff > 0:
+            return self._filter_labels(preprocessed,label_freq_cutoff)
+        return preprocessed
+
+
+    def projectivize(self, heads, labels):
+        # use the algorithm by Nivre & Nilsson 2005
+        # assumes heads to be a proper tree, i.e. connected and cycle-free
+        # returns a new pair (heads,labels) which encode
+        # a projective and decorated tree
+        proj_heads = copy(heads)
+        smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
+        if smallest_np_arc == None: # this sentence is already projective
+            return proj_heads, copy(labels)
+        while smallest_np_arc != None:
+            self._lift(smallest_np_arc, proj_heads)
+            smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
+        deco_labels = self._decorate(heads, proj_heads, labels)
+        return proj_heads, deco_labels
+
+
+    def deprojectivize(self, heads, labels):
+        # reattach arcs with decorated labels (following HEAD scheme)
+        # for each decorated arc X||Y, search top-down, left-to-right,
+        # breadth-first until hitting a Y then make this the new head
+        newheads, newlabels = copy(heads), copy(labels)
+        spans = None
+        for tokenid, head in enumerate(heads):
+            if labels[tokenid].find('||') != -1:
+                newlabel,_,headlabel = labels[tokenid].partition('||')
+                newhead = self._find_new_head(head,tokenid,headlabel,heads,labels,spans=spans)
+                newheads[tokenid] = newhead
+                newlabels[tokenid] = newlabel
+        return newheads, newlabels
+
+
+    def _decorate(self, heads, proj_heads, labels):
+        # uses decoration scheme HEAD from Nivre & Nilsson 2005
+        assert(len(heads) == len(proj_heads) == len(labels))
+        deco_labels = []
+        for tokenid,head in enumerate(heads):
+            if head != proj_heads[tokenid]:
+                deco_labels.append('%s||%s' % (labels[tokenid],labels[head]))
+            else:
+                deco_labels.append(labels[tokenid])
+        return deco_labels
+
+
+    def _get_smallest_nonproj_arc(self, heads):
+        # return the smallest non-proj arc or None
+        # where size is defined as the distance between dep and head
+        # and ties are broken left to right
+        smallest_size = float('inf')
+        smallest_np_arc = None
+        for tokenid,head in enumerate(heads):
+            size = abs(tokenid-head)
+            if size < smallest_size and is_nonproj_arc(tokenid,heads):
+                smallest_size = size
+                smallest_np_arc = tokenid
+        return smallest_np_arc
+
+
+    def _lift(self, tokenid, heads):
+        # reattaches a word to its grandfather
+        head = heads[tokenid]
+        ghead = heads[head]
+        # attach to ghead if head isn't attached to root else attach to root
+        heads[tokenid] = ghead if head != ghead else tokenid
+
+
+    def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None):
+        # search through the tree starting from root
+        # returns the id of the first descendant with the given label
+        # if there is none, return the current head (no change)
+        if not spans:
+            spans = self._make_span_index(heads)
+        queue = spans.get(rootid,[])
+        queue.remove(tokenid) # don't search in the subtree of the nonproj arc
+        while queue:
+            next_queue = []
+            for idx in queue:
+                if labels[idx] == headlabel:
+                    return idx
+                next_queue.extend(spans.get(idx,[]))
+            queue = next_queue
+        return heads[tokenid]
+
+
+    def _make_span_index(self, heads):
+        # stores the direct dependents for each token
+        # for searching top-down through a tree
+        spans = {}
+        for tokenid, head in enumerate(heads):
+            if tokenid == head: # root
+                continue
+            if head not in spans:
+                spans[head] = []
+            spans[head].append(tokenid)
+        return spans
+
+
+    def _filter_labels(self, labeled_trees, cutoff):
+        # throw away infrequent decorated labels
+        # can't learn them reliably anyway and keeps label set smaller
+        freqs = Counter([ label for _,labels in labeled_trees for label in labels if label.find('||') != -1 ])
+        filtered = []
+        for proj_heads,deco_labels in labeled_trees:
+            filtered_labels = [ label.partition('||')[0] if freqs.get(label,cutoff) < cutoff else label for label in deco_labels ]
+            filtered.append((proj_heads,filtered_labels))
+        return filtered
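The HEAD decoration works as follows: when projectivize lifts a dependent to a higher head, _decorate rewrites its label from X to X||Y, where Y is the dependency label carried by the original head; deprojectivize later searches breadth-first below the new head for a token labelled Y and re-attaches the dependent there. preprocess_training_data applies this to every (heads, labels) pair, relabels the self-attached root as 'ROOT', and, via _filter_labels, strips decorations rarer than label_freq_cutoff. A small sketch of the expected behaviour (my reading of the code above, not an example shipped with the commit):

    pp = PseudoProjective()
    trees = [([1,2,2,4,5,2,7,4,2],
              ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'])]
    # token 7 -> 4 is non-projective; its label 'RC' becomes 'RC||OA'
    # because token 4 itself is attached with label 'OA'
    out = pp.preprocess_training_data(trees, label_freq_cutoff=0)
    # out[0] == ([1,2,2,4,5,2,7,5,2],
    #            ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--'])
    # with the default cutoff of 30, 'RC||OA' occurs only once here,
    # so it would be filtered back to plain 'RC'
    out = pp.preprocess_training_data(trees)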
@@ -211,11 +211,6 @@ cdef class Tagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

-    def tags_from_list(self, Doc tokens, list strings):
-        assert(tokens.length == len(strings))
-        for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.c[i], strings[i])
-
     def pipe(self, stream, batch_size=1000, n_threads=2):
         for doc in stream:
             self(doc)
@@ -1,42 +1,113 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.nonproj import ancestors, contains_cycle, is_non_projective_arc, is_non_projective_tree
+from spacy.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjective

 def test_ancestors():
     tree = [1,2,2,4,5,2,2]
     cyclic_tree = [1,2,2,4,5,3,2]
     partial_tree = [1,2,2,4,5,None,2]
+    multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
     assert([ a for a in ancestors(3,tree) ] == [4,5,2])
     assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4])
     assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None])
+    assert([ a for a in ancestors(17,multirooted_tree) ] == [])

 def test_contains_cycle():
     tree = [1,2,2,4,5,2,2]
     cyclic_tree = [1,2,2,4,5,3,2]
     partial_tree = [1,2,2,4,5,None,2]
+    multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
     assert(contains_cycle(tree) == None)
     assert(contains_cycle(cyclic_tree) == set([3,4,5]))
     assert(contains_cycle(partial_tree) == None)
+    assert(contains_cycle(multirooted_tree) == None)

-def test_is_non_projective_arc():
+def test_is_nonproj_arc():
     nonproj_tree = [1,2,2,4,5,2,7,4,2]
-    assert(is_non_projective_arc(0,nonproj_tree) == False)
-    assert(is_non_projective_arc(1,nonproj_tree) == False)
-    assert(is_non_projective_arc(2,nonproj_tree) == False)
-    assert(is_non_projective_arc(3,nonproj_tree) == False)
-    assert(is_non_projective_arc(4,nonproj_tree) == False)
-    assert(is_non_projective_arc(5,nonproj_tree) == False)
-    assert(is_non_projective_arc(6,nonproj_tree) == False)
-    assert(is_non_projective_arc(7,nonproj_tree) == True)
-    assert(is_non_projective_arc(8,nonproj_tree) == False)
     partial_tree = [1,2,2,4,5,None,7,4,2]
-    assert(is_non_projective_arc(7,partial_tree) == False)
     multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
+    assert(is_nonproj_arc(0,nonproj_tree) == False)
+    assert(is_nonproj_arc(1,nonproj_tree) == False)
+    assert(is_nonproj_arc(2,nonproj_tree) == False)
+    assert(is_nonproj_arc(3,nonproj_tree) == False)
+    assert(is_nonproj_arc(4,nonproj_tree) == False)
+    assert(is_nonproj_arc(5,nonproj_tree) == False)
+    assert(is_nonproj_arc(6,nonproj_tree) == False)
+    assert(is_nonproj_arc(7,nonproj_tree) == True)
+    assert(is_nonproj_arc(8,nonproj_tree) == False)
+    assert(is_nonproj_arc(7,partial_tree) == False)
+    assert(is_nonproj_arc(17,multirooted_tree) == False)
+    assert(is_nonproj_arc(16,multirooted_tree) == True)

-def test_is_non_projective_tree():
+def test_is_nonproj_tree():
     proj_tree = [1,2,2,4,5,2,7,5,2]
     nonproj_tree = [1,2,2,4,5,2,7,4,2]
     partial_tree = [1,2,2,4,5,None,7,4,2]
-    assert(is_non_projective_tree(proj_tree) == False)
-    assert(is_non_projective_tree(nonproj_tree) == True)
-    assert(is_non_projective_tree(partial_tree) == False)
     multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
+    assert(is_nonproj_tree(proj_tree) == False)
+    assert(is_nonproj_tree(nonproj_tree) == True)
+    assert(is_nonproj_tree(partial_tree) == False)
+    assert(is_nonproj_tree(multirooted_tree) == True)
+
+def test_pseudoprojective():
+    tree = [1,2,2]
+    nonproj_tree = [1,2,2,4,5,2,7,4,2]
+    labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']
+    nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1]
+    labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']
+
+    pp = PseudoProjective()
+
+    assert(pp._make_span_index(tree) == { 1:[0], 2:[1] })
+    assert(pp._make_span_index(nonproj_tree) == { 1:[0], 2:[1,5,8], 4:[3,7], 5:[4], 7:[6] })
+
+    pp._lift(0,tree)
+    assert(tree == [2,2,2])
+
+    np_arc = pp._get_smallest_nonproj_arc(nonproj_tree)
+    assert(np_arc == 7)
+
+    np_arc = pp._get_smallest_nonproj_arc(nonproj_tree2)
+    assert(np_arc == 10)
+
+    proj_heads, deco_labels = pp.projectivize(nonproj_tree,labels)
+    assert(proj_heads == [1,2,2,4,5,2,7,5,2])
+    assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--'])
+    deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels)
+    assert(deproj_heads == nonproj_tree)
+    assert(undeco_labels == labels)
+
+    proj_heads, deco_labels = pp.projectivize(nonproj_tree2,labels2)
+    assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1])
+    assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--'])
+    deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels)
+    assert(deproj_heads == nonproj_tree2)
+    assert(undeco_labels == labels2)
+
+    # if decoration is wrong such that there is no head with the desired label
+    # the structure is kept and the label is undecorated
+    deproj_heads, undeco_labels = pp.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--'])
+    assert(deproj_heads == [1,2,2,4,5,2,7,5,2])
+    assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'])
+
+    # if there are two potential new heads, the first one is chosen even if it's wrong
+    deproj_heads, undeco_labels = pp.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \
+        ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--'])
+    assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1])
+    assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'])