diff --git a/spacy/gold.pyx b/spacy/gold.pyx index dd29a42c7..7ab034195 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -247,7 +247,7 @@ cdef class GoldParse: # projectivity here means non-proj arcs are being disconnected np_arcs = [] for word in range(self.length): - if nonproj.is_non_projective_arc(word,self.heads): + if nonproj.is_nonproj_arc(word,self.heads): np_arcs.append(word) for np_arc in np_arcs: self.heads[np_arc] = None @@ -266,7 +266,7 @@ cdef class GoldParse: @property def is_projective(self): - return not nonproj.is_non_projective_tree(self.heads) + return not nonproj.is_nonproj_tree(self.heads) def is_punct_label(label): diff --git a/spacy/nonproj.py b/spacy/nonproj.py index 58f9f3e9b..facf9f299 100644 --- a/spacy/nonproj.py +++ b/spacy/nonproj.py @@ -1,11 +1,12 @@ +from copy import copy +from collections import Counter - -def ancestors(word, heads): +def ancestors(tokenid, heads): # returns all words going from the word up the path to the root # the path to root cannot be longer than the number of words in the sentence # this function ends after at most len(heads) steps # because it would otherwise loop indefinitely on cycles - head = word + head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] @@ -18,26 +19,26 @@ def ancestors(word, heads): def contains_cycle(heads): # in an acyclic tree, the path from each word following # the head relation upwards always ends at the root node - for word in range(len(heads)): - seen = set([word]) - for ancestor in ancestors(word,heads): + for tokenid in range(len(heads)): + seen = set([tokenid]) + for ancestor in ancestors(tokenid,heads): if ancestor in seen: return seen seen.add(ancestor) return None -def is_non_projective_arc(word, heads): +def is_nonproj_arc(tokenid, heads): # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective - # if there is a word k, h < k < d such that h is not + # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d - head = heads[word] - if head == word: # root arcs cannot be non-projective + head = heads[tokenid] + if head == tokenid: # root arcs cannot be non-projective return False elif head == None: # unattached tokens cannot be non-projective return False - start, end = (head+1, word) if head < word else (word+1, head) + start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) for k in range(start,end): for ancestor in ancestors(k,heads): if ancestor == None: # for unattached tokens/subtrees @@ -49,7 +50,132 @@ def is_non_projective_arc(word, heads): return False -def is_non_projective_tree(heads): +def is_nonproj_tree(heads): # a tree is non-projective if at least one arc is non-projective - return any( is_non_projective_arc(word,heads) for word in range(len(heads)) ) + return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) + +class PseudoProjective: + # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 + # for doing pseudo-projective parsing + # implementation uses the HEAD decoration scheme + + def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30): + # expects a sequence of pairs of head arrays and labels + preprocessed = [] + for heads,labels in labeled_trees: + proj_heads,deco_labels = self.projectivize(heads,labels) + # set the label to ROOT for each root dependent + deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + preprocessed.append((proj_heads,deco_labels)) + + if label_freq_cutoff > 0: + return self._filter_labels(preprocessed,label_freq_cutoff) + return preprocessed + + + def projectivize(self, heads, labels): + # use the algorithm by Nivre & Nilsson 2005 + # assumes heads to be a proper tree, i.e. connected and cycle-free + # returns a new pair (heads,labels) which encode + # a projective and decorated tree + proj_heads = copy(heads) + smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) + if smallest_np_arc == None: # this sentence is already projective + return proj_heads, copy(labels) + while smallest_np_arc != None: + self._lift(smallest_np_arc, proj_heads) + smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) + deco_labels = self._decorate(heads, proj_heads, labels) + return proj_heads, deco_labels + + + def deprojectivize(self, heads, labels): + # reattach arcs with decorated labels (following HEAD scheme) + # for each decorated arc X||Y, search top-down, left-to-right, + # breadth-first until hitting a Y then make this the new head + newheads, newlabels = copy(heads), copy(labels) + spans = None + for tokenid, head in enumerate(heads): + if labels[tokenid].find('||') != -1: + newlabel,_,headlabel = labels[tokenid].partition('||') + newhead = self._find_new_head(head,tokenid,headlabel,heads,labels,spans=spans) + newheads[tokenid] = newhead + newlabels[tokenid] = newlabel + return newheads, newlabels + + + def _decorate(self, heads, proj_heads, labels): + # uses decoration scheme HEAD from Nivre & Nilsson 2005 + assert(len(heads) == len(proj_heads) == len(labels)) + deco_labels = [] + for tokenid,head in enumerate(heads): + if head != proj_heads[tokenid]: + deco_labels.append('%s||%s' % (labels[tokenid],labels[head])) + else: + deco_labels.append(labels[tokenid]) + return deco_labels + + + def _get_smallest_nonproj_arc(self, heads): + # return the smallest non-proj arc or None + # where size is defined as the distance between dep and head + # and ties are broken left to right + smallest_size = float('inf') + smallest_np_arc = None + for tokenid,head in enumerate(heads): + size = abs(tokenid-head) + if size < smallest_size and is_nonproj_arc(tokenid,heads): + smallest_size = size + smallest_np_arc = tokenid + return smallest_np_arc + + + def _lift(self, tokenid, heads): + # reattaches a word to it's grandfather + head = heads[tokenid] + ghead = heads[head] + # attach to ghead if head isn't attached to root else attach to root + heads[tokenid] = ghead if head != ghead else tokenid + + + def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None): + # search through the tree starting from root + # returns the id of the first descendant with the given label + # if there is none, return the current head (no change) + if not spans: + spans = self._make_span_index(heads) + queue = spans.get(rootid,[]) + queue.remove(tokenid) # don't search in the subtree of the nonproj arc + while queue: + next_queue = [] + for idx in queue: + if labels[idx] == headlabel: + return idx + next_queue.extend(spans.get(idx,[])) + queue = next_queue + return heads[tokenid] + + + def _make_span_index(self, heads): + # stores the direct dependents for each token + # for searching top-down through a tree + spans = {} + for tokenid, head in enumerate(heads): + if tokenid == head: # root + continue + if head not in spans: + spans[head] = [] + spans[head].append(tokenid) + return spans + + + def _filter_labels(self, labeled_trees, cutoff): + # throw away infrequent decorated labels + # can't learn them reliably anyway and keeps label set smaller + freqs = Counter([ label for _,labels in labeled_trees for label in labels if label.find('||') != -1 ]) + filtered = [] + for proj_heads,deco_labels in labeled_trees: + filtered_labels = [ label.partition('||')[0] if freqs.get(label,cutoff) < cutoff else label for label in deco_labels ] + filtered.append((proj_heads,filtered_labels)) + return filtered diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 1c5baced7..26f8fd3e5 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -211,11 +211,6 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length - def tags_from_list(self, Doc tokens, list strings): - assert(tokens.length == len(strings)) - for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.c[i], strings[i]) - def pipe(self, stream, batch_size=1000, n_threads=2): for doc in stream: self(doc) diff --git a/spacy/tests/test_nonproj.py b/spacy/tests/test_nonproj.py index bd7f12bff..d5290e342 100644 --- a/spacy/tests/test_nonproj.py +++ b/spacy/tests/test_nonproj.py @@ -1,42 +1,113 @@ from __future__ import unicode_literals import pytest -from spacy.nonproj import ancestors, contains_cycle, is_non_projective_arc, is_non_projective_tree +from spacy.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjective def test_ancestors(): tree = [1,2,2,4,5,2,2] cyclic_tree = [1,2,2,4,5,3,2] partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] assert([ a for a in ancestors(3,tree) ] == [4,5,2]) assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4]) assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None]) + assert([ a for a in ancestors(17,multirooted_tree) ] == []) def test_contains_cycle(): tree = [1,2,2,4,5,2,2] cyclic_tree = [1,2,2,4,5,3,2] partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] assert(contains_cycle(tree) == None) assert(contains_cycle(cyclic_tree) == set([3,4,5])) assert(contains_cycle(partial_tree) == None) + assert(contains_cycle(multirooted_tree) == None) -def test_is_non_projective_arc(): +def test_is_nonproj_arc(): nonproj_tree = [1,2,2,4,5,2,7,4,2] - assert(is_non_projective_arc(0,nonproj_tree) == False) - assert(is_non_projective_arc(1,nonproj_tree) == False) - assert(is_non_projective_arc(2,nonproj_tree) == False) - assert(is_non_projective_arc(3,nonproj_tree) == False) - assert(is_non_projective_arc(4,nonproj_tree) == False) - assert(is_non_projective_arc(5,nonproj_tree) == False) - assert(is_non_projective_arc(6,nonproj_tree) == False) - assert(is_non_projective_arc(7,nonproj_tree) == True) - assert(is_non_projective_arc(8,nonproj_tree) == False) partial_tree = [1,2,2,4,5,None,7,4,2] - assert(is_non_projective_arc(7,partial_tree) == False) + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_arc(0,nonproj_tree) == False) + assert(is_nonproj_arc(1,nonproj_tree) == False) + assert(is_nonproj_arc(2,nonproj_tree) == False) + assert(is_nonproj_arc(3,nonproj_tree) == False) + assert(is_nonproj_arc(4,nonproj_tree) == False) + assert(is_nonproj_arc(5,nonproj_tree) == False) + assert(is_nonproj_arc(6,nonproj_tree) == False) + assert(is_nonproj_arc(7,nonproj_tree) == True) + assert(is_nonproj_arc(8,nonproj_tree) == False) + assert(is_nonproj_arc(7,partial_tree) == False) + assert(is_nonproj_arc(17,multirooted_tree) == False) + assert(is_nonproj_arc(16,multirooted_tree) == True) -def test_is_non_projective_tree(): +def test_is_nonproj_tree(): proj_tree = [1,2,2,4,5,2,7,5,2] nonproj_tree = [1,2,2,4,5,2,7,4,2] partial_tree = [1,2,2,4,5,None,7,4,2] - assert(is_non_projective_tree(proj_tree) == False) - assert(is_non_projective_tree(nonproj_tree) == True) - assert(is_non_projective_tree(partial_tree) == False) + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_tree(proj_tree) == False) + assert(is_nonproj_tree(nonproj_tree) == True) + assert(is_nonproj_tree(partial_tree) == False) + assert(is_nonproj_tree(multirooted_tree) == True) + +def test_pseudoprojective(): + tree = [1,2,2] + nonproj_tree = [1,2,2,4,5,2,7,4,2] + labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'] + nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1] + labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'] + + pp = PseudoProjective() + + assert(pp._make_span_index(tree) == { 1:[0], 2:[1] }) + assert(pp._make_span_index(nonproj_tree) == { 1:[0], 2:[1,5,8], 4:[3,7], 5:[4], 7:[6] }) + + pp._lift(0,tree) + assert(tree == [2,2,2]) + + np_arc = pp._get_smallest_nonproj_arc(nonproj_tree) + assert(np_arc == 7) + + np_arc = pp._get_smallest_nonproj_arc(nonproj_tree2) + assert(np_arc == 10) + + proj_heads, deco_labels = pp.projectivize(nonproj_tree,labels) + assert(proj_heads == [1,2,2,4,5,2,7,5,2]) + assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--']) + deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) + assert(deproj_heads == nonproj_tree) + assert(undeco_labels == labels) + + proj_heads, deco_labels = pp.projectivize(nonproj_tree2,labels2) + assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]) + assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) + deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) + assert(deproj_heads == nonproj_tree2) + assert(undeco_labels == labels2) + + # if decoration is wrong such that there is no head with the desired label + # the structure is kept and the label is undecorated + deproj_heads, undeco_labels = pp.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--']) + assert(deproj_heads == [1,2,2,4,5,2,7,5,2]) + assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']) + + # if there are two potential new heads, the first one is chosen even if it's wrong + deproj_heads, undeco_labels = pp.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \ + ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) + assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) + assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']) + + + + + + + + + + + + + + +