Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 01:34:30 +03:00)
Commit 4b2297d5d4: PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme.

from copy import copy
from collections import Counter


def ancestors(tokenid, heads):
    # Yields all heads on the path from the given word up to the root.
    # The path to the root cannot be longer than the number of words in the
    # sentence, so this function ends after at most len(heads) steps and
    # therefore cannot loop indefinitely on cycles.
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
        if head is None:
            break

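# A small usage sketch for ancestors() (the heads array below is made-up
# example data, not taken from the original file): heads[i] is the index of
# token i's head, heads[i] == i marks the root. Here token 2 is the root,
# tokens 0, 1 and 3 attach to it, and token 4 attaches to token 3.
#
#     heads = [2, 2, 2, 2, 3]
#     list(ancestors(4, heads))   # -> [3, 2]
#     list(ancestors(2, heads))   # -> [] (the root has no ancestors)
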
def contains_cycle(heads):
    # In an acyclic tree, following the head relation upwards from any word
    # always ends at the root node.
    for tokenid in range(len(heads)):
        seen = set([tokenid])
        for ancestor in ancestors(tokenid, heads):
            if ancestor in seen:
                return seen
            seen.add(ancestor)
    return None

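# Sketch for contains_cycle(), with the same made-up encoding as above:
#
#     contains_cycle([2, 2, 2, 2, 3])   # -> None (proper tree)
#     contains_cycle([1, 2, 1])         # -> {0, 1, 2} (tokens 1 and 2 head each other)
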
def is_nonproj_arc(tokenid, heads):
    # Definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
    # if there is a token k, h < k < d, such that h is not an ancestor of k.
    # The same holds for h -> d with h > d.
    head = heads[tokenid]
    if head == tokenid:    # root arcs cannot be non-projective
        return False
    elif head is None:     # unattached tokens cannot be non-projective
        return False

    start, end = (head + 1, tokenid) if head < tokenid else (tokenid + 1, head)
    for k in range(start, end):
        for ancestor in ancestors(k, heads):
            if ancestor is None:      # for unattached tokens/subtrees
                break
            elif ancestor == head:    # normal case: k dominated by h
                break
        else:                         # head not in ancestors: the arc is non-projective
            return True
    return False

def is_nonproj_tree(heads):
    # A tree is non-projective if at least one of its arcs is non-projective.
    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))

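# Sketch for the projectivity checks (made-up example): in heads = [0, 3, 0, 0]
# the arc 3 -> 1 crosses token 2, which is attached to the root rather than
# dominated by 3, so that single arc makes the whole tree non-projective.
#
#     heads = [0, 3, 0, 0]
#     is_nonproj_arc(1, heads)           # -> True (the crossing arc 3 -> 1)
#     is_nonproj_arc(2, heads)           # -> False
#     is_nonproj_tree(heads)             # -> True
#     is_nonproj_tree([2, 2, 2, 2, 3])   # -> False
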
class PseudoProjective:
    # Implements the projectivize/deprojectivize mechanism from Nivre & Nilsson
    # 2005 for pseudo-projective parsing. The implementation uses their HEAD
    # decoration scheme. (A small end-to-end usage sketch follows at the end of
    # this file.)

    def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30):
        # Expects a sequence of pairs of head arrays and label lists.
        preprocessed = []
        for heads, labels in labeled_trees:
            proj_heads, deco_labels = self.projectivize(heads, labels)
            # set the label to ROOT for each root dependent
            deco_labels = ['ROOT' if head == i else deco_labels[i]
                           for i, head in enumerate(proj_heads)]
            preprocessed.append((proj_heads, deco_labels))

        if label_freq_cutoff > 0:
            return self._filter_labels(preprocessed, label_freq_cutoff)
        return preprocessed

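    # Sketch of what preprocessing does to a single made-up non-projective
    # tree; with the default label_freq_cutoff of 30, a decorated label that
    # occurs only once is stripped back to its plain form:
    #
    #     pp = PseudoProjective()
    #     trees = [([0, 3, 0, 0], ['ROOT', 'A', 'B', 'C'])]
    #     pp.preprocess_training_data(trees, label_freq_cutoff=0)
    #     # -> [([0, 0, 0, 0], ['ROOT', 'A||C', 'B', 'C'])]
    #     pp.preprocess_training_data(trees)
    #     # -> [([0, 0, 0, 0], ['ROOT', 'A', 'B', 'C'])]
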
    def projectivize(self, heads, labels):
        # Uses the algorithm from Nivre & Nilsson 2005. Assumes heads to be a
        # proper tree, i.e. connected and cycle-free. Returns a new
        # (heads, labels) pair which encodes a projective and decorated tree.
        proj_heads = copy(heads)
        smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
        if smallest_np_arc is None:    # this sentence is already projective
            return proj_heads, copy(labels)
        while smallest_np_arc is not None:
            self._lift(smallest_np_arc, proj_heads)
            smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads)
        deco_labels = self._decorate(heads, proj_heads, labels)
        return proj_heads, deco_labels

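    # Sketch (same made-up tree as above, pp = PseudoProjective()): the
    # non-projective arc 3 -> 1 is lifted to the grandparent 0, and the label
    # of token 1 is decorated with the label of its original head:
    #
    #     pp.projectivize([0, 3, 0, 0], ['ROOT', 'A', 'B', 'C'])
    #     # -> ([0, 0, 0, 0], ['ROOT', 'A||C', 'B', 'C'])
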
    def deprojectivize(self, heads, labels):
        # Reattaches arcs with decorated labels (following the HEAD scheme).
        # For each decorated arc X||Y, search top-down, left-to-right,
        # breadth-first until hitting a Y, then make that token the new head.
        newheads, newlabels = copy(heads), copy(labels)
        spans = None
        for tokenid, head in enumerate(heads):
            if labels[tokenid].find('||') != -1:
                newlabel, _, headlabel = labels[tokenid].partition('||')
                newhead = self._find_new_head(head, tokenid, headlabel,
                                              heads, labels, spans=spans)
                newheads[tokenid] = newhead
                newlabels[tokenid] = newlabel
        return newheads, newlabels

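    # Sketch: deprojectivize() reverses the example above by searching the
    # dependents of the lifted head (token 0) for a token labelled 'C' and
    # reattaching token 1 there:
    #
    #     pp.deprojectivize([0, 0, 0, 0], ['ROOT', 'A||C', 'B', 'C'])
    #     # -> ([0, 3, 0, 0], ['ROOT', 'A', 'B', 'C'])
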
    def _decorate(self, heads, proj_heads, labels):
        # Uses the HEAD decoration scheme from Nivre & Nilsson 2005.
        assert len(heads) == len(proj_heads) == len(labels)
        deco_labels = []
        for tokenid, head in enumerate(heads):
            if head != proj_heads[tokenid]:
                deco_labels.append('%s||%s' % (labels[tokenid], labels[head]))
            else:
                deco_labels.append(labels[tokenid])
        return deco_labels

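    # Sketch: only the token whose head changed gets a decorated label,
    # combining its own label with the label of its original head:
    #
    #     pp._decorate([0, 3, 0, 0], [0, 0, 0, 0], ['ROOT', 'A', 'B', 'C'])
    #     # -> ['ROOT', 'A||C', 'B', 'C']
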
    def _get_smallest_nonproj_arc(self, heads):
        # Returns the dependent of the smallest non-projective arc, or None,
        # where size is defined as the distance between dependent and head;
        # ties are broken left to right.
        smallest_size = float('inf')
        smallest_np_arc = None
        for tokenid, head in enumerate(heads):
            size = abs(tokenid - head)
            if size < smallest_size and is_nonproj_arc(tokenid, heads):
                smallest_size = size
                smallest_np_arc = tokenid
        return smallest_np_arc

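    # Sketch: in the made-up tree [0, 3, 0, 0] the only non-projective arc is
    # the one ending at token 1, so that dependent is returned:
    #
    #     pp._get_smallest_nonproj_arc([0, 3, 0, 0])   # -> 1
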
    def _lift(self, tokenid, heads):
        # Reattaches a word to its grandparent. If the current head is already
        # the root, the word becomes a root itself.
        head = heads[tokenid]
        ghead = heads[head]
        heads[tokenid] = ghead if head != ghead else tokenid

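    # Sketch: lifting token 1 moves it from its head 3 to its grandparent 0
    # (the operation mutates the heads list in place):
    #
    #     heads = [0, 3, 0, 0]
    #     pp._lift(1, heads)
    #     heads   # -> [0, 0, 0, 0]
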
    def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None):
        # Searches the tree top-down, breadth-first, starting from rootid, and
        # returns the id of the first descendant with the given label.
        # If there is none, returns the current head (no change).
        if not spans:
            spans = self._make_span_index(heads)
        queue = spans.get(rootid, [])
        queue.remove(tokenid)    # don't search in the subtree of the non-projective arc
        while queue:
            next_queue = []
            for idx in queue:
                if labels[idx] == headlabel:
                    return idx
                next_queue.extend(spans.get(idx, []))
            queue = next_queue
        return heads[tokenid]

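    # Sketch: searching the dependents of token 0 (skipping token 1 itself) for
    # the head label 'C' finds token 3:
    #
    #     pp._find_new_head(0, 1, 'C', [0, 0, 0, 0], ['ROOT', 'A||C', 'B', 'C'])
    #     # -> 3
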
    def _make_span_index(self, heads):
        # Stores the direct dependents of each token,
        # for searching top-down through a tree.
        spans = {}
        for tokenid, head in enumerate(heads):
            if tokenid == head:    # root
                continue
            if head not in spans:
                spans[head] = []
            spans[head].append(tokenid)
        return spans

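    # Sketch: for the made-up tree [0, 3, 0, 0] the index maps each head to its
    # direct dependents, i.e. 0 -> [2, 3] and 3 -> [1]; the root itself gets no entry.
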
    def _filter_labels(self, labeled_trees, cutoff):
        # Throws away infrequent decorated labels: they can't be learned
        # reliably anyway, and dropping them keeps the label set smaller.
        freqs = Counter([label for _, labels in labeled_trees
                         for label in labels if label.find('||') != -1])
        filtered = []
        for proj_heads, deco_labels in labeled_trees:
            filtered_labels = [label.partition('||')[0]
                               if freqs.get(label, cutoff) < cutoff else label
                               for label in deco_labels]
            filtered.append((proj_heads, filtered_labels))
        return filtered

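    # Sketch: with a cutoff of 2, a decorated label seen only once is stripped
    # back to its plain form, while undecorated labels are always kept:
    #
    #     pp._filter_labels([([0, 0, 0, 0], ['ROOT', 'A||C', 'B', 'C'])], 2)
    #     # -> [([0, 0, 0, 0], ['ROOT', 'A', 'B', 'C'])]
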
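
# A minimal end-to-end sketch of the projectivize/deprojectivize roundtrip.
# The heads and labels below are made-up example data, not part of spaCy's
# training pipeline.
if __name__ == '__main__':
    pp = PseudoProjective()
    heads = [0, 3, 0, 0]                # token 1 attaches non-projectively to token 3
    labels = ['ROOT', 'A', 'B', 'C']
    assert is_nonproj_tree(heads)
    proj_heads, deco_labels = pp.projectivize(heads, labels)
    assert proj_heads == [0, 0, 0, 0]
    assert deco_labels == ['ROOT', 'A||C', 'B', 'C']
    # deprojectivizing the decorated projective tree restores the original arcs
    new_heads, new_labels = pp.deprojectivize(proj_heads, deco_labels)
    assert new_heads == heads and new_labels == labels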