# coding: utf-8 # cython: profile=True # cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from __future__ import unicode_literals from copy import copy from spacy.gold import Example from ..tokens.doc cimport Doc, set_children_from_heads from ..errors import Errors DELIMITER = '||' def ancestors(tokenid, heads): # Returns all words going from the word up the path to the root. The path # to root cannot be longer than the number of words in the sentence. This # function ends after at most len(heads) steps, because it would otherwise # loop indefinitely on cycles. head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] cnt += 1 yield head if head is None: break def contains_cycle(heads): # in an acyclic tree, the path from each word following the head relation # upwards always ends at the root node for tokenid in range(len(heads)): seen = set([tokenid]) for ancestor in ancestors(tokenid, heads): if ancestor in seen: return seen seen.add(ancestor) return None def is_nonproj_arc(tokenid, heads): # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False elif head is None: # unattached tokens cannot be non-projective return False start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) for k in range(start, end): for ancestor in ancestors(k, heads): if ancestor is None: # for unattached tokens/subtrees break elif ancestor == head: # normal case: k dominated by h break else: # head not in ancestors: d -> h is non-projective return True return False def is_nonproj_tree(heads): # a tree is non-projective if at least one arc is non-projective return any(is_nonproj_arc(word, heads) for word in range(len(heads))) def decompose(label): return label.partition(DELIMITER)[::2] def is_decorated(label): return DELIMITER in label def count_decorated_labels(gold_data): freqs = {} for example in gold_data: for token_annotation in example.token_annotations: proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) # set the label to ROOT for each root dependent deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] # count label frequencies for label in deco_deps: if is_decorated(label): freqs[label] = freqs.get(label, 0) + 1 return freqs def preprocess_training_data(gold_data, label_freq_cutoff=30): preprocessed = [] freqs = {} for example in gold_data: new_example = Example(doc=example.doc) for token_annotation in example.token_annotations: proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) # set the label to ROOT for each root dependent deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] # count label frequencies if label_freq_cutoff > 0: for label in deco_deps: if is_decorated(label): freqs[label] = freqs.get(label, 0) + 1 # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ? proj_token_dict = token_annotation.to_dict() proj_token_dict["heads"] = proj_heads proj_token_dict["deps"] = deco_deps new_example.add_token_annotation(**proj_token_dict) preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) return preprocessed def projectivize(heads, labels): # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. proj_heads = copy(heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) if smallest_np_arc is None: # this sentence is already projective return proj_heads, copy(labels) while smallest_np_arc is not None: _lift(smallest_np_arc, proj_heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until # hitting a Y then make this the new head. for i in range(doc.length): label = doc.vocab.strings[doc.c[i].dep] if DELIMITER in label: new_label, head_label = label.split(DELIMITER) new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i doc.c[i].dep = doc.vocab.strings.add(new_label) set_children_from_heads(doc.c, doc.length) return doc def _decorate(heads, proj_heads, labels): # uses decoration scheme HEAD from Nivre & Nilsson 2005 if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)): raise ValueError(Errors.E082.format(n_heads=len(heads), n_proj_heads=len(proj_heads), n_labels=len(labels))) deco_labels = [] for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: deco_labels.append( '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) else: deco_labels.append(labels[tokenid]) return deco_labels def _get_smallest_nonproj_arc(heads): # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right smallest_size = float('inf') smallest_np_arc = None for tokenid, head in enumerate(heads): size = abs(tokenid-head) if size < smallest_size and is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc def _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] # attach to ghead if head isn't attached to root else attach to root heads[tokenid] = ghead if head != ghead else tokenid def _find_new_head(token, headlabel): # search through the tree starting from the head of the given token # returns the id of the first descendant with the given label # if there is none, return the current head (no change) queue = [token.head] while queue: next_queue = [] for qtoken in queue: for child in qtoken.children: if child.is_space: continue if child == token: continue if child.dep_ == headlabel: return child next_queue.append(child) queue = next_queue return token.head def _filter_labels(examples, cutoff, freqs): # throw away infrequent decorated labels # can't learn them reliably anyway and keeps label set smaller filtered = [] for example in examples: new_example = Example(doc=example.doc) for token_annotation in example.token_annotations: filtered_labels = [] for label in token_annotation.deps: if is_decorated(label) and freqs.get(label, 0) < cutoff: filtered_labels.append(decompose(label)[0]) else: filtered_labels.append(label) filtered_token_dict = token_annotation.to_dict() filtered_token_dict["deps"] = filtered_labels new_example.add_token_annotation(**filtered_token_dict) filtered.append(new_example) return filtered