spaCy/spacy/syntax/nonproj.pyx

# coding: utf-8
# cython: profile=True
# cython: infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from __future__ import unicode_literals

from copy import copy

from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors


# Separator used in decorated dependency labels: a decorated label is built as
# "<own label>" + DELIMITER + "<label of the original head>" (see _decorate).
DELIMITER = '||'


def ancestors(tokenid, heads):
    # Generator over the head chain of `tokenid`: its head, the head of its
    # head, and so on (the token itself is never yielded).
    # The walk ends when a self-attached token (a root) is reached, right
    # after yielding None for an unattached token, or once len(heads)
    # ancestors have been produced. The last limit keeps cyclic input from
    # looping forever, since no acyclic path can be longer than the sentence.
    node = tokenid
    steps = 0
    while True:
        parent = heads[node]
        if parent == node or steps >= len(heads):
            return
        node = parent
        steps += 1
        yield node
        if node is None:
            return


def contains_cycle(heads):
    # Walk upwards from every token. In a cycle-free tree no token can be
    # met twice on such a walk, so a repeated token means there is a cycle.
    # Returns the set of tokens visited on the offending walk (including the
    # start token), or None when no walk revisits a token.
    for start in range(len(heads)):
        visited = set()
        visited.add(start)
        for node in ancestors(start, heads):
            if node not in visited:
                visited.add(node)
                continue
            return visited
    return None


def is_nonproj_arc(tokenid, heads):
    # Projectivity test for the arc between `tokenid` (d) and its head (h),
    # following e.g. Havelka 2007: the arc is non-projective if some token k
    # strictly between h and d (in either order) does not have h among its
    # ancestors.
    governor = heads[tokenid]
    # Root arcs (self-attachment) and unattached tokens (None) are never
    # non-projective.
    if governor is None or governor == tokenid:
        return False

    left = min(governor, tokenid)
    right = max(governor, tokenid)
    for between in range(left + 1, right):
        covered = False
        for anc in ancestors(between, heads):
            # Either the in-between token sits in an unattached subtree
            # (None), or it is dominated by the governor (the normal case).
            if anc is None or anc == governor:
                covered = True
                break
        if not covered:
            # The governor is not an ancestor of `between`, so the arc
            # h -> d is non-projective.
            return True
    return False


def is_nonproj_tree(heads):
    # One non-projective arc is enough to make the whole tree
    # non-projective, so stop at the first one found.
    for word in range(len(heads)):
        if is_nonproj_arc(word, heads):
            return True
    return False


def decompose(label):
    # Split a label at the first DELIMITER and return the pair
    # (text before it, text after it). Without a DELIMITER the second item
    # is an empty string.
    before, _sep, after = label.partition(DELIMITER)
    return (before, after)


def is_decorated(label):
    # A label counts as decorated when it contains the DELIMITER marker.
    if DELIMITER in label:
        return True
    return False

def count_decorated_labels(gold_tuples):
    # Projectivize every sentence in `gold_tuples` and tally how often each
    # decorated label occurs. Returns a dict mapping decorated label -> count.
    counts = {}
    for _raw_text, sentences in gold_tuples:
        for annotation, _ctnts in sentences:
            _ids, _words, _tags, heads, labels, _iob = annotation
            proj_heads, deco_labels = projectivize(heads, labels)
            # Tokens that are their own head after projectivization are
            # labelled ROOT.
            relabelled = []
            for i, head in enumerate(proj_heads):
                relabelled.append('ROOT' if head == i else deco_labels[i])
            # Only decorated labels are counted.
            for label in relabelled:
                if not is_decorated(label):
                    continue
                counts[label] = counts.get(label, 0) + 1
    return counts


def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
    # Projectivize all sentences in `gold_tuples` and return the data in the
    # same nested layout, with heads replaced by the projective heads and
    # labels replaced by the (possibly decorated) labels.
    # When `label_freq_cutoff` is positive, decorated labels are counted and
    # the result is passed through _filter_labels with that cutoff.
    use_cutoff = label_freq_cutoff > 0
    counts = {}
    result = []
    for raw_text, sents in gold_tuples:
        new_sents = []
        for annotation, ctnts in sents:
            ids, words, tags, heads, labels, iob = annotation
            proj_heads, deco_labels = projectivize(heads, labels)
            # Tokens that are their own head after projectivization are
            # labelled ROOT.
            deco_labels = ['ROOT' if head == i else deco_labels[i]
                           for i, head in enumerate(proj_heads)]
            if use_cutoff:
                # Frequencies are only needed when filtering will happen.
                for label in deco_labels:
                    if is_decorated(label):
                        counts[label] = counts.get(label, 0) + 1
            new_annotation = (ids, words, tags, proj_heads, deco_labels, iob)
            new_sents.append((new_annotation, ctnts))
        result.append((raw_text, new_sents))
    if not use_cutoff:
        return result
    return _filter_labels(result, label_freq_cutoff, counts)


def projectivize(heads, labels):
    # Projectivization following Nivre & Nilsson 2005. `heads` is expected to
    # be a proper tree (connected, no cycles). The inputs are left untouched;
    # the result is a new pair (heads, labels) encoding a projective tree
    # whose lifted arcs carry decorated labels.
    lifted_heads = copy(heads)
    lifted_any = False
    # Repeatedly lift the smallest non-projective arc until none remains.
    while True:
        arc = _get_smallest_nonproj_arc(lifted_heads)
        if arc is None:
            break
        _lift(arc, lifted_heads)
        lifted_any = True
    if not lifted_any:
        # The sentence was projective to begin with: labels are only copied.
        return lifted_heads, copy(labels)
    return lifted_heads, _decorate(heads, lifted_heads, labels)


cpdef deprojectivize(Doc doc):
    # Reattach arcs with decorated labels (following HEAD scheme). For each
    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
    # hitting a Y then make this the new head.
    # The Doc is modified in place and also returned.
    for i in range(doc.length):
        # Resolve the stored dependency id to its string form.
        label = doc.vocab.strings[doc.c[i].dep]
        if DELIMITER in label:
            # NOTE(review): the two-way unpacking requires exactly one
            # DELIMITER in the label; more than one raises ValueError.
            new_label, head_label = label.split(DELIMITER)
            # Falls back to the current head when no token with `head_label`
            # is found (see _find_new_head).
            new_head = _find_new_head(doc[i], head_label)
            # The head is written as an offset relative to token i.
            doc.c[i].head = new_head.i - i
            # Store the undecorated label X in place of X||Y.
            doc.c[i].dep = doc.vocab.strings.add(new_label)
    # Presumably refreshes the per-token child bookkeeping after the heads
    # changed -- TODO confirm against tokens/doc.pyx.
    set_children_from_heads(doc.c, doc.length)
    return doc


def _decorate(heads, proj_heads, labels):
    # uses decoration scheme HEAD from Nivre & Nilsson 2005
    if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
        raise ValueError(Errors.E082.format(n_heads=len(heads),
                                            n_proj_heads=len(proj_heads),
                                            n_labels=len(labels)))
    deco_labels = []
    for tokenid, head in enumerate(heads):
        if head != proj_heads[tokenid]:
            deco_labels.append(
                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
        else:
            deco_labels.append(labels[tokenid])
    return deco_labels


def _get_smallest_nonproj_arc(heads):
    # return the smallest non-proj arc or None
    # where size is defined as the distance between dep and head
    # and ties are broken left to right
    smallest_size = float('inf')
    smallest_np_arc = None
    for tokenid, head in enumerate(heads):
        size = abs(tokenid-head)
        if size < smallest_size and is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc


def _lift(tokenid, heads):
    # reattaches a word to it's grandfather
    head = heads[tokenid]
    ghead = heads[head]
    # attach to ghead if head isn't attached to root else attach to root
    heads[tokenid] = ghead if head != ghead else tokenid


def _find_new_head(token, headlabel):
    # search through the tree starting from the head of the given token
    # returns the id of the first descendant with the given label
    # if there is none, return the current head (no change)
    queue = [token.head]
    while queue:
        next_queue = []
        for qtoken in queue:
            for child in qtoken.children:
                if child.is_space:
                    continue
                if child == token:
                    continue
                if child.dep_ == headlabel:
                    return child
                next_queue.append(child)
        queue = next_queue
    return token.head


def _filter_labels(gold_tuples, cutoff, freqs):
    # throw away infrequent decorated labels
    # can't learn them reliably anyway and keeps label set smaller
    filtered = []
    for raw_text, sents in gold_tuples:
        filtered_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            filtered_labels = []
            for label in labels:
                if is_decorated(label) and freqs.get(label, 0) < cutoff:
                    filtered_labels.append(decompose(label)[0])
                else:
                    filtered_labels.append(label)
            filtered_sents.append(
                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
        filtered.append((raw_text, filtered_sents))
    return filtered
Tidy up and fix formatting and imports 2017-04-15 14:05:15 +03:00			`# coding: utf-8`
Improve efficiency of deprojectivization 2017-11-17 20:55:13 +03:00			`# cython: profile=True`
			`# cython: infer_types=True`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`for doing pseudo-projective parsing implementation uses the HEAD decoration`
			`scheme.`
			`"""`
Fix unicode problem in nonproj module 2016-11-26 02:29:17 +03:00			`from __future__ import unicode_literals`
Tidy up syntax 2017-10-27 20:45:57 +03:00
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`from copy import copy`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`from ..tokens.doc cimport Doc, set_children_from_heads`
💫 New system for error messages and warnings (#2163) * Add spacy.errors module * Update deprecation and user warnings * Replace errors and asserts with new error message system * Remove redundant asserts * Fix whitespace * Add messages for print/util.prints statements * Fix typo * Fix typos * Move CLI messages to spacy.cli._messages * Add decorator to display error code with message An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc. * Remove unused link in spacy.about * Update errors for invalid pipeline components * Improve error for unknown factories * Add displaCy warnings * Update formatting consistency * Move error message to spacy.errors * Update errors and check if doc returned by component is None 2018-04-03 16:50:31 +03:00			`from ..errors import Errors`
Improve efficiency of deprojectivization 2017-11-17 20:55:13 +03:00
integrated pseudo-projective parsing into parser - nonproj.pyx holds a class PseudoProjectivity which currently holds all functionality to implement Nivre & Nilsson 2005's pseudo-projective parsing using the HEAD decoration scheme - changed lefts/rights in Token to account for possible non-projective structures 2016-03-01 12:09:08 +03:00
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`DELIMITER = '\|\|'`

integrated pseudo-projective parsing into parser - nonproj.pyx holds a class PseudoProjectivity which currently holds all functionality to implement Nivre & Nilsson 2005's pseudo-projective parsing using the HEAD decoration scheme - changed lefts/rights in Token to account for possible non-projective structures 2016-03-01 12:09:08 +03:00
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`def ancestors(tokenid, heads):`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`# Returns all words going from the word up the path to the root. The path`
			`# to root cannot be longer than the number of words in the sentence. This`
			`# function ends after at most len(heads) steps, because it would otherwise`
			`# loop indefinitely on cycles.`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`head = tokenid`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`cnt = 0`
			`while heads[head] != head and cnt < len(heads):`
			`head = heads[head]`
			`cnt += 1`
			`yield head`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`if head is None:`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`break`


			`def contains_cycle(heads):`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`# in an acyclic tree, the path from each word following the head relation`
			`# upwards always ends at the root node`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`for tokenid in range(len(heads)):`
			`seen = set([tokenid])`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`for ancestor in ancestors(tokenid, heads):`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`if ancestor in seen:`
			`return seen`
			`seen.add(ancestor)`
			`return None`


add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`def is_nonproj_arc(tokenid, heads):`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`# if there is a token k, h < k < d such that h is not`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`# an ancestor of k. Same for h -> d, h > d`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`head = heads[tokenid]`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`if head == tokenid: # root arcs cannot be non-projective`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`return False`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`elif head is None: # unattached tokens cannot be non-projective`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`return False`

add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`for k in range(start, end):`
			`for ancestor in ancestors(k, heads):`
			`if ancestor is None: # for unattached tokens/subtrees`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`break`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`elif ancestor == head: # normal case: k dominated by h`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`break`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`else: # head not in ancestors: d -> h is non-projective`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`return True`
			`return False`


add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`def is_nonproj_tree(heads):`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00			`# a tree is non-projective if at least one arc is non-projective`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`return any(is_nonproj_arc(word, heads) for word in range(len(heads)))`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00

Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`def decompose(label):`
			`return label.partition(DELIMITER)[::2]`


			`def is_decorated(label):`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`return DELIMITER in label`

			`def count_decorated_labels(gold_tuples):`
			`freqs = {}`
			`for raw_text, sents in gold_tuples:`
			`for (ids, words, tags, heads, labels, iob), ctnts in sents:`
			`proj_heads, deco_labels = projectivize(heads, labels)`
			`# set the label to ROOT for each root dependent`
			`deco_labels = ['ROOT' if head == i else deco_labels[i]`
			`for i, head in enumerate(proj_heads)]`
			`# count label frequencies`
			`for label in deco_labels:`
			`if is_decorated(label):`
			`freqs[label] = freqs.get(label, 0) + 1`
			`return freqs`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00

			`def preprocess_training_data(gold_tuples, label_freq_cutoff=30):`
			`preprocessed = []`
			`freqs = {}`
			`for raw_text, sents in gold_tuples:`
			`prepro_sents = []`
			`for (ids, words, tags, heads, labels, iob), ctnts in sents:`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`proj_heads, deco_labels = projectivize(heads, labels)`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`# set the label to ROOT for each root dependent`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`deco_labels = ['ROOT' if head == i else deco_labels[i]`
			`for i, head in enumerate(proj_heads)]`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`# count label frequencies`
			`if label_freq_cutoff > 0:`
			`for label in deco_labels:`
			`if is_decorated(label):`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`freqs[label] = freqs.get(label, 0) + 1`
			`prepro_sents.append(`
			`((ids, words, tags, proj_heads, deco_labels, iob), ctnts))`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`preprocessed.append((raw_text, prepro_sents))`
			`if label_freq_cutoff > 0:`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`return _filter_labels(preprocessed, label_freq_cutoff, freqs)`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`return preprocessed`


			`def projectivize(heads, labels):`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper`
			`# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)`
			`# which encode a projective and decorated tree.`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`proj_heads = copy(heads)`
			`smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`if smallest_np_arc is None: # this sentence is already projective`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`return proj_heads, copy(labels)`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`while smallest_np_arc is not None:`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`_lift(smallest_np_arc, proj_heads)`
			`smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)`
			`deco_labels = _decorate(heads, proj_heads, labels)`
			`return proj_heads, deco_labels`


Improve efficiency of deprojectivization 2017-11-17 20:55:13 +03:00			`cpdef deprojectivize(Doc doc):`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`# Reattach arcs with decorated labels (following HEAD scheme). For each`
			`# decorated arc X\|\|Y, search top-down, left-to-right, breadth-first until`
			`# hitting a Y then make this the new head.`
Improve efficiency of deprojectivization 2017-11-17 20:55:13 +03:00			`for i in range(doc.length):`
			`label = doc.vocab.strings[doc.c[i].dep]`
			`if DELIMITER in label:`
			`new_label, head_label = label.split(DELIMITER)`
			`new_head = _find_new_head(doc[i], head_label)`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`doc.c[i].head = new_head.i - i`
Fix more efficient nonproj 2017-11-23 15:48:00 +03:00			`doc.c[i].dep = doc.vocab.strings.add(new_label)`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`set_children_from_heads(doc.c, doc.length)`
Improve efficiency of deprojectivization 2017-11-17 20:55:13 +03:00			`return doc`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00
Tidy up syntax 2017-10-27 20:45:57 +03:00
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`def _decorate(heads, proj_heads, labels):`
			`# uses decoration scheme HEAD from Nivre & Nilsson 2005`
💫 New system for error messages and warnings (#2163) * Add spacy.errors module * Update deprecation and user warnings * Replace errors and asserts with new error message system * Remove redundant asserts * Fix whitespace * Add messages for print/util.prints statements * Fix typo * Fix typos * Move CLI messages to spacy.cli._messages * Add decorator to display error code with message An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc. * Remove unused link in spacy.about * Update errors for invalid pipeline components * Improve error for unknown factories * Add displaCy warnings * Update formatting consistency * Move error message to spacy.errors * Update errors and check if doc returned by component is None 2018-04-03 16:50:31 +03:00			`if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):`
			`raise ValueError(Errors.E082.format(n_heads=len(heads),`
			`n_proj_heads=len(proj_heads),`
			`n_labels=len(labels)))`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`deco_labels = []`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`for tokenid, head in enumerate(heads):`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`if head != proj_heads[tokenid]:`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`deco_labels.append(`
			`'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`else:`
			`deco_labels.append(labels[tokenid])`
			`return deco_labels`


			`def _get_smallest_nonproj_arc(heads):`
			`# return the smallest non-proj arc or None`
			`# where size is defined as the distance between dep and head`
			`# and ties are broken left to right`
			`smallest_size = float('inf')`
			`smallest_np_arc = None`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`for tokenid, head in enumerate(heads):`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`size = abs(tokenid-head)`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`if size < smallest_size and is_nonproj_arc(tokenid, heads):`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`smallest_size = size`
			`smallest_np_arc = tokenid`
			`return smallest_np_arc`


			`def _lift(tokenid, heads):`
			`# reattaches a word to it's grandfather`
			`head = heads[tokenid]`
			`ghead = heads[head]`
			`# attach to ghead if head isn't attached to root else attach to root`
			`heads[tokenid] = ghead if head != ghead else tokenid`


			`def _find_new_head(token, headlabel):`
			`# search through the tree starting from the head of the given token`
			`# returns the id of the first descendant with the given label`
			`# if there is none, return the current head (no change)`
			`queue = [token.head]`
			`while queue:`
			`next_queue = []`
			`for qtoken in queue:`
			`for child in qtoken.children:`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`if child.is_space:`
			`continue`
			`if child == token:`
			`continue`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`if child.dep_ == headlabel:`
			`return child`
			`next_queue.append(child)`
			`queue = next_queue`
			`return token.head`


			`def _filter_labels(gold_tuples, cutoff, freqs):`
			`# throw away infrequent decorated labels`
			`# can't learn them reliably anyway and keeps label set smaller`
			`filtered = []`
			`for raw_text, sents in gold_tuples:`
			`filtered_sents = []`
			`for (ids, words, tags, heads, labels, iob), ctnts in sents:`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`filtered_labels = []`
			`for label in labels:`
			`if is_decorated(label) and freqs.get(label, 0) < cutoff:`
			`filtered_labels.append(decompose(label)[0])`
			`else:`
			`filtered_labels.append(label)`
Tidy up syntax 2017-10-27 20:45:57 +03:00			`filtered_sents.append(`
			`((ids, words, tags, heads, filtered_labels, iob), ctnts))`
Make nonproj methods top-level functions, instead of class methods 2017-05-22 12:48:02 +03:00			`filtered.append((raw_text, filtered_sents))`
			`return filtered`