mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 18:07:26 +03:00 
			
		
		
		
	* detect cycle during projectivize * not complete test to detect cycle in projectivize * boolean to int type to propagate error * use unordered_set instead of set * moved error message to errors * removed cycle from test case * use find instead of count * cycle check: only perform one lookup * Return bool again from _has_head_as_ancestor Communicate presence of cycles through an output argument. * Switch to returning std::pair to encode presence of a cycle The has_cycle pointer is too easy to misuse. Ideally, we would have a sum type like Rust's `Result` here, but C++ is not there yet. * _is_non_proj_arc: clarify what we are returning * _has_head_as_ancestor: remove count We are now explicitly checking for cycles, so the algorithm must always terminate. Either we encounter the head, we find a root, or a cycle. * _is_nonproj_arc: simplify condition * Another refactor using C++ exceptions * Remove unused error code * Print graph with cycle on exception * Include .hh files in source package * Add FIXME comment * cycle detection test * find cycle when starting from problematic vertex Co-authored-by: Daniël de Kok <me@danieldk.eu>
		
			
				
	
	
		
			254 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			254 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
# cython: profile=True, infer_types=True
 | 
						|
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
 | 
						|
for doing pseudo-projective parsing implementation uses the HEAD decoration
 | 
						|
scheme.
 | 
						|
"""
 | 
						|
from copy import copy
 | 
						|
from cython.operator cimport preincrement as incr, dereference as deref
 | 
						|
from libc.limits cimport INT_MAX
 | 
						|
from libc.stdlib cimport abs
 | 
						|
from libcpp cimport bool
 | 
						|
from libcpp.string cimport string, to_string
 | 
						|
from libcpp.vector cimport vector
 | 
						|
from libcpp.unordered_set cimport unordered_set
 | 
						|
 | 
						|
from ...tokens.doc cimport Doc, set_children_from_heads
 | 
						|
 | 
						|
from ...errors import Errors
 | 
						|
 | 
						|
 | 
						|
DELIMITER = '||'
 | 
						|
 | 
						|
 | 
						|
def ancestors(tokenid, heads):
 | 
						|
    # Returns all words going from the word up the path to the root. The path
 | 
						|
    # to root cannot be longer than the number of words in the  sentence. This
 | 
						|
    # function ends after at most len(heads) steps, because it would otherwise
 | 
						|
    # loop indefinitely on cycles.
 | 
						|
    head = tokenid
 | 
						|
    cnt = 0
 | 
						|
    while heads[head] != head and cnt < len(heads):
 | 
						|
        head = heads[head]
 | 
						|
        cnt += 1
 | 
						|
        yield head
 | 
						|
        if head is None:
 | 
						|
            break
 | 
						|
 | 
						|
 | 
						|
def contains_cycle(heads):
 | 
						|
    # in an acyclic tree, the path from each word following the head relation
 | 
						|
    # upwards always ends at the root node
 | 
						|
    for tokenid in range(len(heads)):
 | 
						|
        seen = set([tokenid])
 | 
						|
        for ancestor in ancestors(tokenid, heads):
 | 
						|
            if ancestor in seen:
 | 
						|
                return seen
 | 
						|
            seen.add(ancestor)
 | 
						|
    return None
 | 
						|
 | 
						|
 | 
						|
def is_nonproj_arc(tokenid, heads):
 | 
						|
    cdef vector[int] c_heads = _heads_to_c(heads)
 | 
						|
    return _is_nonproj_arc(tokenid, c_heads)
 | 
						|
 | 
						|
 | 
						|
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
 | 
						|
    # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
 | 
						|
    # if there is a token k, h < k < d such that h is not
 | 
						|
    # an ancestor of k. Same for h -> d, h > d
 | 
						|
    head = heads[tokenid]
 | 
						|
    if head == tokenid:  # root arcs cannot be non-projective
 | 
						|
        return False
 | 
						|
    elif head < 0:  # unattached tokens cannot be non-projective
 | 
						|
        return False
 | 
						|
 | 
						|
    cdef int start, end
 | 
						|
    if head < tokenid:
 | 
						|
        start, end = (head+1, tokenid)
 | 
						|
    else:
 | 
						|
        start, end = (tokenid+1, head)
 | 
						|
    for k in range(start, end):
 | 
						|
        if not _has_head_as_ancestor(k, head, heads):
 | 
						|
            return True
 | 
						|
    return False
 | 
						|
 | 
						|
 | 
						|
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
 | 
						|
    ancestor = tokenid
 | 
						|
    cdef unordered_set[int] seen_tokens
 | 
						|
    seen_tokens.insert(ancestor)
 | 
						|
    while True:
 | 
						|
        # Reached the head or a disconnected node
 | 
						|
        if heads[ancestor] == head or heads[ancestor] < 0:
 | 
						|
            return True
 | 
						|
        # Reached the root
 | 
						|
        if heads[ancestor] == ancestor:
 | 
						|
            return False
 | 
						|
        ancestor = heads[ancestor]
 | 
						|
        result = seen_tokens.insert(ancestor)
 | 
						|
        # Found cycle
 | 
						|
        if not result.second:
 | 
						|
            raise_domain_error(heads_to_string(heads))
 | 
						|
 | 
						|
    return False
 | 
						|
 | 
						|
 | 
						|
cdef string heads_to_string(const vector[int]& heads) nogil:
 | 
						|
    cdef vector[int].const_iterator citer
 | 
						|
    cdef string cycle_str
 | 
						|
 | 
						|
    cycle_str.append("Found cycle in dependency graph: [")
 | 
						|
 | 
						|
    # FIXME: Rewrite using ostringstream when available in Cython.
 | 
						|
    citer = heads.const_begin()
 | 
						|
    while citer != heads.const_end():
 | 
						|
        if citer != heads.const_begin():
 | 
						|
            cycle_str.append(", ")
 | 
						|
        cycle_str.append(to_string(deref(citer)))
 | 
						|
        incr(citer)
 | 
						|
    cycle_str.append("]")
 | 
						|
 | 
						|
    return cycle_str
 | 
						|
 | 
						|
 | 
						|
def is_nonproj_tree(heads):
 | 
						|
    cdef vector[int] c_heads = _heads_to_c(heads)
 | 
						|
    # a tree is non-projective if at least one arc is non-projective
 | 
						|
    return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
 | 
						|
 | 
						|
 | 
						|
def decompose(label):
 | 
						|
    return label.partition(DELIMITER)[::2]
 | 
						|
 | 
						|
 | 
						|
def is_decorated(label):
 | 
						|
    return DELIMITER in label
 | 
						|
 | 
						|
def count_decorated_labels(gold_data):
 | 
						|
    freqs = {}
 | 
						|
    for example in gold_data:
 | 
						|
        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
 | 
						|
                                             example.get_aligned("DEP"))
 | 
						|
        # set the label to ROOT for each root dependent
 | 
						|
        deco_deps = ['ROOT' if head == i else deco_deps[i]
 | 
						|
                       for i, head in enumerate(proj_heads)]
 | 
						|
        # count label frequencies
 | 
						|
        for label in deco_deps:
 | 
						|
            if is_decorated(label):
 | 
						|
                freqs[label] = freqs.get(label, 0) + 1
 | 
						|
    return freqs
 | 
						|
 | 
						|
 | 
						|
def projectivize(heads, labels):
 | 
						|
    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
 | 
						|
    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
 | 
						|
    # which encode a projective and decorated tree.
 | 
						|
    proj_heads = copy(heads)
 | 
						|
 | 
						|
    cdef int new_head
 | 
						|
    cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
 | 
						|
    cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
 | 
						|
    if smallest_np_arc == -1:  # this sentence is already projective
 | 
						|
        return proj_heads, copy(labels)
 | 
						|
    while smallest_np_arc != -1:
 | 
						|
        new_head = _lift(smallest_np_arc, proj_heads)
 | 
						|
        c_proj_heads[smallest_np_arc] = new_head
 | 
						|
        smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
 | 
						|
    deco_labels = _decorate(heads, proj_heads, labels)
 | 
						|
    return proj_heads, deco_labels
 | 
						|
 | 
						|
 | 
						|
cdef vector[int] _heads_to_c(heads):
 | 
						|
    cdef vector[int] c_heads;
 | 
						|
    for head in heads:
 | 
						|
        if head == None:
 | 
						|
            c_heads.push_back(-1)
 | 
						|
        else:
 | 
						|
            assert head < len(heads)
 | 
						|
            c_heads.push_back(head)
 | 
						|
    return c_heads
 | 
						|
 | 
						|
 | 
						|
cpdef deprojectivize(Doc doc):
 | 
						|
    # Reattach arcs with decorated labels (following HEAD scheme). For each
 | 
						|
    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
 | 
						|
    # hitting a Y then make this the new head.
 | 
						|
    for i in range(doc.length):
 | 
						|
        label = doc.vocab.strings[doc.c[i].dep]
 | 
						|
        if DELIMITER in label:
 | 
						|
            new_label, head_label = label.split(DELIMITER)
 | 
						|
            new_head = _find_new_head(doc[i], head_label)
 | 
						|
            doc.c[i].head = new_head.i - i
 | 
						|
            doc.c[i].dep = doc.vocab.strings.add(new_label)
 | 
						|
    set_children_from_heads(doc.c, 0, doc.length)
 | 
						|
    return doc
 | 
						|
 | 
						|
 | 
						|
def _decorate(heads, proj_heads, labels):
 | 
						|
    # uses decoration scheme HEAD from Nivre & Nilsson 2005
 | 
						|
    if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
 | 
						|
        raise ValueError(Errors.E082.format(n_heads=len(heads),
 | 
						|
                                            n_proj_heads=len(proj_heads),
 | 
						|
                                            n_labels=len(labels)))
 | 
						|
    deco_labels = []
 | 
						|
    for tokenid, head in enumerate(heads):
 | 
						|
        if head != proj_heads[tokenid]:
 | 
						|
            deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
 | 
						|
        else:
 | 
						|
            deco_labels.append(labels[tokenid])
 | 
						|
    return deco_labels
 | 
						|
 | 
						|
def get_smallest_nonproj_arc_slow(heads):
 | 
						|
    cdef vector[int] c_heads = _heads_to_c(heads)
 | 
						|
    return _get_smallest_nonproj_arc(c_heads)
 | 
						|
 | 
						|
 | 
						|
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
 | 
						|
    # return the smallest non-proj arc or None
 | 
						|
    # where size is defined as the distance between dep and head
 | 
						|
    # and ties are broken left to right
 | 
						|
    cdef int smallest_size = INT_MAX
 | 
						|
    # -1 means its already projective.
 | 
						|
    cdef int smallest_np_arc = -1
 | 
						|
    cdef int size
 | 
						|
    cdef int tokenid
 | 
						|
    cdef int head
 | 
						|
 | 
						|
    for tokenid in range(heads.size()):
 | 
						|
        head = heads[tokenid]
 | 
						|
        size = abs(tokenid-head)
 | 
						|
        if size < smallest_size and _is_nonproj_arc(tokenid, heads):
 | 
						|
            smallest_size = size
 | 
						|
            smallest_np_arc = tokenid
 | 
						|
    return smallest_np_arc
 | 
						|
 | 
						|
 | 
						|
cpdef int _lift(tokenid, heads):
 | 
						|
    # reattaches a word to it's grandfather
 | 
						|
    head = heads[tokenid]
 | 
						|
    ghead = heads[head]
 | 
						|
    cdef int new_head = ghead if head != ghead else tokenid
 | 
						|
    # attach to ghead if head isn't attached to root else attach to root
 | 
						|
    heads[tokenid] = new_head
 | 
						|
    return new_head
 | 
						|
 | 
						|
 | 
						|
def _find_new_head(token, headlabel):
 | 
						|
    # search through the tree starting from the head of the given token
 | 
						|
    # returns the id of the first descendant with the given label
 | 
						|
    # if there is none, return the current head (no change)
 | 
						|
    queue = [token.head]
 | 
						|
    while queue:
 | 
						|
        next_queue = []
 | 
						|
        for qtoken in queue:
 | 
						|
            for child in qtoken.children:
 | 
						|
                if child.is_space:
 | 
						|
                    continue
 | 
						|
                if child == token:
 | 
						|
                    continue
 | 
						|
                if child.dep_ == headlabel:
 | 
						|
                    return child
 | 
						|
                next_queue.append(child)
 | 
						|
        queue = next_queue
 | 
						|
    return token.head
 |