spaCy/spacy/pipeline/_parser_internals/nonproj.pyx
kadarakos 1bb87f35bc
Detect cycle during projectivize (#10877)
* detect cycle during projectivize

* not complete test to detect cycle in projectivize

* boolean to int type to propagate error

* use unordered_set instead of set

* moved error message to errors

* removed cycle from test case

* use find instead of count

* cycle check: only perform one lookup

* Return bool again from _has_head_as_ancestor

Communicate presence of cycles through an output argument.

* Switch to returning std::pair to encode presence of a cycle

The has_cycle pointer is too easy to misuse. Ideally, we would have a
sum type like Rust's `Result` here, but C++ is not there yet.

* _is_non_proj_arc: clarify what we are returning

* _has_head_as_ancestor: remove count

We are now explicitly checking for cycles, so the algorithm must always
terminate. Either we encounter the head, we find a root, or a cycle.

* _is_nonproj_arc: simplify condition

* Another refactor using C++ exceptions

* Remove unused error code

* Print graph with cycle on exception

* Include .hh files in source package

* Add FIXME comment

* cycle detection test

* find cycle when starting from problematic vertex

Co-authored-by: Daniël de Kok <me@danieldk.eu>
2022-06-08 19:34:11 +02:00

254 lines
8.4 KiB
Cython

# cython: profile=True, infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from copy import copy
from cython.operator cimport preincrement as incr, dereference as deref
from libc.limits cimport INT_MAX
from libc.stdlib cimport abs
from libcpp cimport bool
from libcpp.string cimport string, to_string
from libcpp.vector cimport vector
from libcpp.unordered_set cimport unordered_set
from ...tokens.doc cimport Doc, set_children_from_heads
from ...errors import Errors
DELIMITER = '||'
def ancestors(tokenid, heads):
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head is None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
seen.add(ancestor)
return None
def is_nonproj_arc(tokenid, heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _is_nonproj_arc(tokenid, c_heads)
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
return False
elif head < 0: # unattached tokens cannot be non-projective
return False
cdef int start, end
if head < tokenid:
start, end = (head+1, tokenid)
else:
start, end = (tokenid+1, head)
for k in range(start, end):
if not _has_head_as_ancestor(k, head, heads):
return True
return False
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
ancestor = tokenid
cdef unordered_set[int] seen_tokens
seen_tokens.insert(ancestor)
while True:
# Reached the head or a disconnected node
if heads[ancestor] == head or heads[ancestor] < 0:
return True
# Reached the root
if heads[ancestor] == ancestor:
return False
ancestor = heads[ancestor]
result = seen_tokens.insert(ancestor)
# Found cycle
if not result.second:
raise_domain_error(heads_to_string(heads))
return False
cdef string heads_to_string(const vector[int]& heads) nogil:
cdef vector[int].const_iterator citer
cdef string cycle_str
cycle_str.append("Found cycle in dependency graph: [")
# FIXME: Rewrite using ostringstream when available in Cython.
citer = heads.const_begin()
while citer != heads.const_end():
if citer != heads.const_begin():
cycle_str.append(", ")
cycle_str.append(to_string(deref(citer)))
incr(citer)
cycle_str.append("]")
return cycle_str
def is_nonproj_tree(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
# a tree is non-projective if at least one arc is non-projective
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
def decompose(label):
return label.partition(DELIMITER)[::2]
def is_decorated(label):
return DELIMITER in label
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
example.get_aligned("DEP"))
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
def projectivize(heads, labels):
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
cdef int new_head
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
if smallest_np_arc == -1: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != -1:
new_head = _lift(smallest_np_arc, proj_heads)
c_proj_heads[smallest_np_arc] = new_head
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
cdef vector[int] _heads_to_c(heads):
cdef vector[int] c_heads;
for head in heads:
if head == None:
c_heads.push_back(-1)
else:
assert head < len(heads)
c_heads.push_back(head)
return c_heads
cpdef deprojectivize(Doc doc):
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for i in range(doc.length):
label = doc.vocab.strings[doc.c[i].dep]
if DELIMITER in label:
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, 0, doc.length)
return doc
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
raise ValueError(Errors.E082.format(n_heads=len(heads),
n_proj_heads=len(proj_heads),
n_labels=len(labels)))
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
else:
deco_labels.append(labels[tokenid])
return deco_labels
def get_smallest_nonproj_arc_slow(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _get_smallest_nonproj_arc(c_heads)
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
cdef int smallest_size = INT_MAX
# -1 means its already projective.
cdef int smallest_np_arc = -1
cdef int size
cdef int tokenid
cdef int head
for tokenid in range(heads.size()):
head = heads[tokenid]
size = abs(tokenid-head)
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
cpdef int _lift(tokenid, heads):
# reattaches a word to it's grandfather
head = heads[tokenid]
ghead = heads[head]
cdef int new_head = ghead if head != ghead else tokenid
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = new_head
return new_head
def _find_new_head(token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space:
continue
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head