Make core projectivization functions cdef nogil (#10241)

* Make core projectivization methods cdef nogil

While profiling the parser, I noticed that a relatively large share of the
time is spent in projectivization. This change rewrites the functions in the
core loops as cdef nogil functions for efficiency.

On the C++ side, heads are stored in a vector[int] instead of a Python list,
and absent heads are represented as -1 instead of None.

* _heads_to_c: add assertion

Validation should be performed by the caller, but this assertion ensures that
we are not reading/writing out of bounds with incorrect input.
This commit is contained in:
Daniël de Kok 2022-02-21 15:02:21 +01:00 committed by GitHub
parent 30030176ee
commit 78a8bec4d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 21 deletions

View File

@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme. scheme.
""" """
from copy import copy from copy import copy
from libc.limits cimport INT_MAX
from libc.stdlib cimport abs
from libcpp cimport bool
from libcpp.vector cimport vector
from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.doc cimport Doc, set_children_from_heads
@ -41,13 +45,18 @@ def contains_cycle(heads):
def is_nonproj_arc(tokenid, heads): def is_nonproj_arc(tokenid, heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _is_nonproj_arc(tokenid, c_heads)
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not # if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d # an ancestor of k. Same for h -> d, h > d
head = heads[tokenid] head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective if head == tokenid: # root arcs cannot be non-projective
return False return False
elif head is None: # unattached tokens cannot be non-projective elif head < 0: # unattached tokens cannot be non-projective
return False return False
cdef int start, end cdef int start, end
@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
else: else:
start, end = (tokenid+1, head) start, end = (tokenid+1, head)
for k in range(start, end): for k in range(start, end):
for ancestor in ancestors(k, heads): if _has_head_as_ancestor(k, head, heads):
if ancestor is None: # for unattached tokens/subtrees continue
break
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective else: # head not in ancestors: d -> h is non-projective
return True return True
return False return False
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
    # Walk up the head chain starting at `tokenid` and report whether the
    # walk reaches `head`.
    #
    # tokenid: index of the token whose ancestors are inspected.
    # head:    index of the candidate ancestor.
    # heads:   head index per token; a negative entry marks an unattached
    #          token (the C-side replacement for None).
    #
    # Returns True when `head` is found among the ancestors, and also when the
    # chain ends in an unattached token: unattached tokens/subtrees are never
    # reported as evidence of a non-projective arc. Returns False otherwise.
    #
    # All locals are typed explicitly so that the function compiles under
    # `nogil` without depending on the `infer_types` directive, and so that
    # the step counter has the same (unsigned) type as `heads.size()`.
    cdef int ancestor = tokenid
    cdef int parent
    cdef size_t steps = 0
    cdef size_t max_steps = heads.size()

    # The step limit guarantees termination even if `heads` contains a cycle.
    while steps < max_steps:
        parent = heads[ancestor]
        if parent == head or parent < 0:
            return True
        if parent == ancestor:
            # Self-attached token (a root) that is not `head`: every further
            # step would revisit the same token, so the answer is settled.
            return False
        ancestor = parent
        steps += 1

    return False
def is_nonproj_tree(heads): def is_nonproj_tree(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
# a tree is non-projective if at least one arc is non-projective # a tree is non-projective if at least one arc is non-projective
return any(is_nonproj_arc(word, heads) for word in range(len(heads))) return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
def decompose(label): def decompose(label):
@ -98,16 +117,31 @@ def projectivize(heads, labels):
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree. # which encode a projective and decorated tree.
proj_heads = copy(heads) proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc is None: # this sentence is already projective cdef int new_head
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
if smallest_np_arc == -1: # this sentence is already projective
return proj_heads, copy(labels) return proj_heads, copy(labels)
while smallest_np_arc is not None: while smallest_np_arc != -1:
_lift(smallest_np_arc, proj_heads) new_head = _lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) c_proj_heads[smallest_np_arc] = new_head
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
deco_labels = _decorate(heads, proj_heads, labels) deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels return proj_heads, deco_labels
cdef vector[int] _heads_to_c(heads):
    # Convert a Python sequence of heads into a C++ vector[int].
    #
    # heads: sized sequence where each entry is either an integer head index
    #        or None for a token without a head.
    #
    # Returns a vector of the same length in which None is encoded as -1.
    #
    # Full validation is the caller's job; the assertion only makes sure that
    # no head points past the end of the sequence, so that the nogil code that
    # indexes the vector with these values cannot read out of bounds.
    cdef vector[int] c_heads
    # The length is loop-invariant: compute it once and pre-size the vector.
    cdef Py_ssize_t n_heads = len(heads)
    c_heads.reserve(n_heads)

    for head in heads:
        if head is None:
            c_heads.push_back(-1)
        else:
            assert head < n_heads
            c_heads.push_back(head)

    return c_heads
cpdef deprojectivize(Doc doc): cpdef deprojectivize(Doc doc):
# Reattach arcs with decorated labels (following HEAD scheme). For each # Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until # decorated arc X||Y, search top-down, left-to-right, breadth-first until
@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
deco_labels.append(labels[tokenid]) deco_labels.append(labels[tokenid])
return deco_labels return deco_labels
def get_smallest_nonproj_arc_slow(heads):
    # Python-callable wrapper around the cdef function
    # _get_smallest_nonproj_arc, which takes a C++ vector and therefore cannot
    # be called with a Python list directly.
    #
    # heads: Python sequence of head indices, with None for absent heads.
    #
    # Returns the value produced by _get_smallest_nonproj_arc for the
    # converted heads.
    # NOTE(review): presumably named "slow" because `heads` is converted to a
    # vector on every call -- confirm.
    cdef vector[int] heads_vec = _heads_to_c(heads)
    cdef int smallest_arc = _get_smallest_nonproj_arc(heads_vec)
    return smallest_arc
def _get_smallest_nonproj_arc(heads):
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
# return the smallest non-proj arc or None # return the smallest non-proj arc or None
# where size is defined as the distance between dep and head # where size is defined as the distance between dep and head
# and ties are broken left to right # and ties are broken left to right
smallest_size = float('inf') cdef int smallest_size = INT_MAX
smallest_np_arc = None cdef int smallest_np_arc = -1
for tokenid, head in enumerate(heads): cdef int size
cdef int tokenid
cdef int head
for tokenid in range(heads.size()):
head = heads[tokenid]
size = abs(tokenid-head) size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid, heads): if size < smallest_size and _is_nonproj_arc(tokenid, heads):
smallest_size = size smallest_size = size
smallest_np_arc = tokenid smallest_np_arc = tokenid
return smallest_np_arc return smallest_np_arc
def _lift(tokenid, heads): cpdef int _lift(tokenid, heads):
# reattaches a word to it's grandfather # reattaches a word to it's grandfather
head = heads[tokenid] head = heads[tokenid]
ghead = heads[head] ghead = heads[head]
cdef int new_head = ghead if head != ghead else tokenid
# attach to ghead if head isn't attached to root else attach to root # attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid heads[tokenid] = new_head
return new_head
def _find_new_head(token, headlabel): def _find_new_head(token, headlabel):

View File

@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab):
assert nonproj.is_decorated("X") is False assert nonproj.is_decorated("X") is False
nonproj._lift(0, tree) nonproj._lift(0, tree)
assert tree == [2, 2, 2] assert tree == [2, 2, 2]
assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
# fmt: off # fmt: off
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]