Make core projectivization functions cdef nogil (#10241)

* Make core projectivization methods cdef nogil

  While profiling the parser, I noticed that a relatively large share of time is spent in projectivization. This change rewrites the functions in the core loops as cdef nogil for efficiency. In C++-land, we use vector in place of Python lists, and absent heads are represented as -1 in place of None.

* _heads_to_c: add assertion

  Validation should be performed by the caller, but this assertion ensures that we are not reading/writing out of bounds with incorrect input.
2025-10-30 07:27:28 +03:00 · 2022-02-21 15:02:21 +01:00 · 2022-02-21 15:02:21 +01:00 · 78a8bec4d0
commit 78a8bec4d0
parent 30030176ee
2 changed files with 66 additions and 21 deletions
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from copy import copy
+from libc.limits cimport INT_MAX
+from libc.stdlib cimport abs
+from libcpp cimport bool
+from libcpp.vector cimport vector

 from ...tokens.doc cimport Doc, set_children_from_heads

@ -41,13 +45,18 @@ def contains_cycle(heads):


 def is_nonproj_arc(tokenid, heads):
+    cdef vector[int] c_heads = _heads_to_c(heads)
+    return _is_nonproj_arc(tokenid, c_heads)
+
+
+cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
    # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
    if head == tokenid:  # root arcs cannot be non-projective
        return False
-    elif head is None:  # unattached tokens cannot be non-projective
+    elif head < 0:  # unattached tokens cannot be non-projective
        return False
    
    cdef int start, end
@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
    else:
        start, end = (tokenid+1, head)
    for k in range(start, end):
-        for ancestor in ancestors(k, heads):
-            if ancestor is None:  # for unattached tokens/subtrees
-                break
-            elif ancestor == head:  # normal case: k dominated by h
-                break
+        if _has_head_as_ancestor(k, head, heads):
+            continue
        else:  # head not in ancestors: d -> h is non-projective
            return True
    return False


+cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
+    ancestor = tokenid
+    cnt = 0
+    while cnt < heads.size():
+        if heads[ancestor] == head or heads[ancestor] < 0:
+            return True
+        ancestor = heads[ancestor]
+        cnt += 1
+
+    return False
+
+
 def is_nonproj_tree(heads):
+    cdef vector[int] c_heads = _heads_to_c(heads)
    # a tree is non-projective if at least one arc is non-projective
-    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
+    return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))


 def decompose(label):
@ -98,16 +117,31 @@ def projectivize(heads, labels):
    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
    # which encode a projective and decorated tree.
    proj_heads = copy(heads)
-    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
-    if smallest_np_arc is None:  # this sentence is already projective
+
+    cdef int new_head
+    cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
+    cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
+    if smallest_np_arc == -1:  # this sentence is already projective
        return proj_heads, copy(labels)
-    while smallest_np_arc is not None:
-        _lift(smallest_np_arc, proj_heads)
-        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
+    while smallest_np_arc != -1:
+        new_head = _lift(smallest_np_arc, proj_heads)
+        c_proj_heads[smallest_np_arc] = new_head
+        smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
    return proj_heads, deco_labels


+cdef vector[int] _heads_to_c(heads):
+    cdef vector[int] c_heads;
+    for head in heads:
+        if head == None:
+            c_heads.push_back(-1)
+        else:
+            assert head < len(heads)
+            c_heads.push_back(head)
+    return c_heads
+
+
 cpdef deprojectivize(Doc doc):
    # Reattach arcs with decorated labels (following HEAD scheme). For each
    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
            deco_labels.append(labels[tokenid])
    return deco_labels

+def get_smallest_nonproj_arc_slow(heads):
+    cdef vector[int] c_heads = _heads_to_c(heads)
+    return _get_smallest_nonproj_arc(c_heads)

-def _get_smallest_nonproj_arc(heads):
+
+cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
    # return the smallest non-proj arc or None
    # where size is defined as the distance between dep and head
    # and ties are broken left to right
-    smallest_size = float('inf')
-    smallest_np_arc = None
-    for tokenid, head in enumerate(heads):
+    cdef int smallest_size = INT_MAX
+    cdef int smallest_np_arc = -1
+    cdef int size
+    cdef int tokenid
+    cdef int head
+
+    for tokenid in range(heads.size()):
+        head = heads[tokenid]
        size = abs(tokenid-head)
-        if size < smallest_size and is_nonproj_arc(tokenid, heads):
+        if size < smallest_size and _is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc


-def _lift(tokenid, heads):
+cpdef int _lift(tokenid, heads):
    # reattaches a word to it's grandfather
    head = heads[tokenid]
    ghead = heads[head]
+    cdef int new_head = ghead if head != ghead else tokenid
    # attach to ghead if head isn't attached to root else attach to root
-    heads[tokenid] = ghead if head != ghead else tokenid
+    heads[tokenid] = new_head
+    return new_head


 def _find_new_head(token, headlabel):
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab):
    assert nonproj.is_decorated("X") is False
    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]
-    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
-    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
+    assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7
+    assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
    # fmt: off
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]