mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Make core projectivization functions cdef nogil (#10241)
* Make core projectivization methods cdef nogil. While profiling the parser, I noticed that a relatively large share of the time is spent in projectivization. This change rewrites the functions in the core loops as cdef nogil for efficiency. In C++-land, we use vector in place of Python lists, and absent heads are represented as -1 in place of None. * _heads_to_c: add assertion. Validation should be performed by the caller, but this assertion ensures that we are not reading/writing out of bounds with incorrect input.
This commit is contained in:
parent
30030176ee
commit
78a8bec4d0
|
@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
|
||||||
scheme.
|
scheme.
|
||||||
"""
|
"""
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
from libc.limits cimport INT_MAX
|
||||||
|
from libc.stdlib cimport abs
|
||||||
|
from libcpp cimport bool
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||||
|
|
||||||
|
@ -41,13 +45,18 @@ def contains_cycle(heads):
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_arc(tokenid, heads):
|
def is_nonproj_arc(tokenid, heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _is_nonproj_arc(tokenid, c_heads)
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
|
||||||
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
||||||
# if there is a token k, h < k < d such that h is not
|
# if there is a token k, h < k < d such that h is not
|
||||||
# an ancestor of k. Same for h -> d, h > d
|
# an ancestor of k. Same for h -> d, h > d
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
if head == tokenid: # root arcs cannot be non-projective
|
if head == tokenid: # root arcs cannot be non-projective
|
||||||
return False
|
return False
|
||||||
elif head is None: # unattached tokens cannot be non-projective
|
elif head < 0: # unattached tokens cannot be non-projective
|
||||||
return False
|
return False
|
||||||
|
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
|
@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
|
||||||
else:
|
else:
|
||||||
start, end = (tokenid+1, head)
|
start, end = (tokenid+1, head)
|
||||||
for k in range(start, end):
|
for k in range(start, end):
|
||||||
for ancestor in ancestors(k, heads):
|
if _has_head_as_ancestor(k, head, heads):
|
||||||
if ancestor is None: # for unattached tokens/subtrees
|
continue
|
||||||
break
|
|
||||||
elif ancestor == head: # normal case: k dominated by h
|
|
||||||
break
|
|
||||||
else: # head not in ancestors: d -> h is non-projective
|
else: # head not in ancestors: d -> h is non-projective
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
|
||||||
|
ancestor = tokenid
|
||||||
|
cnt = 0
|
||||||
|
while cnt < heads.size():
|
||||||
|
if heads[ancestor] == head or heads[ancestor] < 0:
|
||||||
|
return True
|
||||||
|
ancestor = heads[ancestor]
|
||||||
|
cnt += 1
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_tree(heads):
|
def is_nonproj_tree(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
# a tree is non-projective if at least one arc is non-projective
|
# a tree is non-projective if at least one arc is non-projective
|
||||||
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
|
||||||
|
|
||||||
|
|
||||||
def decompose(label):
|
def decompose(label):
|
||||||
|
@ -98,16 +117,31 @@ def projectivize(heads, labels):
|
||||||
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||||
# which encode a projective and decorated tree.
|
# which encode a projective and decorated tree.
|
||||||
proj_heads = copy(heads)
|
proj_heads = copy(heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
|
||||||
if smallest_np_arc is None: # this sentence is already projective
|
cdef int new_head
|
||||||
|
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
|
||||||
|
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
|
if smallest_np_arc == -1: # this sentence is already projective
|
||||||
return proj_heads, copy(labels)
|
return proj_heads, copy(labels)
|
||||||
while smallest_np_arc is not None:
|
while smallest_np_arc != -1:
|
||||||
_lift(smallest_np_arc, proj_heads)
|
new_head = _lift(smallest_np_arc, proj_heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
c_proj_heads[smallest_np_arc] = new_head
|
||||||
|
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
deco_labels = _decorate(heads, proj_heads, labels)
|
deco_labels = _decorate(heads, proj_heads, labels)
|
||||||
return proj_heads, deco_labels
|
return proj_heads, deco_labels
|
||||||
|
|
||||||
|
|
||||||
|
cdef vector[int] _heads_to_c(heads):
|
||||||
|
cdef vector[int] c_heads;
|
||||||
|
for head in heads:
|
||||||
|
if head == None:
|
||||||
|
c_heads.push_back(-1)
|
||||||
|
else:
|
||||||
|
assert head < len(heads)
|
||||||
|
c_heads.push_back(head)
|
||||||
|
return c_heads
|
||||||
|
|
||||||
|
|
||||||
cpdef deprojectivize(Doc doc):
|
cpdef deprojectivize(Doc doc):
|
||||||
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||||
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||||
|
@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
|
||||||
|
def get_smallest_nonproj_arc_slow(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _get_smallest_nonproj_arc(c_heads)
|
||||||
|
|
||||||
def _get_smallest_nonproj_arc(heads):
|
|
||||||
|
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
|
||||||
# return the smallest non-proj arc or None
|
# return the smallest non-proj arc, or -1 if there is none
|
||||||
# where size is defined as the distance between dep and head
|
# where size is defined as the distance between dep and head
|
||||||
# and ties are broken left to right
|
# and ties are broken left to right
|
||||||
smallest_size = float('inf')
|
cdef int smallest_size = INT_MAX
|
||||||
smallest_np_arc = None
|
cdef int smallest_np_arc = -1
|
||||||
for tokenid, head in enumerate(heads):
|
cdef int size
|
||||||
|
cdef int tokenid
|
||||||
|
cdef int head
|
||||||
|
|
||||||
|
for tokenid in range(heads.size()):
|
||||||
|
head = heads[tokenid]
|
||||||
size = abs(tokenid-head)
|
size = abs(tokenid-head)
|
||||||
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
|
||||||
smallest_size = size
|
smallest_size = size
|
||||||
smallest_np_arc = tokenid
|
smallest_np_arc = tokenid
|
||||||
return smallest_np_arc
|
return smallest_np_arc
|
||||||
|
|
||||||
|
|
||||||
def _lift(tokenid, heads):
|
cpdef int _lift(tokenid, heads):
|
||||||
# reattaches a word to its grandfather
|
# reattaches a word to its grandfather
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
ghead = heads[head]
|
ghead = heads[head]
|
||||||
|
cdef int new_head = ghead if head != ghead else tokenid
|
||||||
# attach to ghead if head isn't attached to root else attach to root
|
# attach to ghead if head isn't attached to root else attach to root
|
||||||
heads[tokenid] = ghead if head != ghead else tokenid
|
heads[tokenid] = new_head
|
||||||
|
return new_head
|
||||||
|
|
||||||
|
|
||||||
def _find_new_head(token, headlabel):
|
def _find_new_head(token, headlabel):
|
||||||
|
|
|
@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab):
|
||||||
assert nonproj.is_decorated("X") is False
|
assert nonproj.is_decorated("X") is False
|
||||||
nonproj._lift(0, tree)
|
nonproj._lift(0, tree)
|
||||||
assert tree == [2, 2, 2]
|
assert tree == [2, 2, 2]
|
||||||
assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
|
assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7
|
||||||
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
|
assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
|
||||||
# fmt: off
|
# fmt: off
|
||||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
||||||
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user