spaCy/spacy/pipeline/_parser_internals/nonproj.pyx
Matthew Honnibal 5bebbf7550
Python 3.13 support (#13823)
In order to support Python 3.13, we had to migrate to Cython 3.0. This caused some tricky interaction with our Pydantic usage, because Cython 3 uses the from __future__ import annotations semantics, which causes type annotations to be saved as strings.

The end result is that we can't have Language.factory decorated functions in Cython modules anymore, as the Language.factory decorator expects to inspect the signature of the functions and build a Pydantic model. If the function is implemented in Cython, an error is raised because the type is not resolved.

To address this I've moved the factory functions into a new module, spacy.pipeline.factories. I've added __getattr__ importlib hooks to the previous locations, in case anyone was importing these functions directly. The change should have no backwards compatibility implications.

Along the way I've also refactored the registration of functions for the config. Previously these ran as import-time side-effects, using the registry decorator. I've created instead a new module spacy.registrations. When the registry is accessed it calls a function ensure_populated(), which cases the registrations to occur.

I've made a similar change to the Language.factory registrations in the new spacy.pipeline.factories module.

I want to remove these import-time side-effects so that we can speed up the loading time of the library, which can be especially painful on the CLI. I also find that I'm often working to track down the implementations of functions referenced by strings in the config. Having the registrations all happen in one place will make this easier.

With these changes I've fortunately avoided the need to migrate to Pydantic v2 properly --- we're still using the v1 compatibility shim. We might not be able to hold out forever though: Pydantic (reasonably) aren't actively supporting the v1 shims. I put a lot of work into v2 migration when investigating the 3.13 support, and it's definitely challenging. In any case, it's a relief that we don't have to do the v2 migration at the same time as the Cython 3.0/Python 3.13 support.
2025-05-22 13:47:21 +02:00

259 lines
8.5 KiB
Cython

# cython: infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from copy import copy
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as incr
from libc.limits cimport INT_MAX
from libc.stdlib cimport abs
from libcpp cimport bool
from libcpp.string cimport string, to_string
from libcpp.unordered_set cimport unordered_set
from libcpp.vector cimport vector
from ...tokens.doc cimport Doc, set_children_from_heads
from ...errors import Errors
DELIMITER = '||'
def ancestors(tokenid, heads):
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head is None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
seen.add(ancestor)
return None
def is_nonproj_arc(tokenid, heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _is_nonproj_arc(tokenid, c_heads)
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
return False
elif head < 0: # unattached tokens cannot be non-projective
return False
cdef int start, end
if head < tokenid:
start, end = (head+1, tokenid)
else:
start, end = (tokenid+1, head)
for k in range(start, end):
if not _has_head_as_ancestor(k, head, heads):
return True
return False
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
ancestor = tokenid
cdef unordered_set[int] seen_tokens
seen_tokens.insert(ancestor)
while True:
# Reached the head or a disconnected node
if heads[ancestor] == head or heads[ancestor] < 0:
return True
# Reached the root
if heads[ancestor] == ancestor:
return False
ancestor = heads[ancestor]
result = seen_tokens.insert(ancestor)
# Found cycle
if not result.second:
raise_domain_error(heads_to_string(heads))
return False
cdef string heads_to_string(const vector[int]& heads) noexcept nogil:
cdef vector[int].const_iterator citer
cdef string cycle_str
cycle_str.append("Found cycle in dependency graph: [")
# FIXME: Rewrite using ostringstream when available in Cython.
citer = heads.const_begin()
while citer != heads.const_end():
if citer != heads.const_begin():
cycle_str.append(", ")
cycle_str.append(to_string(deref(citer)))
incr(citer)
cycle_str.append("]")
return cycle_str
def is_nonproj_tree(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
# a tree is non-projective if at least one arc is non-projective
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
def decompose(label):
return label.partition(DELIMITER)[::2]
def is_decorated(label):
return DELIMITER in label
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
example.get_aligned("DEP"))
# set the label to ROOT for each root dependent
deco_deps = [
'ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)
]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
def projectivize(heads, labels):
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
cdef int new_head
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
if smallest_np_arc == -1: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != -1:
new_head = _lift(smallest_np_arc, proj_heads)
c_proj_heads[smallest_np_arc] = new_head
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
cdef vector[int] _heads_to_c(heads):
cdef vector[int] c_heads
for head in heads:
if head is None:
c_heads.push_back(-1)
else:
assert head < len(heads)
c_heads.push_back(head)
return c_heads
cpdef deprojectivize(Doc doc):
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for i in range(doc.length):
label = doc.vocab.strings[doc.c[i].dep]
if DELIMITER in label:
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
set_children_from_heads(doc.c, 0, doc.length)
return doc
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
raise ValueError(Errors.E082.format(n_heads=len(heads),
n_proj_heads=len(proj_heads),
n_labels=len(labels)))
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
else:
deco_labels.append(labels[tokenid])
return deco_labels
def get_smallest_nonproj_arc_slow(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _get_smallest_nonproj_arc(c_heads)
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
cdef int smallest_size = INT_MAX
# -1 means its already projective.
cdef int smallest_np_arc = -1
cdef int size
cdef int tokenid
cdef int head
for tokenid in range(heads.size()):
head = heads[tokenid]
size = abs(tokenid-head)
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
cpdef int _lift(tokenid, heads):
# reattaches a word to it's grandfather
head = heads[tokenid]
ghead = heads[head]
cdef int new_head = ghead if head != ghead else tokenid
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = new_head
return new_head
def _find_new_head(token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space:
continue
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head