mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
b0228d8ea6
* chore: add cython-linter dev dependency * fix: lexeme.pyx * fix: morphology.pxd * fix: tokenizer.pxd * fix: vocab.pxd * fix: morphology.pxd (line length) * ci: add cython-lint * ci: fix cython-lint call * Fix kb/candidate.pyx. * Fix kb/kb.pyx. * Fix kb/kb_in_memory.pyx. * Fix kb. * Fix training/ partially. * Fix training/. Ignore trailing whitespaces and too long lines. * Fix ml/. * Fix matcher/. * Fix pipeline/. * Fix tokens/. * Fix build errors. Fix vocab.pyx. * Fix cython-lint install and run. * Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution. * Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues. * Make cython-lint install conditional. Fix tokenizer.pyx. * Fix remaining files. Reenable cython-lint check. * Readded parentheses. * Fix test_build_dependencies(). * Add explanatory comment to cython-lint execution. --------- Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
259 lines
8.4 KiB
Cython
259 lines
8.4 KiB
Cython
# cython: profile=True, infer_types=True
|
|
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
|
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
|
scheme.
|
|
"""
|
|
from copy import copy
|
|
|
|
from cython.operator cimport dereference as deref
|
|
from cython.operator cimport preincrement as incr
|
|
from libc.limits cimport INT_MAX
|
|
from libc.stdlib cimport abs
|
|
from libcpp cimport bool
|
|
from libcpp.string cimport string, to_string
|
|
from libcpp.unordered_set cimport unordered_set
|
|
from libcpp.vector cimport vector
|
|
|
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
|
|
|
from ...errors import Errors
|
|
|
|
DELIMITER = '||'
|
|
|
|
|
|
def ancestors(tokenid, heads):
|
|
# Returns all words going from the word up the path to the root. The path
|
|
# to root cannot be longer than the number of words in the sentence. This
|
|
# function ends after at most len(heads) steps, because it would otherwise
|
|
# loop indefinitely on cycles.
|
|
head = tokenid
|
|
cnt = 0
|
|
while heads[head] != head and cnt < len(heads):
|
|
head = heads[head]
|
|
cnt += 1
|
|
yield head
|
|
if head is None:
|
|
break
|
|
|
|
|
|
def contains_cycle(heads):
|
|
# in an acyclic tree, the path from each word following the head relation
|
|
# upwards always ends at the root node
|
|
for tokenid in range(len(heads)):
|
|
seen = set([tokenid])
|
|
for ancestor in ancestors(tokenid, heads):
|
|
if ancestor in seen:
|
|
return seen
|
|
seen.add(ancestor)
|
|
return None
|
|
|
|
|
|
def is_nonproj_arc(tokenid, heads):
|
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
|
return _is_nonproj_arc(tokenid, c_heads)
|
|
|
|
|
|
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
|
|
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
|
# if there is a token k, h < k < d such that h is not
|
|
# an ancestor of k. Same for h -> d, h > d
|
|
head = heads[tokenid]
|
|
if head == tokenid: # root arcs cannot be non-projective
|
|
return False
|
|
elif head < 0: # unattached tokens cannot be non-projective
|
|
return False
|
|
|
|
cdef int start, end
|
|
if head < tokenid:
|
|
start, end = (head+1, tokenid)
|
|
else:
|
|
start, end = (tokenid+1, head)
|
|
for k in range(start, end):
|
|
if not _has_head_as_ancestor(k, head, heads):
|
|
return True
|
|
return False
|
|
|
|
|
|
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
|
|
ancestor = tokenid
|
|
cdef unordered_set[int] seen_tokens
|
|
seen_tokens.insert(ancestor)
|
|
while True:
|
|
# Reached the head or a disconnected node
|
|
if heads[ancestor] == head or heads[ancestor] < 0:
|
|
return True
|
|
# Reached the root
|
|
if heads[ancestor] == ancestor:
|
|
return False
|
|
ancestor = heads[ancestor]
|
|
result = seen_tokens.insert(ancestor)
|
|
# Found cycle
|
|
if not result.second:
|
|
raise_domain_error(heads_to_string(heads))
|
|
|
|
return False
|
|
|
|
|
|
cdef string heads_to_string(const vector[int]& heads) nogil:
|
|
cdef vector[int].const_iterator citer
|
|
cdef string cycle_str
|
|
|
|
cycle_str.append("Found cycle in dependency graph: [")
|
|
|
|
# FIXME: Rewrite using ostringstream when available in Cython.
|
|
citer = heads.const_begin()
|
|
while citer != heads.const_end():
|
|
if citer != heads.const_begin():
|
|
cycle_str.append(", ")
|
|
cycle_str.append(to_string(deref(citer)))
|
|
incr(citer)
|
|
cycle_str.append("]")
|
|
|
|
return cycle_str
|
|
|
|
|
|
def is_nonproj_tree(heads):
|
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
|
# a tree is non-projective if at least one arc is non-projective
|
|
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
|
|
|
|
|
|
def decompose(label):
|
|
return label.partition(DELIMITER)[::2]
|
|
|
|
|
|
def is_decorated(label):
|
|
return DELIMITER in label
|
|
|
|
|
|
def count_decorated_labels(gold_data):
|
|
freqs = {}
|
|
for example in gold_data:
|
|
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
|
example.get_aligned("DEP"))
|
|
# set the label to ROOT for each root dependent
|
|
deco_deps = [
|
|
'ROOT' if head == i else deco_deps[i]
|
|
for i, head in enumerate(proj_heads)
|
|
]
|
|
# count label frequencies
|
|
for label in deco_deps:
|
|
if is_decorated(label):
|
|
freqs[label] = freqs.get(label, 0) + 1
|
|
return freqs
|
|
|
|
|
|
def projectivize(heads, labels):
|
|
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
|
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
|
# which encode a projective and decorated tree.
|
|
proj_heads = copy(heads)
|
|
|
|
cdef int new_head
|
|
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
|
|
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
|
if smallest_np_arc == -1: # this sentence is already projective
|
|
return proj_heads, copy(labels)
|
|
while smallest_np_arc != -1:
|
|
new_head = _lift(smallest_np_arc, proj_heads)
|
|
c_proj_heads[smallest_np_arc] = new_head
|
|
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
|
deco_labels = _decorate(heads, proj_heads, labels)
|
|
return proj_heads, deco_labels
|
|
|
|
|
|
cdef vector[int] _heads_to_c(heads):
|
|
cdef vector[int] c_heads
|
|
for head in heads:
|
|
if head is None:
|
|
c_heads.push_back(-1)
|
|
else:
|
|
assert head < len(heads)
|
|
c_heads.push_back(head)
|
|
return c_heads
|
|
|
|
|
|
cpdef deprojectivize(Doc doc):
|
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
|
# hitting a Y then make this the new head.
|
|
for i in range(doc.length):
|
|
label = doc.vocab.strings[doc.c[i].dep]
|
|
if DELIMITER in label:
|
|
new_label, head_label = label.split(DELIMITER)
|
|
new_head = _find_new_head(doc[i], head_label)
|
|
doc.c[i].head = new_head.i - i
|
|
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
|
set_children_from_heads(doc.c, 0, doc.length)
|
|
return doc
|
|
|
|
|
|
def _decorate(heads, proj_heads, labels):
|
|
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
|
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
|
|
raise ValueError(Errors.E082.format(n_heads=len(heads),
|
|
n_proj_heads=len(proj_heads),
|
|
n_labels=len(labels)))
|
|
deco_labels = []
|
|
for tokenid, head in enumerate(heads):
|
|
if head != proj_heads[tokenid]:
|
|
deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
|
|
else:
|
|
deco_labels.append(labels[tokenid])
|
|
return deco_labels
|
|
|
|
|
|
def get_smallest_nonproj_arc_slow(heads):
|
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
|
return _get_smallest_nonproj_arc(c_heads)
|
|
|
|
|
|
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
|
|
# return the smallest non-proj arc or None
|
|
# where size is defined as the distance between dep and head
|
|
# and ties are broken left to right
|
|
cdef int smallest_size = INT_MAX
|
|
# -1 means its already projective.
|
|
cdef int smallest_np_arc = -1
|
|
cdef int size
|
|
cdef int tokenid
|
|
cdef int head
|
|
|
|
for tokenid in range(heads.size()):
|
|
head = heads[tokenid]
|
|
size = abs(tokenid-head)
|
|
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
|
|
smallest_size = size
|
|
smallest_np_arc = tokenid
|
|
return smallest_np_arc
|
|
|
|
|
|
cpdef int _lift(tokenid, heads):
|
|
# reattaches a word to it's grandfather
|
|
head = heads[tokenid]
|
|
ghead = heads[head]
|
|
cdef int new_head = ghead if head != ghead else tokenid
|
|
# attach to ghead if head isn't attached to root else attach to root
|
|
heads[tokenid] = new_head
|
|
return new_head
|
|
|
|
|
|
def _find_new_head(token, headlabel):
|
|
# search through the tree starting from the head of the given token
|
|
# returns the id of the first descendant with the given label
|
|
# if there is none, return the current head (no change)
|
|
queue = [token.head]
|
|
while queue:
|
|
next_queue = []
|
|
for qtoken in queue:
|
|
for child in qtoken.children:
|
|
if child.is_space:
|
|
continue
|
|
if child == token:
|
|
continue
|
|
if child.dep_ == headlabel:
|
|
return child
|
|
next_queue.append(child)
|
|
queue = next_queue
|
|
return token.head
|