mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Backport parser/alignment optimizations from feature/refactor-parser
(#10952)
This commit is contained in:
parent
9738b69c0e
commit
8f1ba4de58
|
@ -1,33 +1,39 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
import numpy
|
import numpy
|
||||||
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
|
|
||||||
cdef class AlignmentArray:
|
cdef class AlignmentArray:
|
||||||
"""AlignmentArray is similar to Thinc's Ragged with two simplifications:
|
"""AlignmentArray is similar to Thinc's Ragged with two simplifications:
|
||||||
indexing returns numpy arrays and this type can only be used for CPU arrays.
|
indexing returns numpy arrays and this type can only be used for CPU arrays.
|
||||||
However, these changes make AlginmentArray more efficient for indexing in a
|
However, these changes make AlignmentArray more efficient for indexing in a
|
||||||
tight loop."""
|
tight loop."""
|
||||||
|
|
||||||
__slots__ = []
|
__slots__ = []
|
||||||
|
|
||||||
def __init__(self, alignment: List[List[int]]):
|
def __init__(self, alignment: List[List[int]]):
|
||||||
self._lengths = None
|
|
||||||
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
|
|
||||||
|
|
||||||
cdef int data_len = 0
|
cdef int data_len = 0
|
||||||
cdef int outer_len
|
cdef int outer_len
|
||||||
cdef int idx
|
cdef int idx
|
||||||
|
|
||||||
|
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
|
||||||
|
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
|
||||||
|
|
||||||
for idx, outer in enumerate(alignment):
|
for idx, outer in enumerate(alignment):
|
||||||
outer_len = len(outer)
|
outer_len = len(outer)
|
||||||
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len
|
starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
|
||||||
data_len += outer_len
|
data_len += outer_len
|
||||||
|
|
||||||
self._data = numpy.empty(data_len, dtype="i")
|
self._lengths = None
|
||||||
|
self._data = numpy.empty(data_len, dtype="int32")
|
||||||
|
|
||||||
idx = 0
|
idx = 0
|
||||||
|
cdef int32_t* data_ptr = <int32_t*>self._data.data
|
||||||
|
|
||||||
for outer in alignment:
|
for outer in alignment:
|
||||||
for inner in outer:
|
for inner in outer:
|
||||||
self._data[idx] = inner
|
data_ptr[idx] = inner
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
|
|
|
@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..tokens.token cimport MISSING_DEP
|
from ..tokens.token cimport MISSING_DEP
|
||||||
from ..util import logger, to_ternary_int
|
from ..util import logger, to_ternary_int, all_equal
|
||||||
|
|
||||||
|
|
||||||
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||||
|
@ -151,50 +151,127 @@ cdef class Example:
|
||||||
self._y_sig = y_sig
|
self._y_sig = y_sig
|
||||||
return self._cached_alignment
|
return self._cached_alignment
|
||||||
|
|
||||||
|
|
||||||
|
def _get_aligned_vectorized(self, align, gold_values):
|
||||||
|
# Fast path for Doc attributes/fields that are predominantly a single value,
|
||||||
|
# i.e., TAG, POS, MORPH.
|
||||||
|
x2y_single_toks = []
|
||||||
|
x2y_single_toks_i = []
|
||||||
|
|
||||||
|
x2y_multiple_toks = []
|
||||||
|
x2y_multiple_toks_i = []
|
||||||
|
|
||||||
|
# Gather indices of gold tokens aligned to the candidate tokens into two buckets.
|
||||||
|
# Bucket 1: All tokens that have a one-to-one alignment.
|
||||||
|
# Bucket 2: All tokens that have a one-to-many alignment.
|
||||||
|
for idx, token in enumerate(self.predicted):
|
||||||
|
aligned_gold_i = align[token.i]
|
||||||
|
aligned_gold_len = len(aligned_gold_i)
|
||||||
|
|
||||||
|
if aligned_gold_len == 1:
|
||||||
|
x2y_single_toks.append(aligned_gold_i.item())
|
||||||
|
x2y_single_toks_i.append(idx)
|
||||||
|
elif aligned_gold_len > 1:
|
||||||
|
x2y_multiple_toks.append(aligned_gold_i)
|
||||||
|
x2y_multiple_toks_i.append(idx)
|
||||||
|
|
||||||
|
# Map elements of the first bucket directly to the output array.
|
||||||
|
output = numpy.full(len(self.predicted), None)
|
||||||
|
output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze()
|
||||||
|
|
||||||
|
# Collapse many-to-one alignments into one-to-one alignments if they
|
||||||
|
# share the same value. Map to None in all other cases.
|
||||||
|
for i in range(len(x2y_multiple_toks)):
|
||||||
|
aligned_gold_values = gold_values[x2y_multiple_toks[i]]
|
||||||
|
|
||||||
|
# If all aligned tokens have the same value, use it.
|
||||||
|
if all_equal(aligned_gold_values):
|
||||||
|
x2y_multiple_toks[i] = aligned_gold_values[0].item()
|
||||||
|
else:
|
||||||
|
x2y_multiple_toks[i] = None
|
||||||
|
|
||||||
|
output[x2y_multiple_toks_i] = x2y_multiple_toks
|
||||||
|
|
||||||
|
return output.tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_aligned_non_vectorized(self, align, gold_values):
|
||||||
|
# Slower path for fields that return multiple values (resulting
|
||||||
|
# in ragged arrays that cannot be vectorized trivially).
|
||||||
|
output = [None] * len(self.predicted)
|
||||||
|
|
||||||
|
for token in self.predicted:
|
||||||
|
aligned_gold_i = align[token.i]
|
||||||
|
values = gold_values[aligned_gold_i].ravel()
|
||||||
|
if len(values) == 1:
|
||||||
|
output[token.i] = values.item()
|
||||||
|
elif all_equal(values):
|
||||||
|
# If all aligned tokens have the same value, use it.
|
||||||
|
output[token.i] = values[0].item()
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
align = self.alignment.x2y
|
align = self.alignment.x2y
|
||||||
|
gold_values = self.reference.to_array([field])
|
||||||
|
|
||||||
|
if len(gold_values.shape) == 1:
|
||||||
|
output = self._get_aligned_vectorized(align, gold_values)
|
||||||
|
else:
|
||||||
|
output = self._get_aligned_non_vectorized(align, gold_values)
|
||||||
|
|
||||||
vocab = self.reference.vocab
|
vocab = self.reference.vocab
|
||||||
gold_values = self.reference.to_array([field])
|
|
||||||
output = [None] * len(self.predicted)
|
|
||||||
for token in self.predicted:
|
|
||||||
values = gold_values[align[token.i]]
|
|
||||||
values = values.ravel()
|
|
||||||
if len(values) == 0:
|
|
||||||
output[token.i] = None
|
|
||||||
elif len(values) == 1:
|
|
||||||
output[token.i] = values[0]
|
|
||||||
elif len(set(list(values))) == 1:
|
|
||||||
# If all aligned tokens have the same value, use it.
|
|
||||||
output[token.i] = values[0]
|
|
||||||
else:
|
|
||||||
output[token.i] = None
|
|
||||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def get_aligned_parse(self, projectivize=True):
|
def get_aligned_parse(self, projectivize=True):
|
||||||
cand_to_gold = self.alignment.x2y
|
cand_to_gold = self.alignment.x2y
|
||||||
gold_to_cand = self.alignment.y2x
|
gold_to_cand = self.alignment.y2x
|
||||||
aligned_heads = [None] * self.x.length
|
|
||||||
aligned_deps = [None] * self.x.length
|
|
||||||
has_deps = [token.has_dep() for token in self.y]
|
|
||||||
has_heads = [token.has_head() for token in self.y]
|
|
||||||
heads = [token.head.i for token in self.y]
|
heads = [token.head.i for token in self.y]
|
||||||
deps = [token.dep_ for token in self.y]
|
deps = [token.dep_ for token in self.y]
|
||||||
|
|
||||||
if projectivize:
|
if projectivize:
|
||||||
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
|
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
|
||||||
|
has_deps = [token.has_dep() for token in self.y]
|
||||||
|
has_heads = [token.has_head() for token in self.y]
|
||||||
|
|
||||||
# ensure that missing data remains missing
|
# ensure that missing data remains missing
|
||||||
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
|
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
|
||||||
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
|
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
|
||||||
for cand_i in range(self.x.length):
|
|
||||||
if cand_to_gold.lengths[cand_i] == 1:
|
# Select all candidate tokens that are aligned to a single gold token.
|
||||||
gold_i = cand_to_gold[cand_i][0]
|
c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0]
|
||||||
if gold_to_cand.lengths[heads[gold_i]] == 1:
|
|
||||||
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0])
|
# Fetch all aligned gold token indices.
|
||||||
aligned_deps[cand_i] = deps[gold_i]
|
if c2g_single_toks.shape == cand_to_gold.lengths.shape:
|
||||||
return aligned_heads, aligned_deps
|
# This is the most likely case.
|
||||||
|
gold_i = cand_to_gold[:].squeeze()
|
||||||
|
else:
|
||||||
|
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze()
|
||||||
|
|
||||||
|
# Fetch indices of all gold heads for the aligned gold tokens.
|
||||||
|
heads = numpy.asarray(heads, dtype='i')
|
||||||
|
gold_head_i = heads[gold_i]
|
||||||
|
|
||||||
|
# Select all gold tokens that are heads of the previously selected
|
||||||
|
# gold tokens (and are aligned to a single candidate token).
|
||||||
|
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
|
||||||
|
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
|
||||||
|
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze()
|
||||||
|
|
||||||
|
# Update head/dep alignments with the above.
|
||||||
|
aligned_heads = numpy.full((self.x.length), None)
|
||||||
|
aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i
|
||||||
|
|
||||||
|
deps = numpy.asarray(deps)
|
||||||
|
aligned_deps = numpy.full((self.x.length), None)
|
||||||
|
aligned_deps[c2g_single_toks] = deps[gold_i]
|
||||||
|
|
||||||
|
return aligned_heads.tolist(), aligned_deps.tolist()
|
||||||
|
|
||||||
def get_aligned_sent_starts(self):
|
def get_aligned_sent_starts(self):
|
||||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
|
|
|
@ -1716,3 +1716,10 @@ def packages_distributions() -> Dict[str, List[str]]:
|
||||||
for pkg in (dist.read_text("top_level.txt") or "").split():
|
for pkg in (dist.read_text("top_level.txt") or "").split():
|
||||||
pkg_to_dist[pkg].append(dist.metadata["Name"])
|
pkg_to_dist[pkg].append(dist.metadata["Name"])
|
||||||
return dict(pkg_to_dist)
|
return dict(pkg_to_dist)
|
||||||
|
|
||||||
|
|
||||||
|
def all_equal(iterable):
|
||||||
|
"""Return True if all the elements are equal to each other
|
||||||
|
(or if the input is an empty sequence), False otherwise."""
|
||||||
|
g = itertools.groupby(iterable)
|
||||||
|
return next(g, True) and not next(g, False)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user