mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
91acc3ea75
* Partial fix of entity linker batching * Add import * Better name * Add `use_gold_ents` option, docs * Change to v2, create stub v1, update docs etc. * Fix error type Honestly no idea what the right type to use here is. ConfigValidationError seems wrong. Maybe a NotImplementedError? * Make mypy happy * Add hacky fix for init issue * Add legacy pipeline entity linker * Fix references to class name * Add __init__.py for legacy * Attempted fix for loss issue * Remove placeholder V1 * formatting * slightly more interesting train data * Handle batches with no usable examples This adds a test for batches that have docs but not entities, and a check in the component that detects such cases and skips the update step as thought the batch were empty. * Remove todo about data verification Check for empty data was moved further up so this should be OK now - the case in question shouldn't be possible. * Fix gradient calculation The model doesn't know which entities are not in the kb, so it generates embeddings for the context of all of them. However, the loss does know which entities aren't in the kb, and it ignores them, as there's no sensible gradient. This has the issue that the gradient will not be calculated for some of the input embeddings, which causes a dimension mismatch in backprop. That should have caused a clear error, but with numpyops it was causing nans to happen, which is another problem that should be addressed separately. This commit changes the loss to give a zero gradient for entities not in the kb. * add failing test for v1 EL legacy architecture * Add nasty but simple working check for legacy arch * Clarify why init hack works the way it does * Clarify use_gold_ents use case * Fix use gold ents related handling * Add tests for no gold ents and fix other tests * Use aligned ents function (not working) This doesn't actually work because the "aligned" ents are gold-only. 
But if I have a different function that returns the intersection, *then* this will work as desired. * Use proper matching ent check This changes the process when gold ents are not used so that the intersection of ents in the pred and gold is used. * Move get_matching_ents to Example * Use model attribute to check for legacy arch * Rename flag * bump spacy-legacy to lower 3.0.9 Co-authored-by: svlandeg <svlandeg@github.com>
67 lines
2.3 KiB
Python
67 lines
2.3 KiB
Python
from typing import Tuple, Callable
|
|
from thinc.api import Model, to_numpy
|
|
from thinc.types import Ragged, Ints1d
|
|
|
|
from ..util import registry
|
|
|
|
|
|
@registry.layers("spacy.extract_spans.v1")
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
    """Build the span-extraction layer.

    The layer takes a (source, spans) pair of ragged arrays, where the spans
    hold (start, end) indices into the source, and produces a ragged array
    containing the rows selected by those spans. It is constructed without
    sublayers, references, attributes or dimensions.
    """
    return Model(
        "extract_spans",
        forward,
        init=init,
        layers=[],
        refs={},
        attrs={},
        dims={},
    )
|
|
|
|
|
|
def init(model, X=None, Y=None):
    """Initialization hook for the layer; intentionally does nothing.

    The arguments are accepted and ignored, and the function returns None.
    """
|
|
|
|
|
|
def forward(
    model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool
) -> Tuple[Ragged, Callable]:
    """Pull the rows covered by each span out of the source array.

    model: The layer instance; only its `ops` are used here.
    source_spans: A (source, spans) pair. `spans.dataXd` must be
        two-dimensional; columns 0 and 1 of each row are read as the start
        and end of a span.
    is_train: Not used by this layer.
    RETURNS: The extracted rows as a Ragged whose lengths are `end - start`
        for each span, together with the callback that backpropagates
        through the extraction.
    """
    ops = model.ops
    source, spans = source_spans
    assert spans.dataXd.ndim == 2
    # Kept for the backward pass, which rebuilds an array shaped like the input.
    source_shape = source.dataXd.shape
    source_lengths = source.lengths
    indices = _get_span_indices(ops, spans, source_lengths)
    if len(indices) == 0:
        # No rows were selected: return a zero-filled array with the source's
        # shape and dtype, paired with one zero length per source sequence.
        # NOTE(review): presumably this keeps downstream layers working when
        # a batch has no spans (e.g. during initialization) -- confirm.
        Y = Ragged(
            ops.xp.zeros(source_shape, dtype=source.dataXd.dtype),
            ops.xp.zeros((len(source_lengths),), dtype="i"),
        )
    else:
        Y = Ragged(source.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]

    def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:
        # The same source row can be selected by more than one span, so the
        # incoming gradients are accumulated into a zeroed buffer rather
        # than assigned.
        grad = Ragged(ops.alloc2f(*source_shape), source_lengths)
        ops.scatter_add(grad.dataXd, indices, dY.dataXd)  # type: ignore[arg-type]
        # The spans are handed back unchanged as the second element.
        return (grad, spans)

    return Y, backprop_windows
|
|
|
|
|
|
def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
    """Build the flat array of source-row indices selected by the spans.

    Each span contributes the range start..end-1, and the ranges are
    concatenated in span order. For example, the spans (5, 9) and (8, 10)
    give [5, 6, 7, 8, 8, 9]. Span positions are shifted by the summed lengths
    of the preceding sequences, so they index into the concatenated data.

    ops: Backend used to create the ranges and flatten them.
    spans: Ragged array with one group of (start, end) rows per sequence.
    lengths: Length of each source sequence.
    RETURNS: A one-dimensional integer array (also when there are no spans).
    """
    spans, lengths = _ensure_cpu(spans, lengths)
    ranges = []
    seq_start = 0
    for seq_idx, seq_length in enumerate(lengths):
        # Shift this sequence's spans to positions in the flattened data.
        shifted = spans[seq_idx].dataXd + seq_start
        ranges.extend(
            ops.xp.arange(row[0], row[1])  # type: ignore[call-overload, index]
            for row in shifted
        )
        seq_start += seq_length
    return ops.flatten(ranges, dtype="i", ndim_if_empty=1)
|
|
|
|
|
|
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
    """Return copies of the inputs whose arrays went through `to_numpy`.

    spans: Ragged array; both its data and its lengths are converted.
    lengths: Array of sequence lengths to convert.
    RETURNS: A new Ragged wrapping the converted span arrays, and the
        converted lengths.
    """
    numpy_spans = Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths))
    numpy_lengths = to_numpy(lengths)
    return numpy_spans, numpy_lengths
|