spaCy/spacy/ml/extract_spans.py

from typing import Tuple, Callable
from thinc.api import Model, to_numpy
from thinc.types import Ragged, Ints1d

from ..util import registry


@registry.layers("spacy.extract_spans.v1")
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
    """Create a layer that slices spans out of a batch of source arrays.

    The layer's input is a pair (source, spans): the source data as a ragged
    array and an array of (start, end) indices. Its output is a ragged array
    holding the extracted spans.

    RETURNS (Model[Tuple[Ragged, Ragged], Ragged]): The layer. It is created
        without sublayers, refs, attrs or dims.
    """
    layer: Model = Model(
        "extract_spans",
        forward,
        init=init,
        layers=[],
        refs={},
        attrs={},
        dims={},
    )
    return layer


def init(model, X=None, Y=None):
    """Initialization hook for the layer; intentionally does nothing.

    model: The model being initialized (unused).
    X: Optional sample input (unused).
    Y: Optional sample output (unused).
    """
    return None


def forward(
    model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool
) -> Tuple[Ragged, Callable]:
    """Get subsequences from source vectors.

    model (Model): The layer; only its ops are used.
    source_spans (Tuple[Ragged, Ragged]): The source data and the spans to
        extract. The span data must be 2-dimensional, with the start index in
        column 0 and the end index in column 1.
    is_train (bool): Not used by this layer.
    RETURNS (Tuple[Ragged, Callable]): The extracted rows as a ragged array
        with one entry per span (its length is end - start), and the
        backprop callback.
    """
    ops = model.ops
    source, spans = source_spans
    span_bounds = spans.dataXd
    assert span_bounds.ndim == 2
    source_lengths = source.lengths
    # Flat row indices into the source data, one run per span.
    flat_indices = _get_span_indices(ops, spans, source_lengths)
    span_sizes = span_bounds[:, 1] - span_bounds[:, 0]
    extracted = Ragged(source.dataXd[flat_indices], span_sizes)
    source_shape = source.dataXd.shape

    def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:
        # Build a gradient with the same shape and lengths as the source.
        # scatter_add is used because one source row may occur in several
        # spans, so its index can appear more than once.
        d_source = Ragged(ops.alloc2f(*source_shape), source_lengths)
        ops.scatter_add(d_source.dataXd, flat_indices, dY.dataXd)
        # The spans are handed back unchanged as the second gradient slot.
        return (d_source, spans)

    return extracted, backprop_windows


def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
    """Construct a flat array that has the indices we want to extract from the
    source data. For instance, if we want the spans (5, 9), (8, 10) the
    indices will be [5, 6, 7, 8, 8, 9].

    ops: The backend ops used to create the returned index array.
    spans (Ragged): Rows of (start, end) indices, grouped per source
        sequence. The indices of group i are shifted by the summed lengths of
        the preceding sequences so they address the flattened source data.
    lengths (Ints1d): The length of each source sequence.
    RETURNS (Ints1d): A 1d integer array with the indices covered by each
        span, in span order. It is empty (but still a valid integer index
        array) when there are no spans or all spans are empty.
    """
    # The loop below runs in Python, so work on CPU copies of the arrays.
    spans, lengths = _ensure_cpu(spans, lengths)
    indices = []
    offset = 0
    for i, length in enumerate(lengths):
        spans_i = spans[i].dataXd + offset
        for j in range(spans_i.shape[0]):
            # Collect plain ints instead of one small array per span.
            indices.extend(range(int(spans_i[j, 0]), int(spans_i[j, 1])))
        offset += length
    # A single conversion at the end also covers the "no spans" case:
    # flattening an empty list would not give a 1d integer array, which then
    # cannot be used to index the source data.
    return ops.asarray1i(indices)


def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
    """Convert the spans and lengths with to_numpy.

    spans (Ragged): The spans; both the data and the lengths are converted.
    lengths (Ints1d): The source sequence lengths.
    RETURNS (Tuple[Ragged, Ints1d]): A new Ragged built from the converted
        span arrays, and the converted lengths.
    """
    span_data = to_numpy(spans.dataXd)
    span_lengths = to_numpy(spans.lengths)
    cpu_spans = Ragged(span_data, span_lengths)
    cpu_lengths = to_numpy(lengths)
    return (cpu_spans, cpu_lengths)
Add SpanCategorizer component (#6747) * Draft spancat model * Add spancat model * Add test for extract_spans * Add extract_spans layer * Upd extract_spans * Add spancat model * Add test for spancat model * Upd spancat model * Update spancat component * Upd spancat * Update spancat model * Add quick spancat test * Import SpanCategorizer * Fix SpanCategorizer component * Import SpanGroup * Fix span extraction * Fix import * Fix import * Upd model * Update spancat models * Add scoring, update defaults * Update and add docs * Fix type * Update spacy/ml/extract_spans.py * Auto-format and fix import * Fix comment * Fix type * Fix type * Update website/docs/api/spancategorizer.md * Fix comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Better defense Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix labels list Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/extract_spans.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/pipeline/spancat.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Set annotations during update * Set annotations in spancat * fix imports in test * Update spacy/pipeline/spancat.py * replace MaxoutLogistic with LinearLogistic * fix config * various small fixes * remove set_annotations parameter in update * use our beloved tupley format with recent support for doc.spans * bugfix to allow renaming the default span_key (scores weren't showing up) * use different key in docs example * change defaults to better-working parameters from project (WIP) * register spacy.extract_spans.v1 for legacy purposes * Upd dev version so can build wheel * layers instead of architectures for smaller building blocks * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * 
Include additional scores from overrides in combined score weights * Parameterize spans key in scoring Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so that it's possible to evaluate multiple `spancat` components in the same pipeline. * Use the (intentionally very short) default spans key `sc` in the `SpanCategorizer` * Adjust the default score weights to include the default key * Adjust the scorer to use `spans_{spans_key}` as the prefix for the returned score * Revert addition of `attr_name` argument to `score_spans` and adjust the key in the `getter` instead. Note that for `spancat` components with a custom `span_key`, the score weights currently need to be modified manually in `[training.score_weights]` for them to be available during training. To suppress the default score weights `spans_sc_p/r/f` during training, set them to `null` in `[training.score_weights]`. * Update website/docs/api/scorer.md * Fix scorer for spans key containing underscore * Increment version * Add Spans to Evaluate CLI (#8439) * Add Spans to Evaluate CLI * Change to spans_key * Add spans per_type output Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Fix spancat GPU issues (#8455) * Fix GPU issues * Require thinc >=8.0.6 * Switch to glorot_uniform_init * Fix and test ngram suggester * Include final ngram in doc for all sizes * Fix ngrams for docs of the same length as ngram size * Handle batches of docs that result in no ngrams * Add tests Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Nirant <NirantK@users.noreply.github.com> 2021-06-24 13:35:27 +03:00			`from typing import Tuple, Callable`
			`from thinc.api import Model, to_numpy`
			`from thinc.types import Ragged, Ints1d`

			`from ..util import registry`


			`@registry.layers("spacy.extract_spans.v1")`
			`def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:`
			`"""Extract spans from a sequence of source arrays, as specified by an array`
			`of (start, end) indices. The output is a ragged array of the`
			`extracted spans.`
			`"""`
			`return Model(`
			`"extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init`
			`)`


			`def init(model, X=None, Y=None):`
			`pass`


			`def forward(`
			`model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool`
			`) -> Tuple[Ragged, Callable]:`
			`"""Get subsequences from source vectors."""`
			`ops = model.ops`
			`X, spans = source_spans`
			`assert spans.dataXd.ndim == 2`
			`indices = _get_span_indices(ops, spans, X.lengths)`
			`Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])`
			`x_shape = X.dataXd.shape`
			`x_lengths = X.lengths`

			`def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]:`
			`dX = Ragged(ops.alloc2f(*x_shape), x_lengths)`
			`ops.scatter_add(dX.dataXd, indices, dY.dataXd)`
			`return (dX, spans)`

			`return Y, backprop_windows`


			`def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:`
			`"""Construct a flat array that has the indices we want to extract from the`
			`source data. For instance, if we want the spans (5, 9), (8, 10) the`
			`indices will be [5, 6, 7, 8, 8, 9].`
			`"""`
			`spans, lengths = _ensure_cpu(spans, lengths)`
			`indices = []`
			`offset = 0`
			`for i, length in enumerate(lengths):`
			`spans_i = spans[i].dataXd + offset`
			`for j in range(spans_i.shape[0]):`
			`indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))`
			`offset += length`
			`return ops.flatten(indices)`


			`def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:`
			`return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))`