mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-25 05:01:02 +03:00
* Draft spancat model
* Add spancat model
* Add test for extract_spans
* Add extract_spans layer
* Upd extract_spans
* Add spancat model
* Add test for spancat model
* Upd spancat model
* Update spancat component
* Upd spancat
* Update spancat model
* Add quick spancat test
* Import SpanCategorizer
* Fix SpanCategorizer component
* Import SpanGroup
* Fix span extraction
* Fix import
* Fix import
* Upd model
* Update spancat models
* Add scoring, update defaults
* Update and add docs
* Fix type
* Update spacy/ml/extract_spans.py
* Auto-format and fix import
* Fix comment
* Fix type
* Fix type
* Update website/docs/api/spancategorizer.md
* Fix comment
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Better defense
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Fix labels list
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/ml/extract_spans.py
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/pipeline/spancat.py
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Set annotations during update
* Set annotations in spancat
* fix imports in test
* Update spacy/pipeline/spancat.py
* replace MaxoutLogistic with LinearLogistic
* fix config
* various small fixes
* remove set_annotations parameter in update
* use our beloved tupley format with recent support for doc.spans
* bugfix to allow renaming the default span_key (scores weren't showing up)
* use different key in docs example
* change defaults to better-working parameters from project (WIP)
* register spacy.extract_spans.v1 for legacy purposes
* Upd dev version so can build wheel
* layers instead of architectures for smaller building blocks
* Update website/docs/api/spancategorizer.md
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update website/docs/api/spancategorizer.md
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Include additional scores from overrides in combined score weights
* Parameterize spans key in scoring
Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so
that it's possible to evaluate multiple `spancat` components in the same
pipeline.
* Use the (intentionally very short) default spans key `sc` in the
`SpanCategorizer`
* Adjust the default score weights to include the default key
* Adjust the scorer to use `spans_{spans_key}` as the prefix for the
returned score
* Revert addition of `attr_name` argument to `score_spans` and adjust
the key in the `getter` instead.
Note that for `spancat` components with a custom `spans_key`, the score
weights currently need to be modified manually in
`[training.score_weights]` for them to be available during training. To
suppress the default score weights `spans_sc_p/r/f` during training, set
them to `null` in `[training.score_weights]`.
* Update website/docs/api/scorer.md
* Fix scorer for spans key containing underscore
* Increment version
* Add Spans to Evaluate CLI (#8439)
* Add Spans to Evaluate CLI
* Change to spans_key
* Add spans per_type output
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Fix spancat GPU issues (#8455)
* Fix GPU issues
* Require thinc >=8.0.6
* Switch to glorot_uniform_init
* Fix and test ngram suggester
* Include final ngram in doc for all sizes
* Fix ngrams for docs of the same length as ngram size
* Handle batches of docs that result in no ngrams
* Add tests
Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Nirant <NirantK@users.noreply.github.com>
61 lines
2.0 KiB
Python
61 lines
2.0 KiB
Python
from typing import Tuple, Callable
|
|
from thinc.api import Model, to_numpy
|
|
from thinc.types import Ragged, Ints1d
|
|
|
|
from ..util import registry
|
|
|
|
|
|
@registry.layers("spacy.extract_spans.v1")
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
    """Create the span extraction layer.

    The layer's input is a pair (source, spans): the source arrays and an
    array of (start, end) indices. Its output is a ragged array holding the
    extracted spans.

    RETURNS (Model[Tuple[Ragged, Ragged], Ragged]): The layer, named
        "extract_spans", with no sublayers, refs, attrs or dims.
    """
    layer = Model(
        "extract_spans",
        forward,
        init=init,
        dims={},
        attrs={},
        refs={},
        layers=[],
    )
    return layer
|
|
|
|
|
|
def init(model, X=None, Y=None):
    """Initialization hook for the layer. It does nothing: all three
    arguments are accepted and ignored.
    """
    return None
|
|
|
|
|
|
def forward(
    model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool
) -> Tuple[Ragged, Callable]:
    """Gather the rows of the source data that each span covers.

    model (Model): The layer; its ops provide allocation and scatter_add.
    source_spans (Tuple[Ragged, Ragged]): The source data and the spans.
        The span data must be 2d, with the start in column 0 and the end in
        column 1.
    is_train (bool): Not used by this layer.
    RETURNS (Tuple[Ragged, Callable]): The extracted rows, with one length
        (end - start) per span, and the backprop callback.
    """
    ops = model.ops
    source, spans = source_spans
    assert spans.dataXd.ndim == 2
    indices = _get_span_indices(ops, spans, source.lengths)
    gathered = source.dataXd[indices]
    span_sizes = spans.dataXd[:, 1] - spans.dataXd[:, 0]
    Y = Ragged(gathered, span_sizes)
    # Only the shape and lengths of the input are needed for the backward pass.
    source_shape = source.dataXd.shape
    source_lengths = source.lengths

    def backprop(dY: Ragged) -> Tuple[Ragged, Ragged]:
        # Rows that occur in several spans receive the sum of their gradients.
        dX = Ragged(ops.alloc2f(*source_shape), source_lengths)
        ops.scatter_add(dX.dataXd, indices, dY.dataXd)
        # The spans are handed back unchanged in the second slot.
        return (dX, spans)

    return Y, backprop
|
|
|
|
|
|
def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
    """Construct a flat array that has the indices we want to extract from the
    source data. For instance, if we want the spans (5, 9), (8, 10) the
    indices will be [5, 6, 7, 8, 8, 9].

    ops (Ops): Backend used to create the returned array.
    spans (Ragged): The (start, end) pairs, one group per source sequence.
        Starts and ends are relative to the beginning of their own sequence.
    lengths (Ints1d): The size of each source sequence, used to shift the
        spans of later sequences into positions in the flat source data.
    RETURNS (Ints1d): A 1d integer array of row indices. It is empty, but
        still an integer array, if the spans cover no rows at all.
    """
    spans, lengths = _ensure_cpu(spans, lengths)
    indices = []
    offset = 0
    for i, length in enumerate(lengths):
        # Shift this sequence's boundaries past the rows of earlier sequences.
        spans_i = spans[i].dataXd + offset
        for row in spans_i:
            indices.extend(range(row[0], row[1]))
        offset += length
    # Convert once from plain ints. Flattening a list of per-span arrays falls
    # back to an empty *float* array when there is nothing to concatenate, and
    # that cannot be used to index the source data.
    return ops.asarray1i(indices)
|
|
|
|
|
|
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
    """Pass the span data, the span lengths and the sequence lengths through
    to_numpy, so they can be read element by element on the CPU.

    spans (Ragged): The spans to convert.
    lengths (Ints1d): The sequence lengths to convert.
    RETURNS (Tuple[Ragged, Ints1d]): A new Ragged built from the converted
        span arrays, and the converted lengths.
    """
    span_data = to_numpy(spans.dataXd)
    span_lengths = to_numpy(spans.lengths)
    cpu_spans = Ragged(span_data, span_lengths)
    cpu_lengths = to_numpy(lengths)
    return (cpu_spans, cpu_lengths)
|