mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-27 22:21:08 +03:00
* Draft spancat model
* Add spancat model
* Add test for extract_spans
* Add extract_spans layer
* Upd extract_spans
* Add spancat model
* Add test for spancat model
* Upd spancat model
* Update spancat component
* Upd spancat
* Update spancat model
* Add quick spancat test
* Import SpanCategorizer
* Fix SpanCategorizer component
* Import SpanGroup
* Fix span extraction
* Fix import
* Fix import
* Upd model
* Update spancat models
* Add scoring, update defaults
* Update and add docs
* Fix type
* Update spacy/ml/extract_spans.py
* Auto-format and fix import
* Fix comment
* Fix type
* Fix type
* Update website/docs/api/spancategorizer.md
* Fix comment
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Better defense
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Fix labels list
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/ml/extract_spans.py
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/pipeline/spancat.py
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Set annotations during update
* Set annotations in spancat
* fix imports in test
* Update spacy/pipeline/spancat.py
* replace MaxoutLogistic with LinearLogistic
* fix config
* various small fixes
* remove set_annotations parameter in update
* use our beloved tupley format with recent support for doc.spans
* bugfix to allow renaming the default span_key (scores weren't showing up)
* use different key in docs example
* change defaults to better-working parameters from project (WIP)
* register spacy.extract_spans.v1 for legacy purposes
* Upd dev version so can build wheel
* layers instead of architectures for smaller building blocks
* Update website/docs/api/spancategorizer.md
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update website/docs/api/spancategorizer.md
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Include additional scores from overrides in combined score weights
* Parameterize spans key in scoring
Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so
that it's possible to evaluate multiple `spancat` components in the same
pipeline.
* Use the (intentionally very short) default spans key `sc` in the
`SpanCategorizer`
* Adjust the default score weights to include the default key
* Adjust the scorer to use `spans_{spans_key}` as the prefix for the
returned score
* Revert addition of `attr_name` argument to `score_spans` and adjust
the key in the `getter` instead.
Note that for `spancat` components with a custom `span_key`, the score
weights currently need to be modified manually in
`[training.score_weights]` for them to be available during training. To
suppress the default score weights `spans_sc_p/r/f` during training, set
them to `null` in `[training.score_weights]`.
* Update website/docs/api/scorer.md
* Fix scorer for spans key containing underscore
* Increment version
* Add Spans to Evaluate CLI (#8439)
* Add Spans to Evaluate CLI
* Change to spans_key
* Add spans per_type output
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Fix spancat GPU issues (#8455)
* Fix GPU issues
* Require thinc >=8.0.6
* Switch to glorot_uniform_init
* Fix and test ngram suggester
* Include final ngram in doc for all sizes
* Fix ngrams for docs of the same length as ngram size
* Handle batches of docs that result in no ngrams
* Add tests
Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Nirant <NirantK@users.noreply.github.com>
55 lines
2.1 KiB
Python
55 lines
2.1 KiB
Python
from typing import List, Tuple
|
|
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
|
|
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
|
|
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
|
|
from thinc.types import Ragged, Floats2d
|
|
|
|
from ...util import registry
|
|
from ...tokens import Doc
|
|
from ..extract_spans import extract_spans
|
|
|
|
|
|
@registry.layers.register("spacy.LinearLogistic.v1")
def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
    """Construct an output layer for multi-label classification: a linear
    layer whose result is passed through a logistic activation.

    nO: Output width handed to the linear layer (may be None).
    nI: Input width handed to the linear layer (may be None).
    RETURNS (Model[Floats2d, Floats2d]): The chained linear + logistic model.
    """
    # Weights of the linear layer are initialised with glorot_uniform_init.
    projection = Linear(nO=nO, nI=nI, init_W=glorot_uniform_init)
    activation = Logistic()
    return chain(projection, activation)
|
|
|
|
|
|
@registry.layers.register("spacy.mean_max_reducer.v1")
def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their last, first, mean-pooled and
    max-pooled vectors, and then combine the concatenated vectors with a
    hidden layer.

    hidden_size (int): Output width of the Maxout hidden layer.
    RETURNS (Model[Ragged, Floats2d]): The reducer model.
    """
    # NOTE: despite the registered name, four reductions are concatenated
    # (in the order last, first, mean, max), not only mean and max.
    return chain(
        concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()),
        # Normalisation is enabled and dropout is disabled for the hidden layer.
        Maxout(nO=hidden_size, normalize=True, dropout=0.0),
    )
|
|
|
|
|
|
@registry.architectures.register("spacy.SpanCategorizer.v1")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Assemble a span categorizer from three parts: a token-to-vector model,
    a reducer that maps the sequence of vectors for each span down to a single
    vector, and a scorer that maps those vectors to probabilities.

    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    RETURNS (Model[Tuple[List[Doc], Ragged], Floats2d]): The combined model,
        with the three sub-models registered as the refs "tok2vec",
        "reducer" and "scorer".
    """
    # Run tok2vec followed by list2ragged on item 0 of the input only.
    embed_docs = with_getitem(0, chain(tok2vec, list2ragged()))
    spancat = chain(embed_docs, extract_spans(), reducer, scorer)
    # Register each sub-model under a fixed ref name on the combined model.
    named_layers = (
        ("tok2vec", tok2vec),
        ("reducer", reducer),
        ("scorer", scorer),
    )
    for ref_name, layer in named_layers:
        spancat.set_ref(ref_name, layer)
    return spancat
|