Add spancat_exclusive to pipeline

This commit is contained in:
Lj Miranda 2022-08-25 12:40:48 +08:00
parent 527a1818e5
commit 3d07c05cba
2 changed files with 13 additions and 8 deletions

View File

@ -13,6 +13,7 @@ from .sentencizer import Sentencizer
from .tagger import Tagger from .tagger import Tagger
from .textcat import TextCategorizer from .textcat import TextCategorizer
from .spancat import SpanCategorizer from .spancat import SpanCategorizer
from .spancat_exclusive import SpanCategorizerExclusive
from .span_ruler import SpanRuler from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec from .tok2vec import Tok2Vec
@ -31,6 +32,7 @@ __all__ = [
"SentenceRecognizer", "SentenceRecognizer",
"Sentencizer", "Sentencizer",
"SpanCategorizer", "SpanCategorizer",
"SpanCategorizerExclusive",
"SpanRuler", "SpanRuler",
"Tagger", "Tagger",
"TextCategorizer", "TextCategorizer",

View File

@ -78,7 +78,7 @@ def make_spancat(
model: Model[Tuple[List[Doc], Ragged], Floats2d], model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str, spans_key: str,
scorer: Optional[Callable], scorer: Optional[Callable],
negative_weight: Optional[float] = 1.0, negative_weight: float = 1.0,
allow_overlap: Optional[bool] = True, allow_overlap: Optional[bool] = True,
) -> "SpanCategorizerExclusive": ) -> "SpanCategorizerExclusive":
"""Create a SpanCategorizer component. The span categorizer consists of two """Create a SpanCategorizer component. The span categorizer consists of two
@ -95,7 +95,7 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the initialization and training, the component will look for spans on the
reference document under the same key. reference document under the same key.
negative_weight (Optional[float]): Multiplier for the loss terms. negative_weight (float): Multiplier for the loss terms.
Can be used to downweight the negative samples if there are too many. Can be used to downweight the negative samples if there are too many.
allow_overlap (Optional[bool]): If True the data is assumed to allow_overlap (Optional[bool]): If True the data is assumed to
contain overlapping spans. contain overlapping spans.
@ -133,7 +133,6 @@ class Ranges:
return False return False
# TODO: Documentation
class SpanCategorizerExclusive(TrainablePipe): class SpanCategorizerExclusive(TrainablePipe):
"""Pipeline component to label spans of text. """Pipeline component to label spans of text.
@ -148,7 +147,7 @@ class SpanCategorizerExclusive(TrainablePipe):
name: str = "spancat_exclusive", name: str = "spancat_exclusive",
*, *,
spans_key: str = "spans", spans_key: str = "spans",
negative_weight: Optional[float], negative_weight: float = 1.0,
scorer: Optional[Callable] = spancat_score, scorer: Optional[Callable] = spancat_score,
allow_overlap: Optional[bool] = True, allow_overlap: Optional[bool] = True,
) -> None: ) -> None:
@ -161,7 +160,7 @@ class SpanCategorizerExclusive(TrainablePipe):
During initialization and training, the component will look for During initialization and training, the component will look for
spans on the reference document under the same key. Defaults to spans on the reference document under the same key. Defaults to
`"spans"`. `"spans"`.
negative_weight (Optional[float]): Multiplier for the loss terms. negative_weight (float): Multiplier for the loss terms.
Can be used to downweight the negative samples if there are too many. Can be used to downweight the negative samples if there are too many.
scorer (Optional[Callable]): The scoring method. Defaults to scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping Scorer.score_spans for the Doc.spans[spans_key] with overlapping
@ -394,13 +393,15 @@ class SpanCategorizerExclusive(TrainablePipe):
) -> None: ) -> None:
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
nlp (Optional[Language]): The current nlp object the component is part of. nlp (Optional[Language]): The current nlp object the component is part of.
labels (Optional[List[str]]): The labels to add to the component, typically generated by the labels (Optional[List[str]]): The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples `init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data. callback is used to extract the labels from the data.
DOCS: https://spacy.io/api/spancategorizer#initialize
DOCS: https://spacy.io/api/spancategorizerexclusive#initialize
""" """
subbatch: List[Example] = [] subbatch: List[Example] = []
if labels is not None: if labels is not None:
@ -419,9 +420,11 @@ class SpanCategorizerExclusive(TrainablePipe):
# + 1 for the "no-label" category # + 1 for the "no-label" category
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels) Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
self.model.initialize(X=(docs, spans), Y=Y) self.model.initialize(X=(docs, spans), Y=Y)
# FIXME I think this branch is broken
else: else:
raise ValueError("Cannot initialize without examples.") # FIXME: Ideally we would raise an explicit error here instead of
# letting initialization fail implicitly when no examples are given.
# For now, we just copy what `spancat` does.
self.model.initialize()
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
# TODO # TODO