Add spancat_exclusive to pipeline

2025-07-31 10:29:46 +03:00 · 2022-08-25 12:40:48 +08:00 · 2022-08-25 12:40:48 +08:00 · 3d07c05cba
commit 3d07c05cba
parent 527a1818e5
2 changed files with 13 additions and 8 deletions
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -13,6 +13,7 @@ from .sentencizer import Sentencizer
 from .tagger import Tagger
 from .textcat import TextCategorizer
 from .spancat import SpanCategorizer
+from .spancat_exclusive import SpanCategorizerExclusive
 from .span_ruler import SpanRuler
 from .textcat_multilabel import MultiLabel_TextCategorizer
 from .tok2vec import Tok2Vec
@@ -31,6 +32,7 @@ __all__ = [
    "SentenceRecognizer",
    "Sentencizer",
    "SpanCategorizer",
+    "SpanCategorizerExclusive",
    "SpanRuler",
    "Tagger",
    "TextCategorizer",
--- a/spacy/pipeline/spancat_exclusive.py
+++ b/spacy/pipeline/spancat_exclusive.py
@@ -78,7 +78,7 @@ def make_spancat(
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    scorer: Optional[Callable],
-    negative_weight: Optional[float] = 1.0,
+    negative_weight: float = 1.0,
    allow_overlap: Optional[bool] = True,
 ) -> "SpanCategorizerExclusive":
    """Create a SpanCategorizer component. The span categorizer consists of two
@@ -95,7 +95,7 @@ def make_spancat(
    spans_key (str): Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
-    negative_weight (Optional[float]): Multiplier for the loss terms.
+    negative_weight (float): Multiplier for the loss terms.
        Can be used to down weigh the negative samples if there are too many.
    allow_overlap (Optional[bool]): If True the data is assumed to
        contain overlapping spans.
@@ -133,7 +133,6 @@ class Ranges:
        return False


-# TODO: Documentation
 class SpanCategorizerExclusive(TrainablePipe):
    """Pipeline component to label spans of text.

@@ -148,7 +147,7 @@ class SpanCategorizerExclusive(TrainablePipe):
        name: str = "spancat_exclusive",
        *,
        spans_key: str = "spans",
-        negative_weight: Optional[float],
+        negative_weight: float = 1.0,
        scorer: Optional[Callable] = spancat_score,
        allow_overlap: Optional[bool] = True,
    ) -> None:
@@ -161,7 +160,7 @@ class SpanCategorizerExclusive(TrainablePipe):
            During initialization and training, the component will look for
            spans on the reference document under the same key. Defaults to
            `"spans"`.
-        negative_weight (Optional[float]): Multiplier for the loss terms.
+        negative_weight (float): Multiplier for the loss terms.
            Can be used to down weigh the negative samples if there are too many.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the Doc.spans[spans_key] with overlapping
@@ -394,13 +393,15 @@ class SpanCategorizerExclusive(TrainablePipe):
    ) -> None:
        """Initialize the pipe for training, using a representative set
        of data examples.
+
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Optional[Language]): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
-        DOCS: https://spacy.io/api/spancategorizer#initialize
+
+        DOCS: https://spacy.io/api/spancategorizerexclusive#initialize
        """
        subbatch: List[Example] = []
        if labels is not None:
@@ -419,9 +420,11 @@ class SpanCategorizerExclusive(TrainablePipe):
            # + 1 for the "no-label" category
            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
            self.model.initialize(X=(docs, spans), Y=Y)
-        # FIXME I think this branch is broken
        else:
-            raise ValueError("Cannot initialize without examples.")
+            # FIXME: Ideally we want to raise an error to avoid implicitly
+            # raising it when initializing without examples. For now, we'll just
+            # copy over what `spancat` did.
+            self.model.initialize()

    def _validate_categories(self, examples: Iterable[Example]):
        # TODO