From c864f12e288021283407c85e4ac24049aaed2599 Mon Sep 17 00:00:00 2001
From: kadarakos <kadar.akos@gmail.com>
Date: Mon, 6 Feb 2023 10:15:53 +0000
Subject: [PATCH] remove spancat exclusive

---
 spacy/pipeline/__init__.py          |   2 -
 spacy/pipeline/spancat_exclusive.py | 312 ----------------------------
 2 files changed, 314 deletions(-)
 delete mode 100644 spacy/pipeline/spancat_exclusive.py

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index dbdefb843..26931606b 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -13,7 +13,6 @@ from .sentencizer import Sentencizer
 from .tagger import Tagger
 from .textcat import TextCategorizer
 from .spancat import SpanCategorizer
-from .spancat_exclusive import Exclusive_SpanCategorizer
 from .span_ruler import SpanRuler
 from .textcat_multilabel import MultiLabel_TextCategorizer
 from .tok2vec import Tok2Vec
@@ -32,7 +31,6 @@ __all__ = [
     "SentenceRecognizer",
     "Sentencizer",
     "SpanCategorizer",
-    "Exclusive_SpanCategorizer",
     "SpanRuler",
     "Tagger",
     "TextCategorizer",
diff --git a/spacy/pipeline/spancat_exclusive.py b/spacy/pipeline/spancat_exclusive.py
deleted file mode 100644
index 86b9875af..000000000
--- a/spacy/pipeline/spancat_exclusive.py
+++ /dev/null
@@ -1,312 +0,0 @@
-from typing import Callable, Dict, Iterable, List, Optional, Tuple, cast
-from dataclasses import dataclass
-
-import numpy
-from thinc.api import Config, Model
-from thinc.types import Floats2d, Ints2d, Ragged
-
-from ..language import Language
-from ..tokens import Doc, Span, SpanGroup
-from ..training import Example
-from ..vocab import Vocab
-from .spancat import spancat_score, Suggester
-from .spancat import SpanCategorizer
-
-
-spancat_excl_default_config = """
-[model]
-@architectures = "spacy.SpanCategorizer.v1"
-scorer = {"@layers": "Softmax.v2"}
-
-[model.reducer]
-@layers = spacy.mean_max_reducer.v1
-hidden_size = 128
-
-[model.tok2vec]
-@architectures = "spacy.Tok2Vec.v2"
-[model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = 96
-rows = [5000, 1000, 2500, 1000]
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
-include_static_vectors = false
-
-[model.tok2vec.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v2"
-width = ${model.tok2vec.embed.width}
-window_size = 1
-maxout_pieces = 3
-depth = 4
-"""
-
-DEFAULT_EXCL_SPANCAT_MODEL = Config().from_str(spancat_excl_default_config)["model"]
-
-
-@Language.factory(
-    "spancat_exclusive",
-    assigns=["doc.spans"],
-    default_config={
-        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
-        "model": DEFAULT_EXCL_SPANCAT_MODEL,
-        "spans_key": "sc",
-        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
-        "negative_weight": 1.0,
-        "allow_overlap": True,
-    },
-    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
-)
-def make_exclusive_spancat(
-    nlp: Language,
-    name: str,
-    suggester: Suggester,
-    model: Model[Tuple[List[Doc], Ragged], Floats2d],
-    spans_key: str,
-    scorer: Optional[Callable],
-    negative_weight: float,
-    allow_overlap: bool,
-) -> "Exclusive_SpanCategorizer":
-    """Create an Exclusive_SpanCategorizer component. The span categorizer consists of two
-    parts: a suggester function that proposes candidate spans, and a labeler
-    model that predicts a maximum of one label for each span.
-
-    suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
-        Spans are returned as a ragged array with two integer columns, for the
-        start and end positions.
-    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
-        is given a list of documents and (start, end) indices representing
-        candidate span offsets. The model predicts a probability for each category
-        for each span.
-    spans_key (str): Key of the doc.spans dict to save the spans under. During
-        initialization and training, the component will look for spans on the
-        reference document under the same key.
-    scorer (Optional[Callable]): The scoring method. Defaults to
-        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
-        spans allowed.
-    negative_weight (float): Multiplier for the loss terms.
-        Can be used to downweight the negative samples if there are too many.
-    allow_overlap (bool): If True the data is assumed to
-        contain overlapping spans.
-    """
-    return Exclusive_SpanCategorizer(
-        nlp.vocab,
-        suggester=suggester,
-        model=model,
-        spans_key=spans_key,
-        negative_weight=negative_weight,
-        name=name,
-        scorer=scorer,
-        allow_overlap=allow_overlap,
-    )
-
-
-@dataclass
-class Ranges:
-    """
-    Helper class to avoid storing overlapping spans.
-    """
-
-    def __init__(self):
-        self.ranges = set()
-
-    def add(self, i, j):
-        for e in range(i, j):
-            self.ranges.add(e)
-
-    def __contains__(self, rang):
-        i, j = rang
-        for e in range(i, j):
-            if e in self.ranges:
-                return True
-        return False
-
-
-class Exclusive_SpanCategorizer(SpanCategorizer):
-    """Pipeline component to label spans with exclusive classes.
-
-    DOCS: https://spacy.io/api/spancatcategorizer
-    """
-
-    def __init__(
-        self,
-        vocab: Vocab,
-        model: Model[Tuple[List[Doc], Ragged], Floats2d],
-        suggester: Suggester,
-        name: str = "spancat_exclusive",
-        *,
-        spans_key: str = "spans",
-        negative_weight: float = 1.0,
-        allow_overlap: bool = True,
-        scorer: Optional[Callable] = spancat_score,
-    ) -> None:
-        """Initialize the exclusive span categorizer.
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
-            Spans are returned as a ragged array with two integer columns, for the
-            start and end positions.
-        name (str): The component instance name, used to add entries to the
-            losses during training.
-        spans_key (str): Key of the Doc.spans dict to save the spans under.
-            During initialization and training, the component will look for
-            spans on the reference document under the same key. Defaults to
-            `"spans"`.
-        negative_weight (float): Multiplier for the loss terms.
-            Can be used to down weigh the negative samples if there are too many.
-            Defaults to 1.0.
-        allow_overlap (bool): If True the data is assumed to
-            contain overlapping spans. Defaults to `True`.
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            Scorer.score_spans for the Doc.spans[spans_key] with overlapping
-            spans allowed.
-
-        DOCS: https://spacy.io/api/spancatcategorizer#init
-        """
-        self.cfg = {
-            "labels": [],
-            "spans_key": spans_key,
-            "negative_weight": negative_weight,
-            "allow_overlap": allow_overlap,
-        }
-        self.vocab = vocab
-        self.suggester = suggester
-        self.model = model
-        self.name = name
-        self.scorer = scorer
-
-    @property
-    def label_map(self) -> Dict[str, int]:
-        """RETURNS (Dict[str, int]): The label map."""
-        return {label: i for i, label in enumerate(self.labels)}
-
-    @property
-    def _negative_label(self) -> int:
-        """RETURNS (int): Index of the negative label."""
-        return len(self.label_data)
-
-    @property
-    def _n_labels(self) -> int:
-        """RETURNS (int): Number of labels including the negative label."""
-        return len(self.label_data) + 1
-
-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
-        """Modify a batch of Doc objects, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        indices_scores: The scores to set, produced by Exclusive_SpanCategorizer.predict.
-
-        DOCS: https://spacy.io/api/spancatcategorizer#set_annotations
-        """
-        allow_overlap = cast(bool, self.cfg["allow_overlap"])
-        labels = list(self.labels)
-        indices, scores = indices_scores
-        offset = 0
-        for i, doc in enumerate(docs):
-            indices_i = indices[i].dataXd
-            doc.spans[self.key] = self._make_span_group(
-                doc,
-                indices_i,
-                scores[offset : offset + indices.lengths[i]],
-                labels,
-                allow_overlap,
-            )
-            offset += indices.lengths[i]
-
-    def get_loss(
-        self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Floats2d]
-    ) -> Tuple[float, float]:
-        """Find the loss and gradient of loss for the batch of documents and
-        their predicted scores.
-
-        examples (Iterable[Examples]): The batch of examples.
-        spans_scores: Scores representing the model's predictions.
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://spacy.io/api/spancatcategorizer#get_loss
-        """
-        spans, scores = spans_scores
-        spans = Ragged(
-            self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
-        )
-        target = numpy.zeros(scores.shape, dtype=scores.dtype)
-        # Set negative class as target initially for all samples.
-        negative_spans = numpy.ones((scores.shape[0]))
-        offset = 0
-        for i, eg in enumerate(examples):
-            # Map (start, end) offset of spans to the row in the
-            # d_scores array, so that we can adjust the gradient
-            # for predictions that were in the gold standard.
-            spans_index = {}
-            spans_i = spans[i].dataXd
-            for j in range(spans.lengths[i]):
-                start = int(spans_i[j, 0])  # type: ignore
-                end = int(spans_i[j, 1])  # type: ignore
-                spans_index[(start, end)] = offset + j
-            for gold_span in self._get_aligned_spans(eg):
-                key = (gold_span.start, gold_span.end)
-                if key in spans_index:
-                    row = spans_index[key]
-                    k = self.label_map[gold_span.label_]
-                    target[row, k] = 1.0
-                    # delete negative label target.
-                    negative_spans[row] = 0.0
-            # The target is a flat array for all docs. Track the position
-            # we're at within the flat array.
-            offset += spans.lengths[i]
-        target = self.model.ops.asarray(target, dtype="f")  # type: ignore
-        negative_samples = numpy.nonzero(negative_spans)[0]
-        target[negative_samples, self._negative_label] = 1.0  # type: ignore
-        d_scores = scores - target
-        neg_weight = cast(float, self.cfg["negative_weight"])
-        if neg_weight != 1.0:
-            d_scores[negative_samples] *= neg_weight
-        loss = float((d_scores**2).sum())
-        return loss, d_scores
-
-    def _make_span_group(
-        self,
-        doc: Doc,
-        indices: Ints2d,
-        scores: Floats2d,
-        labels: List[str],
-        allow_overlap: bool = True,
-    ) -> SpanGroup:
-        scores = self.model.ops.to_numpy(scores)
-        indices = self.model.ops.to_numpy(indices)
-
-        # Handle cases when there are zero suggestions
-        if scores.size == 0:
-            return SpanGroup(doc, name=self.key)
-
-        predicted = scores.argmax(axis=1)
-        # Remove samples where the negative label is the argmax
-        positive = numpy.where(predicted != self._negative_label)[0]
-        predicted = predicted[positive]
-        indices = indices[positive]
-
-        # Sort spans according to argmax probability
-        if not allow_overlap and predicted.size != 0:
-            # Get the probabilities
-            argmax_probs = numpy.take_along_axis(
-                scores[positive], numpy.expand_dims(predicted, 1), axis=1
-            )
-            argmax_probs = argmax_probs.squeeze()
-            sort_idx = (argmax_probs * -1).argsort()
-            predicted = predicted[sort_idx]
-            indices = indices[sort_idx]
-
-        seen = Ranges()
-        spans = SpanGroup(doc, name=self.key)
-        for i in range(len(predicted)):
-            label = predicted[i]
-            start = indices[i, 0]
-            end = indices[i, 1]
-
-            if not allow_overlap:
-                if (start, end) in seen:
-                    continue
-                else:
-                    seen.add(start, end)
-
-            spans.append(Span(doc, start, end, label=labels[label]))
-
-        return spans