Save span candidates produced by spancat suggesters (#10413)

* Add save_candidates attribute * Change spancat api * Add unit test * reimplement method to produce a list of doc * Add method to docs * Add new version tag * Add intended use to docstring * prettier formatting
2025-11-04 01:48:04 +03:00 · 2022-03-14 16:46:58 +01:00 · 2022-03-14 16:46:58 +01:00 · 2eef47dd26
commit 2eef47dd26
parent b68bf43f5b
3 changed files with 58 additions and 0 deletions
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@ -272,6 +272,24 @@ class SpanCategorizer(TrainablePipe):
        scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores
    def set_candidates(
        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
    ) -> None:
        """Use the spancat suggester to add a list of span candidates to a list of docs.
        This method is intended to be used for debugging purposes.
        docs (Iterable[Doc]): The documents to modify.
        candidates_key (str): Key of the Doc.spans dict to save the candidate spans under.
        DOCS: https://spacy.io/api/spancategorizer#set_candidates
        """
        suggester_output = self.suggester(docs, ops=self.model.ops)
        for candidates, doc in zip(suggester_output, docs):  # type: ignore
            doc.spans[candidates_key] = []
            for index in candidates.dataXd:
                doc.spans[candidates_key].append(doc[index[0] : index[1]])
    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@ -397,3 +397,25 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}
    nlp.update(train_examples, sgd=optimizer)
 def test_set_candidates():
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    nlp.initialize(get_examples=lambda: train_examples)
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    docs = [nlp(text) for text in texts]
    spancat.set_candidates(docs)
    assert len(docs) == len(texts)
    assert type(docs[0].spans["candidates"]) == SpanGroup
    assert len(docs[0].spans["candidates"]) == 9
    assert docs[0].spans["candidates"][0].text == "Just"
    assert docs[0].spans["candidates"][4].text == "Just a"
--- a/website/docs/api/spancategorizer.md
+++ b/website/docs/api/spancategorizer.md
@ -239,6 +239,24 @@ Delegates to [`predict`](/api/spancategorizer#predict) and
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 ## SpanCategorizer.set_candidates {#set_candidates tag="method", new="3.3"}
 Use the suggester to add a list of [`Span`](/api/span) candidates to a list of
 [`Doc`](/api/doc) objects. This method is intended to be used for debugging
 purposes.
 > #### Example
 >
 > ```python
 > spancat = nlp.add_pipe("spancat")
 > spancat.set_candidates(docs, "candidates")
 > ```
 | Name             | Description                                                          |
 | ---------------- | -------------------------------------------------------------------- |
 | `docs`           | The documents to modify. ~~Iterable[Doc]~~                           |
 | `candidates_key` | Key of the Doc.spans dict to save the candidate spans under. ~~str~~ |
 ## SpanCategorizer.get_loss {#get_loss tag="method"}
 Find the loss and gradient of loss for the batch of documents and their