spaCy/spacy/tests/pipeline/test_span_finder.py

import pytest
from thinc.api import Config
from thinc.types import Ragged

from spacy.language import Language
from spacy.pipeline.span_finder import DEFAULT_PREDICTED_KEY, span_finder_default_config
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import registry

TRAINING_KEY = "pytest"


@pytest.mark.parametrize(
    "tokens_predicted, tokens_reference, reference_truths",
    [
        (
            ["Mon", ".", "-", "June", "16"],
            ["Mon.", "-", "June", "16"],
            [(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
        ),
        (
            ["Mon.", "-", "J", "une", "16"],
            ["Mon.", "-", "June", "16"],
            [(0, 0), (0, 0), (1, 0), (0, 1), (0, 0)],
        ),
        (
            ["Mon", ".", "-", "June", "16"],
            ["Mon.", "-", "June", "1", "6"],
            [(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
        ),
        (
            ["Mon.", "-J", "un", "e 16"],
            ["Mon.", "-", "June", "16"],
            [(0, 0), (0, 0), (0, 0), (0, 0)],
        ),
        pytest.param(
            ["Mon.-June", "16"],
            ["Mon.", "-", "June", "16"],
            [(0, 1), (0, 0)],
        ),
        pytest.param(
            ["Mon.-", "June", "16"],
            ["Mon.", "-", "J", "une", "16"],
            [(0, 0), (1, 1), (0, 0)],
        ),
        pytest.param(
            ["Mon.-", "June 16"],
            ["Mon.", "-", "June", "16"],
            [(0, 0), (1, 0)],
        ),
    ],
)
def test_loss_alignment_example(tokens_predicted, tokens_reference, reference_truths):
    nlp = Language()
    predicted = Doc(
        nlp.vocab, words=tokens_predicted, spaces=[False] * len(tokens_predicted)
    )
    reference = Doc(
        nlp.vocab, words=tokens_reference, spaces=[False] * len(tokens_reference)
    )
    example = Example(predicted, reference)
    example.reference.spans[TRAINING_KEY] = [example.reference.char_span(5, 9)]
    span_finder = nlp.add_pipe("span_finder", config={"training_key": TRAINING_KEY})
    nlp.initialize()

    truth_scores = span_finder._get_aligned_truth_scores([example])
    assert len(truth_scores) == len(tokens_predicted)
    assert truth_scores == reference_truths


def test_span_finder_model():
    nlp = Language()

    docs = [nlp("This is an example."), nlp("This is the second example.")]
    docs[0].spans[TRAINING_KEY] = [docs[0][3:4]]
    docs[1].spans[TRAINING_KEY] = [docs[1][3:5]]

    total_tokens = 0
    for doc in docs:
        total_tokens += len(doc)

    config = Config().from_str(span_finder_default_config).interpolate()
    model = registry.resolve(config)["model"]

    model.initialize(X=docs)
    predictions = model.predict(docs)

    assert len(predictions) == total_tokens
    assert len(predictions[0]) == 2


def test_span_finder_component():
    nlp = Language()

    docs = [nlp("This is an example."), nlp("This is the second example.")]
    docs[0].spans[TRAINING_KEY] = [docs[0][3:4]]
    docs[1].spans[TRAINING_KEY] = [docs[1][3:5]]

    span_finder = nlp.add_pipe("span_finder", config={"training_key": TRAINING_KEY})
    nlp.initialize()
    docs = list(span_finder.pipe(docs))

    # TODO: update hard-coded name
    assert "span_candidates" in docs[0].spans


@pytest.mark.parametrize(
    "min_length, max_length, span_count",
    [(0, 0, 0), (None, None, 8), (2, None, 6), (None, 1, 2), (2, 3, 2)],
)
def test_set_annotations_span_lengths(min_length, max_length, span_count):
    nlp = Language()
    doc = nlp("Me and Jenny goes together like peas and carrots.")
    if min_length == 0 and max_length == 0:
        with pytest.raises(ValueError, match="Both 'min_length' and 'max_length'"):
            span_finder = nlp.add_pipe(
                "span_finder",
                config={
                    "max_length": max_length,
                    "min_length": min_length,
                    "training_key": TRAINING_KEY,
                },
            )
        return
    span_finder = nlp.add_pipe(
        "span_finder",
        config={
            "max_length": max_length,
            "min_length": min_length,
            "training_key": TRAINING_KEY,
        },
    )
    nlp.initialize()
    # Starts    [Me, Jenny, peas]
    # Ends      [Jenny, peas, carrots]
    scores = [
        (1, 0),
        (0, 0),
        (1, 1),
        (0, 0),
        (0, 0),
        (0, 0),
        (1, 1),
        (0, 0),
        (0, 1),
        (0, 0),
    ]
    span_finder.set_annotations([doc], scores)

    assert doc.spans[DEFAULT_PREDICTED_KEY]
    assert len(doc.spans[DEFAULT_PREDICTED_KEY]) == span_count

    # Assert below will fail when max_length is set to 0
    if max_length is None:
        max_length = float("inf")
    if min_length is None:
        min_length = 1

    assert all(
        min_length <= len(span) <= max_length
        for span in doc.spans[DEFAULT_PREDICTED_KEY]
    )


def test_span_finder_suggester():
    nlp = Language()
    docs = [nlp("This is an example."), nlp("This is the second example.")]
    docs[0].spans[TRAINING_KEY] = [docs[0][3:4]]
    docs[1].spans[TRAINING_KEY] = [docs[1][3:5]]
    span_finder = nlp.add_pipe("span_finder", config={"training_key": TRAINING_KEY})
    nlp.initialize()
    span_finder.set_annotations(docs, span_finder.predict(docs))

    suggester = registry.misc.get("spacy.span_finder_suggester.v1")(
        candidates_key="span_candidates"
    )

    candidates = suggester(docs)

    span_length = 0
    for doc in docs:
        span_length += len(doc.spans["span_candidates"])

    assert span_length == len(candidates.dataXd)
    assert type(candidates) == Ragged
    assert len(candidates.dataXd[0]) == 2