spaCy/spacy/tests/pipeline/test_spancat.py

from numpy.testing import assert_equal
from spacy.language import Language
from spacy.training import Example
from spacy.util import fix_random_seed, registry


# Key of the span group used throughout this module: the gold annotations
# below are stored under it and the spancat component is configured with it.
SPAN_KEY = "labeled_spans"

# Tiny training corpus as (text, annotations) pairs. Each span is written as
# (start_char, end_char, label), e.g. characters 7-17 of the first text are
# "Shaka Khan".
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
    (
        "I like London and Berlin.",
        {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}},
    ),
]


def make_get_examples(nlp):
    """Return a zero-argument callable that provides the training examples.

    Every ``TRAIN_DATA`` entry is turned into an ``Example`` once, up front,
    with ``nlp.make_doc`` creating the doc for the text. The returned callable
    hands back that same pre-built list on each call.
    """
    prepared = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]

    def get_examples():
        return prepared

    return get_examples


def test_simple_train():
    """Train a spancat pipe on the toy corpus and check its output.

    After 40 update steps the pipeline must predict exactly two spans for the
    second training text, the first being "London", and evaluation on the
    training examples must report an F-score of 1.0 for the configured key.
    """
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(nlp)
    nlp.initialize(get_examples)
    optimizer = nlp.create_optimizer()
    assert len(spancat.labels) != 0

    n_steps = 40
    for _ in range(n_steps):
        # Each step gets its own copy of the example list and a fresh,
        # otherwise unused, losses dict.
        batch = list(get_examples())
        nlp.update(batch, losses={}, drop=0.1, sgd=optimizer)

    doc = nlp("I like London and Berlin.")
    predicted = doc.spans[spancat.key]
    # The component's key and the configured key must address the same group.
    assert predicted == doc.spans[SPAN_KEY]
    assert len(predicted) == 2
    assert predicted[0].text == "London"

    scores = nlp.evaluate(get_examples())
    f_score_name = f"spans_{SPAN_KEY}_f"
    assert f_score_name in scores
    assert scores[f_score_name] == 1.0


def test_ngram_suggester(en_tokenizer):
    """Exercise the registered ``ngram_suggester.v1`` function.

    Covers three scenarios: a single n-gram size (1, 2 or 3) over docs of
    varying length, the combined sizes 1-3 with an exact expected output, and
    batches that contain some or only empty docs.
    """
    make_suggester = registry.misc.get("ngram_suggester.v1")
    sample_texts = ["a", "a b", "a b c", "a b c d", "a b c d e", "a " * 100]

    # test different n-gram lengths
    for size in (1, 2, 3):
        suggest = make_suggester(sizes=[size])
        docs = [en_tokenizer(text) for text in sample_texts]
        ngrams = suggest(docs)
        # span sizes are correct
        for bounds in ngrams.data:
            assert bounds[1] - bounds[0] == size
        # spans are within docs
        start = 0
        for i, doc in enumerate(docs):
            n_spans = ngrams.lengths[i]
            doc_spans = ngrams.dataXd[start : start + n_spans]
            seen = set()
            for span in doc_spans:
                assert 0 <= span[0] < len(doc)
                assert 0 < span[1] <= len(doc)
                seen.add((span[0], span[1]))
            # spans are unique
            assert doc_spans.shape[0] == len(seen)
            start += n_spans
        # the number of spans is correct
        expected_lengths = [max(0, len(doc) - (size - 1)) for doc in docs]
        assert_equal(ngrams.lengths, expected_lengths)

    # test 1-3-gram suggestions
    suggest = make_suggester(sizes=[1, 2, 3])
    docs = [en_tokenizer(text) for text in sample_texts[:5]]
    ngrams = suggest(docs)
    assert_equal(ngrams.lengths, [1, 3, 6, 9, 12])
    # Within each doc the expected rows are ordered by n-gram size: all
    # 1-grams, then all 2-grams, then all 3-grams.
    expected_spans = [
        # doc 0
        [0, 1],
        # doc 1
        [0, 1], [1, 2],
        [0, 2],
        # doc 2
        [0, 1], [1, 2], [2, 3],
        [0, 2], [1, 3],
        [0, 3],
        # doc 3
        [0, 1], [1, 2], [2, 3], [3, 4],
        [0, 2], [1, 3], [2, 4],
        [0, 3], [1, 4],
        # doc 4
        [0, 1], [1, 2], [2, 3], [3, 4], [4, 5],
        [0, 2], [1, 3], [2, 4], [3, 5],
        [0, 3], [1, 4], [2, 5],
    ]
    assert_equal(ngrams.data, expected_spans)

    # test some empty docs, then all empty docs
    for texts in (["", "a", ""], ["", "", ""]):
        suggest = make_suggester(sizes=[1])
        docs = [en_tokenizer(text) for text in texts]
        ngrams = suggest(docs)
        assert_equal(ngrams.lengths, [len(doc) for doc in docs])
Add SpanCategorizer component (#6747) * Draft spancat model * Add spancat model * Add test for extract_spans * Add extract_spans layer * Upd extract_spans * Add spancat model * Add test for spancat model * Upd spancat model * Update spancat component * Upd spancat * Update spancat model * Add quick spancat test * Import SpanCategorizer * Fix SpanCategorizer component * Import SpanGroup * Fix span extraction * Fix import * Fix import * Upd model * Update spancat models * Add scoring, update defaults * Update and add docs * Fix type * Update spacy/ml/extract_spans.py * Auto-format and fix import * Fix comment * Fix type * Fix type * Update website/docs/api/spancategorizer.md * Fix comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Better defense Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix labels list Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/extract_spans.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/pipeline/spancat.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Set annotations during update * Set annotations in spancat * fix imports in test * Update spacy/pipeline/spancat.py * replace MaxoutLogistic with LinearLogistic * fix config * various small fixes * remove set_annotations parameter in update * use our beloved tupley format with recent support for doc.spans * bugfix to allow renaming the default span_key (scores weren't showing up) * use different key in docs example * change defaults to better-working parameters from project (WIP) * register spacy.extract_spans.v1 for legacy purposes * Upd dev version so can build wheel * layers instead of architectures for smaller building blocks * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * 
Include additional scores from overrides in combined score weights * Parameterize spans key in scoring Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so that it's possible to evaluate multiple `spancat` components in the same pipeline. * Use the (intentionally very short) default spans key `sc` in the `SpanCategorizer` * Adjust the default score weights to include the default key * Adjust the scorer to use `spans_{spans_key}` as the prefix for the returned score * Revert addition of `attr_name` argument to `score_spans` and adjust the key in the `getter` instead. Note that for `spancat` components with a custom `span_key`, the score weights currently need to be modified manually in `[training.score_weights]` for them to be available during training. To suppress the default score weights `spans_sc_p/r/f` during training, set them to `null` in `[training.score_weights]`. * Update website/docs/api/scorer.md * Fix scorer for spans key containing underscore * Increment version * Add Spans to Evaluate CLI (#8439) * Add Spans to Evaluate CLI * Change to spans_key * Add spans per_type output Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Fix spancat GPU issues (#8455) * Fix GPU issues * Require thinc >=8.0.6 * Switch to glorot_uniform_init * Fix and test ngram suggester * Include final ngram in doc for all sizes * Fix ngrams for docs of the same length as ngram size * Handle batches of docs that result in no ngrams * Add tests Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Nirant <NirantK@users.noreply.github.com> 2021-06-24 13:35:27 +03:00			`from numpy.testing import assert_equal`
			`from spacy.language import Language`
			`from spacy.training import Example`
			`from spacy.util import fix_random_seed, registry`


			`SPAN_KEY = "labeled_spans"`

			`TRAIN_DATA = [`
			`("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),`
			`(`
			`"I like London and Berlin.",`
			`{"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}},`
			`),`
			`]`


			`def make_get_examples(nlp):`
			`train_examples = []`
			`for t in TRAIN_DATA:`
			`eg = Example.from_dict(nlp.make_doc(t[0]), t[1])`
			`train_examples.append(eg)`

			`def get_examples():`
			`return train_examples`

			`return get_examples`


			`def test_simple_train():`
			`fix_random_seed(0)`
			`nlp = Language()`
			`spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})`
			`get_examples = make_get_examples(nlp)`
			`nlp.initialize(get_examples)`
			`sgd = nlp.create_optimizer()`
			`assert len(spancat.labels) != 0`
			`for i in range(40):`
			`losses = {}`
			`nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd)`
			`doc = nlp("I like London and Berlin.")`
			`assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]`
			`assert len(doc.spans[spancat.key]) == 2`
			`assert doc.spans[spancat.key][0].text == "London"`
			`scores = nlp.evaluate(get_examples())`
			`assert f"spans_{SPAN_KEY}_f" in scores`
			`assert scores[f"spans_{SPAN_KEY}_f"] == 1.0`


			`def test_ngram_suggester(en_tokenizer):`
			`# test different n-gram lengths`
			`for size in [1, 2, 3]:`
			`ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[size])`
			`docs = [`
			`en_tokenizer(text)`
			`for text in [`
			`"a",`
			`"a b",`
			`"a b c",`
			`"a b c d",`
			`"a b c d e",`
			`"a " * 100,`
			`]`
			`]`
			`ngrams = ngram_suggester(docs)`
			`# span sizes are correct`
			`for s in ngrams.data:`
			`assert s[1] - s[0] == size`
			`# spans are within docs`
			`offset = 0`
			`for i, doc in enumerate(docs):`
			`spans = ngrams.dataXd[offset : offset + ngrams.lengths[i]]`
			`spans_set = set()`
			`for span in spans:`
			`assert 0 <= span[0] < len(doc)`
			`assert 0 < span[1] <= len(doc)`
			`spans_set.add((span[0], span[1]))`
			`# spans are unique`
			`assert spans.shape[0] == len(spans_set)`
			`offset += ngrams.lengths[i]`
			`# the number of spans is correct`
Tidy up code 2021-06-28 12:48:00 +03:00			`assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs])`
Add SpanCategorizer component (#6747) * Draft spancat model * Add spancat model * Add test for extract_spans * Add extract_spans layer * Upd extract_spans * Add spancat model * Add test for spancat model * Upd spancat model * Update spancat component * Upd spancat * Update spancat model * Add quick spancat test * Import SpanCategorizer * Fix SpanCategorizer component * Import SpanGroup * Fix span extraction * Fix import * Fix import * Upd model * Update spancat models * Add scoring, update defaults * Update and add docs * Fix type * Update spacy/ml/extract_spans.py * Auto-format and fix import * Fix comment * Fix type * Fix type * Update website/docs/api/spancategorizer.md * Fix comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Better defense Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix labels list Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/extract_spans.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/pipeline/spancat.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Set annotations during update * Set annotations in spancat * fix imports in test * Update spacy/pipeline/spancat.py * replace MaxoutLogistic with LinearLogistic * fix config * various small fixes * remove set_annotations parameter in update * use our beloved tupley format with recent support for doc.spans * bugfix to allow renaming the default span_key (scores weren't showing up) * use different key in docs example * change defaults to better-working parameters from project (WIP) * register spacy.extract_spans.v1 for legacy purposes * Upd dev version so can build wheel * layers instead of architectures for smaller building blocks * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * 
Include additional scores from overrides in combined score weights * Parameterize spans key in scoring Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so that it's possible to evaluate multiple `spancat` components in the same pipeline. * Use the (intentionally very short) default spans key `sc` in the `SpanCategorizer` * Adjust the default score weights to include the default key * Adjust the scorer to use `spans_{spans_key}` as the prefix for the returned score * Revert addition of `attr_name` argument to `score_spans` and adjust the key in the `getter` instead. Note that for `spancat` components with a custom `span_key`, the score weights currently need to be modified manually in `[training.score_weights]` for them to be available during training. To suppress the default score weights `spans_sc_p/r/f` during training, set them to `null` in `[training.score_weights]`. * Update website/docs/api/scorer.md * Fix scorer for spans key containing underscore * Increment version * Add Spans to Evaluate CLI (#8439) * Add Spans to Evaluate CLI * Change to spans_key * Add spans per_type output Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Fix spancat GPU issues (#8455) * Fix GPU issues * Require thinc >=8.0.6 * Switch to glorot_uniform_init * Fix and test ngram suggester * Include final ngram in doc for all sizes * Fix ngrams for docs of the same length as ngram size * Handle batches of docs that result in no ngrams * Add tests Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Nirant <NirantK@users.noreply.github.com> 2021-06-24 13:35:27 +03:00
			`# test 1-3-gram suggestions`
			`ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1, 2, 3])`
			`docs = [`
			`en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"]`
			`]`
			`ngrams = ngram_suggester(docs)`
			`assert_equal(ngrams.lengths, [1, 3, 6, 9, 12])`
			`assert_equal(`
			`ngrams.data,`
			`[`
			`# doc 0`
			`[0, 1],`
			`# doc 1`
			`[0, 1],`
			`[1, 2],`
			`[0, 2],`
			`# doc 2`
			`[0, 1],`
			`[1, 2],`
			`[2, 3],`
			`[0, 2],`
			`[1, 3],`
			`[0, 3],`
			`# doc 3`
			`[0, 1],`
			`[1, 2],`
			`[2, 3],`
			`[3, 4],`
			`[0, 2],`
			`[1, 3],`
			`[2, 4],`
			`[0, 3],`
			`[1, 4],`
			`# doc 4`
			`[0, 1],`
			`[1, 2],`
			`[2, 3],`
			`[3, 4],`
			`[4, 5],`
			`[0, 2],`
			`[1, 3],`
			`[2, 4],`
			`[3, 5],`
			`[0, 3],`
			`[1, 4],`
			`[2, 5],`
			`],`
			`)`

			`# test some empty docs`
			`ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1])`
			`docs = [en_tokenizer(text) for text in ["", "a", ""]]`
			`ngrams = ngram_suggester(docs)`
			`assert_equal(ngrams.lengths, [len(doc) for doc in docs])`

			`# test all empty docs`
			`ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1])`
			`docs = [en_tokenizer(text) for text in ["", "", ""]]`
			`ngrams = ngram_suggester(docs)`
			`assert_equal(ngrams.lengths, [len(doc) for doc in docs])`