Getting scores out of beam_parser (#6684)

* clean up of ner tests
* beam_parser tests
* implement get_beam_parses and scored_parses for the dep parser
* we don't have to add the parse if there are no arcs
2025-10-25 21:21:10 +03:00 · 2021-01-07 06:28:27 +01:00 · 2021-01-07 06:28:27 +01:00 · 8c1a23209f
commit 8c1a23209f
parent 3983bc6b1e
6 changed files with 201 additions and 15 deletions
--- a/spacy/pipeline/_parser_internals/arc_eager.pxd
+++ b/spacy/pipeline/_parser_internals/arc_eager.pxd
@ -4,4 +4,4 @@ from .transition_system cimport Transition, TransitionSystem
 cdef class ArcEager(TransitionSystem):
-    pass
+    cdef get_arcs(self, StateC* state)
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -1,6 +1,7 @@
 # cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Pool, Address
 from libc.stdint cimport int32_t
 from libcpp.vector cimport vector
 from collections import defaultdict, Counter
@ -10,9 +11,9 @@ from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...training.example cimport Example
 from .stateclass cimport StateClass
-from ._state cimport StateC
+from ._state cimport StateC, ArcC
 from ...errors import Errors
 from thinc.extra.search cimport Beam
 cdef weight_t MIN_SCORE = -90000
 cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
@ -707,6 +708,28 @@ cdef class ArcEager(TransitionSystem):
                doc.c[i].dep = self.root_label
        set_children_from_heads(doc.c, 0, doc.length)
    def get_beam_parses(self, Beam beam):
        """Collect the arcs of every final state held in the beam.

        beam (Beam): The beam whose states are inspected.
        RETURNS (list): One (prob, parse) tuple per final state that has at
            least one arc. Each parse is a list of (head, child, label)
            tuples, with the label resolved through self.strings.
        """
        parses = []
        probs = beam.probs
        for i in range(beam.size):
            state = <StateC*>beam.at(i)
            # Only states for which is_final() holds contribute a parse.
            if state.is_final():
                prob = probs[i]
                parse = []
                arcs = self.get_arcs(state)
                # A final state without any arcs is skipped entirely, so no
                # empty parse is ever appended.
                if arcs:
                    for arc in arcs:
                        dep = arc["label"]
                        # Map the stored label value to its string form.
                        label = self.strings[dep]
                        parse.append((arc["head"], arc["child"], label))
                    parses.append((prob, parse))
        return parses
    cdef get_arcs(self, StateC* state):
        # Return the arcs of `state` as a Python list.
        # The vector is handed to state.get_arcs by address (presumably filled
        # in place) and then converted with list(); get_beam_parses reads the
        # resulting entries via the "head", "child" and "label" keys.
        cdef vector[ArcC] arcs
        state.get_arcs(&arcs)
        return list(arcs)
    def has_gold(self, Example eg, start=0, end=None):
        for word in eg.y[start:end]:
            if word.dep != 0:
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@ -257,7 +257,8 @@ cdef class BiluoPushDown(TransitionSystem):
                parse = []
                for j in range(state._ents.size()):
                    ent = state._ents.at(j)
-                    parse.append((ent.start, ent.end, self.strings[ent.label]))
+                    if ent.start != -1 and ent.end != -1:
                        parse.append((ent.start, ent.end, self.strings[ent.label]))
                parses.append((prob, parse))
        return parses
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config
@ -258,3 +259,20 @@ cdef class DependencyParser(Parser):
        results.update(Scorer.score_deps(examples, "dep", **kwargs))
        del results["sents_per_type"]
        return results
    def scored_parses(self, beams):
        """Sum the beam scores of heads and labels for each processed beam/doc.

        For every beam, each parse returned by self.moves.get_beam_parses adds
        its score to the entries of all arcs it contains.

        RETURNS: two lists with one dictionary per beam: the first maps
        (i, head) keys to summed scores, the second maps (i, label) keys.
        Keys that were never seen read as 0.0.
        """
        all_head_scores = []
        all_label_scores = []
        for beam in beams:
            per_head = defaultdict(float)
            per_label = defaultdict(float)
            all_head_scores.append(per_head)
            all_label_scores.append(per_label)
            scored = self.moves.get_beam_parses(beam)
            for prob, arcs in scored:
                for head, child, label in arcs:
                    per_head[(child, head)] += prob
                    per_label[(child, label)] += prob
        return all_head_scores, all_label_scores
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -1,5 +1,3 @@
 from collections import defaultdict
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import ENT_IOB
@ -305,7 +303,7 @@ def test_block_ner():
@pytest.mark.parametrize("use_upper", [True, False])
 def test_overfitting_IO(use_upper):
-    # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
+    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
@ -386,7 +384,6 @@ def test_beam_ner_scores():
    test_text = "I like London."
    doc = nlp.make_doc(test_text)
    docs = [doc]
    ner = nlp.get_pipe("beam_ner")
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
@ -423,7 +420,6 @@ def test_beam_overfitting_IO():
    # test the scores from the beam
    test_text = "I like London."
    docs = [nlp.make_doc(test_text)]
    ner = nlp.get_pipe("beam_ner")
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
@ -433,7 +429,7 @@ def test_beam_overfitting_IO():
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
-        docs2 = [nlp2(test_text)]
+        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -28,6 +28,26 @@ TRAIN_DATA = [
 ]
 # Two annotations of the same sentence that disagree on heads and deps;
 # test_beam_parser_scores trains on both of them.
 CONFLICTING_DATA = [
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [0, 0, 0, 0, 0, 0],
            "deps": ["ROOT", "nsubj", "nsubj", "cc", "conj", "punct"],
        },
    ),
 ]
 # Tolerance used by the beam score assertions in this file.
 eps = 0.01
 def test_parser_root(en_vocab):
    words = ["i", "do", "n't", "have", "other", "assistance"]
    heads = [3, 3, 3, 3, 5, 3]
@ -185,26 +205,31 @@ def test_parser_set_sent_starts(en_vocab):
            assert token.head in sent
-def test_overfitting_IO():
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-    # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
+def test_overfitting_IO(pipe_name):
    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
-    parser = nlp.add_pipe("parser")
+    parser = nlp.add_pipe(pipe_name)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
-    for i in range(100):
+    # run overfitting
    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["parser"] < 0.0001
+    assert losses[pipe_name] < 0.0001
    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[3].dep_ == "punct"
    assert doc[0].head.i == 1
    assert doc[2].head.i == 1
    assert doc[3].head.i == 1
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
@ -213,6 +238,9 @@ def test_overfitting_IO():
        assert doc2[0].dep_ == "nsubj"
        assert doc2[2].dep_ == "dobj"
        assert doc2[3].dep_ == "punct"
        assert doc2[0].head.i == 1
        assert doc2[2].head.i == 1
        assert doc2[3].head.i == 1
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
@ -226,3 +254,123 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
 def test_beam_parser_scores():
    # Check that the beam_parser pipe yields confidence values within [0, 1]
    # (up to eps) for every token/label and token/head combination
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    parser = nlp.add_pipe(
        "beam_parser",
        config={"beam_width": beam_width, "beam_density": beam_density},
    )
    train_examples = []
    for text, annotations in CONFLICTING_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        train_examples.append(example)
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
    # a handful of updates on the conflicting annotations
    for _ in range(10):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # score a single unannotated doc with the beam
    doc = nlp.make_doc("I like securities.")
    beams = parser.predict([doc])
    head_scores, label_scores = parser.scored_parses(beams)
    # only one document was processed, so look at the first entry of each list
    doc_head_scores = head_scores[0]
    doc_label_scores = label_scores[0]
    lower, upper = 0 - eps, 1 + eps
    n_tokens = len(doc)
    for token_i in range(n_tokens):
        for label in parser.labels:
            assert lower <= doc_label_scores[(token_i, label)] <= upper
        for head_i in range(n_tokens):
            assert lower <= doc_head_scores[(token_i, head_i)] <= upper
 def test_beam_overfitting_IO():
    # Simple test to try and quickly overfit the Beam dependency parser, then
    # verify the beam scores both before and after a to_disk / load round trip
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    parser = nlp.add_pipe("beam_parser", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
    # run overfitting
    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_parser"] < 0.0001
    test_text = "I like securities."
    # expected label per checked token index: 0=nsubj, 2=dobj, 3=punct
    expected_labels = {0: "nsubj", 2: "dobj", 3: "punct"}
    checked_labels = ("nsubj", "dobj", "punct")
    # the root is the token at index 1, so it is the expected head everywhere
    expected_head = 1
    checked_heads = (0, 1, 2)
    def check_beam_scores(pipeline, beam_pipe):
        # Score test_text with the given pipeline/pipe and compare against
        # the expected labels and heads.
        docs = [pipeline.make_doc(test_text)]
        beams = beam_pipe.predict(docs)
        head_scores, label_scores = beam_pipe.scored_parses(beams)
        # we only processed one document
        head_scores = head_scores[0]
        label_scores = label_scores[0]
        # NOTE: the tolerance must be absolute. A relative tolerance is scaled
        # by the expected value, so it collapses to nothing when comparing
        # against 0.0 and the check would become (nearly) exact.
        for token_i, gold_label in expected_labels.items():
            for label in checked_labels:
                expected = 1.0 if label == gold_label else 0.0
                assert label_scores[(token_i, label)] == pytest.approx(
                    expected, abs=eps
                )
            for head_i in checked_heads:
                expected = 1.0 if head_i == expected_head else 0.0
                assert head_scores[(token_i, head_i)] == pytest.approx(
                    expected, abs=eps
                )
    # test the scores from the beam
    check_beam_scores(nlp, parser)
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        parser2 = nlp2.get_pipe("beam_parser")
        # check the results again
        check_beam_scores(nlp2, parser2)
`@ -4,4 +4,4 @@ from .transition_system cimport Transition, TransitionSystem`


	`cdef class ArcEager(TransitionSystem):`	`cdef class ArcEager(TransitionSystem):`
	`pass`	`cdef get_arcs(self, StateC* state)`