Getting scores out of beam_parser (#6684)

* clean up of ner tests
* beam_parser tests
* implement get_beam_parses and scored_parses for the dep parser
* we don't have to add the parse if there are no arcs
2025-11-01 08:27:44 +03:00 · 2021-01-07 06:28:27 +01:00 · 2021-01-07 06:28:27 +01:00 · 8c1a23209f
commit 8c1a23209f
parent 3983bc6b1e
6 changed files with 201 additions and 15 deletions
--- a/spacy/pipeline/_parser_internals/arc_eager.pxd
+++ b/spacy/pipeline/_parser_internals/arc_eager.pxd
@ -4,4 +4,4 @@ from .transition_system cimport Transition, TransitionSystem


 cdef class ArcEager(TransitionSystem):
-    pass
+    cdef get_arcs(self, StateC* state)
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -1,6 +1,7 @@
 # cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Pool, Address
 from libc.stdint cimport int32_t
+from libcpp.vector cimport vector

 from collections import defaultdict, Counter

@ -10,9 +11,9 @@ from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...training.example cimport Example
 from .stateclass cimport StateClass
-from ._state cimport StateC
-
+from ._state cimport StateC, ArcC
 from ...errors import Errors
+from thinc.extra.search cimport Beam

 cdef weight_t MIN_SCORE = -90000
 cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
@ -707,6 +708,28 @@ cdef class ArcEager(TransitionSystem):
                doc.c[i].dep = self.root_label
        set_children_from_heads(doc.c, 0, doc.length)

+    def get_beam_parses(self, Beam beam):  # collect (prob, arcs) for every final state in the beam
+        parses = []  # list of (prob, [(head, child, label), ...]) tuples
+        probs = beam.probs
+        for i in range(beam.size):
+            state = <StateC*>beam.at(i)
+            if state.is_final():  # states that are not final are skipped
+                prob = probs[i]  # probs is indexed with the same i as beam.at(i)
+                parse = []
+                arcs = self.get_arcs(state)
+                if arcs:  # a state without arcs contributes no entry at all
+                    for arc in arcs:
+                        dep = arc["label"]
+                        label = self.strings[dep]  # label value is looked up via self.strings
+                        parse.append((arc["head"], arc["child"], label))
+                    parses.append((prob, parse))
+        return parses
+
+    cdef get_arcs(self, StateC* state):  # return the arcs of `state` as a Python list
+        cdef vector[ArcC] arcs
+        state.get_arcs(&arcs)  # arcs is passed by address so the state can fill it in
+        return list(arcs)  # get_beam_parses reads "head", "child" and "label" from each entry
+
    def has_gold(self, Example eg, start=0, end=None):
        for word in eg.y[start:end]:
            if word.dep != 0:
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@ -257,6 +257,7 @@ cdef class BiluoPushDown(TransitionSystem):
                parse = []
                for j in range(state._ents.size()):
                    ent = state._ents.at(j)
+                    if ent.start != -1 and ent.end != -1:
                        parse.append((ent.start, ent.end, self.strings[ent.label]))
                parses.append((prob, parse))
        return parses
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
+from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

@ -258,3 +259,20 @@ cdef class DependencyParser(Parser):
        results.update(Scorer.score_deps(examples, "dep", **kwargs))
        del results["sents_per_type"]
        return results
+
+    def scored_parses(self, beams):
+        """Return two lists with one dictionary per beam/doc that was processed:
+        one keyed by (i, head) and the other by (i, label), holding summed scores.
+        """
+        head_scores = []
+        label_scores = []
+        for beam in beams:
+            score_head_dict = defaultdict(float)  # missing keys read as 0.0
+            score_label_dict = defaultdict(float)
+            for score, parses in self.moves.get_beam_parses(beam):
+                for head, i, label in parses:  # i is the second element of each arc tuple
+                    score_head_dict[(i, head)] += score  # summed over every parse holding this arc
+                    score_label_dict[(i, label)] += score
+            head_scores.append(score_head_dict)
+            label_scores.append(score_label_dict)
+        return head_scores, label_scores
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -1,5 +1,3 @@
-from collections import defaultdict
-
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import ENT_IOB
@ -305,7 +303,7 @@ def test_block_ner():

@pytest.mark.parametrize("use_upper", [True, False])
 def test_overfitting_IO(use_upper):
-    # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
+    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
@ -386,7 +384,6 @@ def test_beam_ner_scores():
    test_text = "I like London."
    doc = nlp.make_doc(test_text)
    docs = [doc]
-    ner = nlp.get_pipe("beam_ner")
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]

@ -423,7 +420,6 @@ def test_beam_overfitting_IO():
    # test the scores from the beam
    test_text = "I like London."
    docs = [nlp.make_doc(test_text)]
-    ner = nlp.get_pipe("beam_ner")
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
@ -433,7 +429,7 @@ def test_beam_overfitting_IO():
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
-        docs2 = [nlp2(test_text)]
+        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -28,6 +28,26 @@ TRAIN_DATA = [
 ]


+CONFLICTING_DATA = [  # the same sentence annotated with two different parses
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [0, 0, 0, 0, 0, 0],
+            "deps": ["ROOT", "nsubj", "nsubj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+eps = 0.01  # tolerance applied when asserting on beam scores
+
+
 def test_parser_root(en_vocab):
    words = ["i", "do", "n't", "have", "other", "assistance"]
    heads = [3, 3, 3, 3, 5, 3]
@ -185,26 +205,31 @@ def test_parser_set_sent_starts(en_vocab):
            assert token.head in sent


-def test_overfitting_IO():
-    # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
+    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
-    parser = nlp.add_pipe("parser")
+    parser = nlp.add_pipe(pipe_name)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
-    for i in range(100):
+    # run overfitting
+    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["parser"] < 0.0001
+    assert losses[pipe_name] < 0.0001
    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
@ -213,6 +238,9 @@ def test_overfitting_IO():
        assert doc2[0].dep_ == "nsubj"
        assert doc2[2].dep_ == "dobj"
        assert doc2[3].dep_ == "punct"
+        assert doc2[0].head.i == 1
+        assert doc2[2].head.i == 1
+        assert doc2[3].head.i == 1

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
@ -226,3 +254,123 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
+
+
+def test_beam_parser_scores():
+    # Test that we can get confidence values out of the beam_parser pipe, each within [0, 1] +/- eps
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+    }
+    parser = nlp.add_pipe("beam_parser", config=config)
+    train_examples = []
+    for text, annotations in CONFLICTING_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            parser.add_label(dep)
+    optimizer = nlp.initialize()
+
+    # update a bit with conflicting data (only 10 steps, no loss threshold is asserted)
+    for i in range(10):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # test the scores from the beam: predict() output is passed straight to scored_parses()
+    test_text = "I like securities."
+    doc = nlp.make_doc(test_text)
+    docs = [doc]
+    beams = parser.predict(docs)
+    head_scores, label_scores = parser.scored_parses(beams)
+
+    for j in range(len(doc)):
+        for label in parser.labels:
+            label_score = label_scores[0][(j, label)]  # index 0: only one doc was passed in
+            assert 0 - eps <= label_score <= 1 + eps
+        for i in range(len(doc)):
+            head_score = head_scores[0][(j, i)]  # keyed as (token index, head index)
+            assert 0 - eps <= head_score <= 1 + eps
+
+
+def test_beam_overfitting_IO():
+    # Overfit the beam dependency parser on TRAIN_DATA, then check beam scores before and after IO
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+    }
+    parser = nlp.add_pipe("beam_parser", config=config)
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            parser.add_label(dep)
+    optimizer = nlp.initialize()
+    # run overfitting
+    for i in range(150):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["beam_parser"] < 0.0001
+    # test the scores from the beam
+    test_text = "I like securities."
+    docs = [nlp.make_doc(test_text)]
+    beams = parser.predict(docs)
+    head_scores, label_scores = parser.scored_parses(beams)
+    # we only processed one document
+    head_scores = head_scores[0]
+    label_scores = label_scores[0]
+    # label annotations: 0=nsubj, 2=dobj, 3=punct; abs=eps because a relative tolerance is 0 at 0.0
+    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(0, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(2, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "punct")] == pytest.approx(1.0, abs=eps)
+    # test head annotations: the root is token at index 1
+    assert head_scores[(0, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(0, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(0, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(2, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(3, 2)] == pytest.approx(0.0, abs=eps)
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        docs2 = [nlp2.make_doc(test_text)]
+        parser2 = nlp2.get_pipe("beam_parser")
+        beams2 = parser2.predict(docs2)
+        head_scores2, label_scores2 = parser2.scored_parses(beams2)
+        # we only processed one document
+        head_scores2 = head_scores2[0]
+        label_scores2 = label_scores2[0]
+        # check the results again, with the same absolute tolerance as above
+        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(0, "punct")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+        assert label_scores2[(2, "punct")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "punct")] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(0, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(0, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(0, 2)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(2, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(2, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(2, 2)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(3, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(3, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(3, 2)] == pytest.approx(0.0, abs=eps)