diff --git a/spacy/pipeline/_parser_internals/arc_eager.pxd b/spacy/pipeline/_parser_internals/arc_eager.pxd
index 3732dd1b7..b618bc587 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pxd
+++ b/spacy/pipeline/_parser_internals/arc_eager.pxd
@@ -4,4 +4,4 @@ from .transition_system cimport Transition, TransitionSystem
 
 
 cdef class ArcEager(TransitionSystem):
-    pass
+    cdef get_arcs(self, StateC* state)
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 1d92efd7b..90a70b17b 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -1,6 +1,7 @@
 # cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Pool, Address
 from libc.stdint cimport int32_t
+from libcpp.vector cimport vector
 
 from collections import defaultdict, Counter
 
@@ -10,9 +11,9 @@ from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...training.example cimport Example
 from .stateclass cimport StateClass
-from ._state cimport StateC
-
+from ._state cimport StateC, ArcC
 from ...errors import Errors
+from thinc.extra.search cimport Beam
 
 cdef weight_t MIN_SCORE = -90000
 cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
@@ -707,6 +708,28 @@ cdef class ArcEager(TransitionSystem):
                 doc.c[i].dep = self.root_label
         set_children_from_heads(doc.c, 0, doc.length)
 
+    def get_beam_parses(self, Beam beam):
+        parses = []
+        probs = beam.probs
+        for i in range(beam.size):
+            state = <StateC*>beam.at(i)
+            if state.is_final():
+                prob = probs[i]
+                parse = []
+                arcs = self.get_arcs(state)
+                if arcs:
+                    for arc in arcs:
+                        dep = arc["label"]
+                        label = self.strings[dep]
+                        parse.append((arc["head"], arc["child"], label))
+                parses.append((prob, parse))
+        return parses
+
+    cdef get_arcs(self, StateC* state):
+        cdef vector[ArcC] arcs
+        state.get_arcs(&arcs)
+        return list(arcs)
+
     def has_gold(self, Example eg, start=0, end=None):
         for word in eg.y[start:end]:
             if word.dep != 0:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index d3e58e245..d0da6ff70 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -257,7 +257,8 @@ cdef class BiluoPushDown(TransitionSystem):
                 parse = []
                 for j in range(state._ents.size()):
                     ent = state._ents.at(j)
-                    parse.append((ent.start, ent.end, self.strings[ent.label]))
+                    if ent.start != -1 and ent.end != -1:
+                        parse.append((ent.start, ent.end, self.strings[ent.label]))
                 parses.append((prob, parse))
         return parses
 
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3399ef677..1fe29eb9b 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
+from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config
 
@@ -258,3 +259,20 @@
         results.update(Scorer.score_deps(examples, "dep", **kwargs))
         del results["sents_per_type"]
         return results
+
+    def scored_parses(self, beams):
+        """Return two dictionaries with scores for each beam/doc that was processed:
+        one containing (i, head) keys, and another containing (i, label) keys.
+        """
+        head_scores = []
+        label_scores = []
+        for beam in beams:
+            score_head_dict = defaultdict(float)
+            score_label_dict = defaultdict(float)
+            for score, parses in self.moves.get_beam_parses(beam):
+                for head, i, label in parses:
+                    score_head_dict[(i, head)] += score
+                    score_label_dict[(i, label)] += score
+            head_scores.append(score_head_dict)
+            label_scores.append(score_label_dict)
+        return head_scores, label_scores
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index fec69bf66..d110eb11c 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -1,5 +1,3 @@
-from collections import defaultdict
-
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import ENT_IOB
@@ -305,7 +303,7 @@ def test_block_ner():
 
 @pytest.mark.parametrize("use_upper", [True, False])
 def test_overfitting_IO(use_upper):
-    # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
+    # Simple test to try and quickly overfit the NER component
     nlp = English()
     ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
@@ -386,7 +384,6 @@ def test_beam_ner_scores():
     test_text = "I like London."
     doc = nlp.make_doc(test_text)
     docs = [doc]
-    ner = nlp.get_pipe("beam_ner")
     beams = ner.predict(docs)
     entity_scores = ner.scored_ents(beams)[0]
 
@@ -423,7 +420,6 @@ def test_beam_overfitting_IO():
     # test the scores from the beam
     test_text = "I like London."
    docs = [nlp.make_doc(test_text)]
-    ner = nlp.get_pipe("beam_ner")
     beams = ner.predict(docs)
     entity_scores = ner.scored_ents(beams)[0]
     assert entity_scores[(2, 3, "LOC")] == 1.0
@@ -433,7 +429,7 @@
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
-        docs2 = [nlp2(test_text)]
+        docs2 = [nlp2.make_doc(test_text)]
         ner2 = nlp2.get_pipe("beam_ner")
         beams2 = ner2.predict(docs2)
         entity_scores2 = ner2.scored_ents(beams2)[0]
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a914eb17a..e7728baaf 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -28,6 +28,26 @@ TRAIN_DATA = [
 ]
 
 
+CONFLICTING_DATA = [
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [0, 0, 0, 0, 0, 0],
+            "deps": ["ROOT", "nsubj", "nsubj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+eps = 0.01
+
+
 def test_parser_root(en_vocab):
     words = ["i", "do", "n't", "have", "other", "assistance"]
     heads = [3, 3, 3, 3, 5, 3]
@@ -185,26 +205,31 @@ def test_parser_set_sent_starts(en_vocab):
             assert token.head in sent
 
 
-def test_overfitting_IO():
-    # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
+    # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
-    parser = nlp.add_pipe("parser")
+    parser = nlp.add_pipe(pipe_name)
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
     optimizer = nlp.initialize()
-    for i in range(100):
+    # run overfitting
+    for i in range(150):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["parser"] < 0.0001
+    assert losses[pipe_name] < 0.0001
     # test the trained model
     test_text = "I like securities."
     doc = nlp(test_text)
     assert doc[0].dep_ == "nsubj"
     assert doc[2].dep_ == "dobj"
     assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
@@ -213,6 +238,9 @@
     assert doc2[0].dep_ == "nsubj"
     assert doc2[2].dep_ == "dobj"
     assert doc2[3].dep_ == "punct"
+    assert doc2[0].head.i == 1
+    assert doc2[2].head.i == 1
+    assert doc2[3].head.i == 1
 
     # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
     texts = [
@@ -226,3 +254,123 @@
     no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+
+def test_beam_parser_scores():
+    # Test that we can get confidence values out of the beam_parser pipe
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+    }
+    parser = nlp.add_pipe("beam_parser", config=config)
+    train_examples = []
+    for text, annotations in CONFLICTING_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            parser.add_label(dep)
+    optimizer = nlp.initialize()
+
+    # update a bit with conflicting data
+    for i in range(10):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # test the scores from the beam
+    test_text = "I like securities."
+    doc = nlp.make_doc(test_text)
+    docs = [doc]
+    beams = parser.predict(docs)
+    head_scores, label_scores = parser.scored_parses(beams)
+
+    for j in range(len(doc)):
+        for label in parser.labels:
+            label_score = label_scores[0][(j, label)]
+            assert 0 - eps <= label_score <= 1 + eps
+        for i in range(len(doc)):
+            head_score = head_scores[0][(j, i)]
+            assert 0 - eps <= head_score <= 1 + eps
+
+
+def test_beam_overfitting_IO():
+    # Simple test to try and quickly overfit the Beam dependency parser
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+    }
+    parser = nlp.add_pipe("beam_parser", config=config)
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            parser.add_label(dep)
+    optimizer = nlp.initialize()
+    # run overfitting
+    for i in range(150):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["beam_parser"] < 0.0001
+    # test the scores from the beam
+    test_text = "I like securities."
+    docs = [nlp.make_doc(test_text)]
+    beams = parser.predict(docs)
+    head_scores, label_scores = parser.scored_parses(beams)
+    # we only processed one document
+    head_scores = head_scores[0]
+    label_scores = label_scores[0]
+    # test label annotations: 0=nsubj, 2=dobj, 3=punct
+    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, eps)
+    assert label_scores[(0, "dobj")] == pytest.approx(0.0, eps)
+    assert label_scores[(0, "punct")] == pytest.approx(0.0, eps)
+    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, eps)
+    assert label_scores[(2, "dobj")] == pytest.approx(1.0, eps)
+    assert label_scores[(2, "punct")] == pytest.approx(0.0, eps)
+    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, eps)
+    assert label_scores[(3, "dobj")] == pytest.approx(0.0, eps)
+    assert label_scores[(3, "punct")] == pytest.approx(1.0, eps)
+    # test head annotations: the root is token at index 1
+    assert head_scores[(0, 0)] == pytest.approx(0.0, eps)
+    assert head_scores[(0, 1)] == pytest.approx(1.0, eps)
+    assert head_scores[(0, 2)] == pytest.approx(0.0, eps)
+    assert head_scores[(2, 0)] == pytest.approx(0.0, eps)
+    assert head_scores[(2, 1)] == pytest.approx(1.0, eps)
+    assert head_scores[(2, 2)] == pytest.approx(0.0, eps)
+    assert head_scores[(3, 0)] == pytest.approx(0.0, eps)
+    assert head_scores[(3, 1)] == pytest.approx(1.0, eps)
+    assert head_scores[(3, 2)] == pytest.approx(0.0, eps)
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        docs2 = [nlp2.make_doc(test_text)]
+        parser2 = nlp2.get_pipe("beam_parser")
+        beams2 = parser2.predict(docs2)
+        head_scores2, label_scores2 = parser2.scored_parses(beams2)
+        # we only processed one document
+        head_scores2 = head_scores2[0]
+        label_scores2 = label_scores2[0]
+        # check the results again
+        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, eps)
+        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, eps)
+        assert label_scores2[(0, "punct")] == pytest.approx(0.0, eps)
+        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, eps)
+        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, eps)
+        assert label_scores2[(2, "punct")] == pytest.approx(0.0, eps)
+        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, eps)
+        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, eps)
+        assert label_scores2[(3, "punct")] == pytest.approx(1.0, eps)
+        assert head_scores2[(0, 0)] == pytest.approx(0.0, eps)
+        assert head_scores2[(0, 1)] == pytest.approx(1.0, eps)
+        assert head_scores2[(0, 2)] == pytest.approx(0.0, eps)
+        assert head_scores2[(2, 0)] == pytest.approx(0.0, eps)
+        assert head_scores2[(2, 1)] == pytest.approx(1.0, eps)
+        assert head_scores2[(2, 2)] == pytest.approx(0.0, eps)
+        assert head_scores2[(3, 0)] == pytest.approx(0.0, eps)
+        assert head_scores2[(3, 1)] == pytest.approx(1.0, eps)
+        assert head_scores2[(3, 2)] == pytest.approx(0.0, eps)
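
Usage note (not part of the patch): the sketch below shows how the scored_parses API added in dep_parser.pyx can be exercised outside the test suite. It mirrors test_beam_overfitting_IO above; the training example is the well-formed entry from CONFLICTING_DATA, while the number of update steps and the printing loop at the end are illustrative assumptions rather than anything specified by this diff.

# Minimal sketch of the beam-parser scoring workflow introduced above.
# Assumes spaCy with the "beam_parser" factory from this patch; the
# training loop length and the final report are illustrative only.
from spacy.lang.en import English
from spacy.training import Example

TRAIN_DATA = [
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]

nlp = English()
parser = nlp.add_pipe("beam_parser", config={"beam_width": 16, "beam_density": 0.0001})
train_examples = []
for text, annotations in TRAIN_DATA:
    train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    for dep in annotations["deps"]:
        parser.add_label(dep)
optimizer = nlp.initialize()
for _ in range(50):
    nlp.update(train_examples, sgd=optimizer)

# predict() returns one beam per doc; scored_parses() sums the beam
# probabilities into (token, head) and (token, label) confidence dicts.
doc = nlp.make_doc("I like London and Berlin.")
beams = parser.predict([doc])
head_scores, label_scores = parser.scored_parses(beams)
for (i, head), score in sorted(head_scores[0].items()):
    print(f"token {i} -> head {head}: {score:.2f}")
for (i, label), score in sorted(label_scores[0].items()):
    print(f"token {i} -> label {label!r}: {score:.2f}")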