Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)
Support negative examples in partial NER annotations (#8106)

* Support a cfg field in transition system
* Make NER 'has gold' check use right alignment for span
* Pass 'negative_samples_key' property into NER transition system
* Add field for negative samples to NER transition system
* Check neg_key in NER has_gold
* Support negative examples in NER oracle
* Test for negative examples in NER
* Fix name of config variable in NER
* Remove vestiges of old-style partial annotation
* Remove obsolete tests
* Add comment noting lack of support for negative samples in parser
* Additions to "neg examples" PR (#8201)
* add custom error and test for deprecated format
* add test for unlearning an entity
* add break also for Begin's cost
* add negative_samples_key property on Parser
* rename
* extend docs & fix some older docs issues
* add subclass constructors, clean up tests, fix docs
* add flaky test with ValueError if gold parse was not found
* remove ValueError if n_gold == 0
* fix docstring
* Hack in environment variables to try out training
* Remove hack
* Remove NER hack, and support 'negative O' samples
* Fix O oracle
* Fix transition parser
* Remove 'not O' from oracle
* Fix NER oracle
* check for spans in both gold.ents and gold.spans and raise if so, to prevent memory access violation
* use set instead of list in consistency check

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Parent: 02bac8f269
Commit: 6f5e308d17
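Usage sketch (not part of the diff): a minimal example of the new annotation scheme introduced by this commit, mirroring the tests added below. The span-group key "incorrect_spans" and the toy sentence are arbitrary choices; the key only has to match the component's `incorrect_spans_key` setting.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
# The key is arbitrary; it must match the span group used on the reference docs.
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})
ner.add_label("PERSON")

doc = nlp.make_doc("Who is Shaka Khan?")
example = Example.from_dict(doc, {"entities": [(7, 17, "PERSON")]})
# Negative sample: we know "Shaka" on its own is NOT a PERSON entity.
example.reference.spans["incorrect_spans"] = [Span(doc, 2, 3, "PERSON")]

optimizer = nlp.initialize()
losses = {}
nlp.update([example], sgd=optimizer, losses=losses)
```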
@@ -521,6 +521,13 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

# New errors added in v3.x
E868 = ("Found a conflicting gold annotation in a reference document, "
"with the following char-based span occurring both in the gold ents "
"as well as in the negative spans: {span}.")
E869 = ("The notation '{label}' is not supported anymore. To annotate "
"negative NER samples, use `doc.spans[key]` instead, and "
"specify the key as 'incorrect_spans_key' when constructing "
"the NER component.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
@@ -1,3 +1,5 @@
import os
import random
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool

@@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam

from ...tokens.doc cimport Doc
from ...tokens.span import Span
from ...tokens.span cimport Span
from ...typedefs cimport weight_t, attr_t
from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE
from ...structs cimport TokenC
from ...structs cimport TokenC, SpanC
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC

@@ -25,7 +28,6 @@ cdef enum:
LAST
UNIT
OUT
ISNT
N_MOVES

@@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
MOVE_NAMES[ISNT] = 'x'

cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg

cdef class BiluoGold:
cdef Pool mem
cdef GoldNERStateC c

def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
self.mem = Pool()
self.c = create_gold_state(self.mem, moves, stcls.c, example)
self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)

def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)

cdef GoldNERStateC create_gold_state(
Pool mem,
BiluoPushDown moves,
const StateC* stcls,
Example example
Example example,
neg_key
) except *:
cdef GoldNERStateC gs
cdef Span neg
if neg_key is not None:
negs = example.get_aligned_spans_y2x(
example.y.spans.get(neg_key, []),
allow_overlap=True
)
else:
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
ner_tags = example.get_aligned_ner()
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)

# Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
for pos_span in ner_ents:
if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))

# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
return gs

@@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
cdef attr_t label
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
# Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
raise ValueError(Errors.E869.format(label=name))
label = self.strings.add(label_str)
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
if move == ISNT:
return Transition(clas=0, move=ISNT, label=label, score=0)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]

@@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
label_id = label_name
if action == OUT and label_id != 0:
return None
if action == MISSING or action == ISNT:
if action == MISSING:
return None
# Check we're not creating a move we already have, so that this is
# idempotent

@@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
return parses

def init_gold(self, StateClass state, Example example):
return BiluoGold(self, state, example)
return BiluoGold(self, state, example, self.neg_key)

def has_gold(self, Example eg, start=0, end=None):
# We get x and y referring to X, we want to check relative to Y,
# the reference
y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
if not y_spans:
y_spans = [eg.y[:]]
y_span = y_spans[0]
start = y_span.start
end = y_span.end
neg_key = self.neg_key
if neg_key is not None:
# If we have any negative samples, count that as having annotation.
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True

@@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold < 1:
raise ValueError

cdef class Missing:

@@ -373,23 +405,33 @@ cdef class Begin:
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
b0 = s.B(0)
cdef int cost = 0
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label

if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# B, Gold B --> Label match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
return 1
cost += 1
if s.buffer_length() < 3:
# Handle negatives. In general we can't really do much to block
# B, because we don't know whether the whole entity is going to
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
cost += 1
break
return cost

cdef class In:

@@ -462,9 +504,6 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1

@@ -504,32 +543,41 @@ cdef class Last:
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
move = LAST
b0 = s.B(0)
ent_start = s.E(0)

cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label

cdef int cost = 0

if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# L, Gold B --> True
return 0
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
return not _entity_is_sunk(s, gold.ner)
cost += not _entity_is_sunk(s, gold.ner)
elif g_act == LAST:
# L, Gold L --> True
return 0
pass
elif g_act == OUT:
# L, Gold O --> True
return 0
pass
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
pass
else:
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cost += 1
break
return cost

cdef class Unit:

@@ -573,21 +621,29 @@ cdef class Unit:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int cost = 0

if g_act == MISSING:
return 0
pass
elif g_act == UNIT:
# U, Gold U --> True iff tag match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective.
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cost += 1
break
return cost

cdef class Out:

@@ -613,25 +669,24 @@ cdef class Out:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label

if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
cdef weight_t cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
return 1
cost += 1
elif g_act == IN:
# O, Gold I --> True
return 0
pass
elif g_act == LAST:
# O, Gold L --> True
return 0
pass
elif g_act == OUT:
# O, Gold O --> True
return 0
pass
elif g_act == UNIT:
# O, Gold U --> False
return 1
cost += 1
else:
return 1
cost += 1
return cost
@@ -41,6 +41,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef public object labels
cdef public object cfg
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state
@@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:

cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
def __init__(
self,
StringStore string_table,
labels_by_action=None,
min_freq=None,
incorrect_spans_key=None
):
self.cfg = {"neg_key": incorrect_spans_key}
self.mem = Pool()
self.strings = string_table
self.n_moves = 0

@@ -49,8 +56,13 @@ cdef class TransitionSystem:
self.del_beam_state = _del_state

def __reduce__(self):
# TODO: This loses the 'cfg'
return (self.__class__, (self.strings, self.labels), None, None)

@property
def neg_key(self):
return self.cfg.get("neg_key")

def init_batch(self, docs):
cdef StateClass state
states = []

@@ -220,16 +232,21 @@ cdef class TransitionSystem:
transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
'strings': lambda: self.strings.to_bytes(),
'cfg': lambda: self.cfg
}
return util.to_bytes(serializers, exclude)

def from_bytes(self, bytes_data, exclude=tuple()):
# We're adding a new field, 'cfg', here and we don't want to break
# previous models that don't have it.
msg = srsly.msgpack_loads(bytes_data)
labels = {}
deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'moves' not in exclude:
labels.update(srsly.json_loads(msg['moves']))
if 'strings' not in exclude:
self.strings.from_bytes(msg['strings'])
if 'cfg' not in exclude and 'cfg' in msg:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self
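A quick illustration of the new `cfg` field (a sketch, not part of the diff; the entity types and span-group key below are arbitrary examples): the negative-sample key is stored on the transition system and survives a bytes round-trip, which is what the `cfg` serialization above enables.

```python
from spacy.vocab import Vocab
from spacy.pipeline._parser_internals.ner import BiluoPushDown

vocab = Vocab()
actions = BiluoPushDown.get_actions(entity_types=["PERSON", "LOC"])
tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities")
assert tsys.neg_key == "non_entities"

# The key travels with the serialized transition system via the new 'cfg' entry.
tsys2 = BiluoPushDown(vocab.strings, actions)
tsys2.from_bytes(tsys.to_bytes())
assert tsys2.neg_key == "non_entities"
```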
@@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config

from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager

@@ -59,7 +60,7 @@ def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int

@@ -85,13 +86,13 @@ def make_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.

@@ -112,6 +113,9 @@ def make_parser(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)

@Language.factory(

@@ -140,7 +144,7 @@ def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,

@@ -165,8 +169,13 @@ def make_beam_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring

@@ -195,7 +204,10 @@ def make_beam_parser(
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)

@@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
"""
TransitionSystem = ArcEager

def __init__(
self,
vocab,
model,
name="parser",
moves=None,
*,
update_with_oracle_cut_size=100,
min_action_freq=30,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create a DependencyParser.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=min_action_freq,
learn_tokens=learn_tokens,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)

@property
def postprocesses(self):
output = [nonproj.deprojectivize]
@@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config

from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown

@@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},

@@ -48,8 +50,9 @@ def make_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.

@@ -67,13 +70,16 @@ def make_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
"""
return EntityRecognizer(
nlp.vocab,

@@ -81,9 +87,8 @@ def make_ner(
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
incorrect_spans_key=incorrect_spans_key,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,

@@ -98,7 +103,8 @@ def make_ner(
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32
"beam_width": 32,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)

@@ -106,11 +112,12 @@ def make_beam_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.

@@ -128,13 +135,13 @@ def make_beam_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring

@@ -144,6 +151,8 @@ def make_beam_ner(
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
"""
return EntityRecognizer(
nlp.vocab,

@@ -152,11 +161,10 @@ def make_beam_ner(
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key
)

@@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
"""
TransitionSystem = BiluoPushDown

def __init__(
self,
vocab,
model,
name="ner",
moves=None,
*,
update_with_oracle_cut_size=100,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create an EntityRecognizer.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=1, # not relevant for NER
learn_tokens=False, # not relevant for NER
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)

def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
@@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples
from ..errors import Errors, Warnings
from .. import util

cdef class Parser(TrainablePipe):
"""
Base class of the DependencyParser and EntityRecognizer.

@@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None
):
"""Create a Parser.

vocab (Vocab): The vocabulary object. Must be shared with documents
to be processed. The value is set to the `.vocab` attribute.
**cfg: Configuration parameters. Set to the `.cfg` attribute.
If it doesn't include a value for 'moves', a new instance is
created with `self.TransitionSystem()`. This defines how the
parse-state is created, updated and evaluated.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
name (str): The name of the pipeline component
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
multitasks: additional multi-tasking components. Experimental.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
"""
self.vocab = vocab
self.name = name

@@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe):
"learn_tokens": learn_tokens,
"beam_width": beam_width,
"beam_density": beam_density,
"beam_update_prob": beam_update_prob
"beam_update_prob": beam_update_prob,
"incorrect_spans_key": incorrect_spans_key
}
if moves is None:
# defined by EntityRecognizer as a BiluoPushDown
moves = self.TransitionSystem(self.vocab.strings)
# EntityRecognizer -> BiluoPushDown
# DependencyParser -> ArcEager
moves = self.TransitionSystem(
self.vocab.strings,
incorrect_spans_key=incorrect_spans_key
)
self.moves = moves
self.model = model
if self.moves.n_moves != 0:

@@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe):
# Available for subclasses, e.g. to deprojectivize
return []

@property
def incorrect_spans_key(self):
return self.cfg["incorrect_spans_key"]

def add_label(self, label):
resized = False
for action in self.moves.action_types:

@@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe):
)
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)

n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
if n_examples == 0:
return losses

@@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe):
self._resize()
self.model.from_bytes(bytes_data)
except AttributeError:
raise ValueError(Errors.E149) from None
raise ValueError(Errors.E149)
return self

def to_bytes(self, exclude=tuple()):
@@ -18,14 +18,9 @@ def _ner_example(ner):
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = Doc(en_vocab, words=text)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner = EntityRecognizer(en_vocab, model)
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)

@@ -40,14 +35,9 @@ def test_ents_reset(en_vocab):
"""Ensure that resetting doc.ents does not change anything"""
text = ["This", "is", "a", "lion"]
doc = Doc(en_vocab, words=text)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner = EntityRecognizer(en_vocab, model)
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]
@@ -18,14 +18,9 @@ def vocab():

@pytest.fixture
def parser(vocab):
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(vocab, model, **config)
parser = DependencyParser(vocab, model)
return parser

@@ -77,19 +72,14 @@ def test_add_label(parser):

def test_add_label_deserializes_correctly():
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner1 = EntityRecognizer(Vocab(), model, **config)
ner1 = EntityRecognizer(Vocab(), model)
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)
ner2 = EntityRecognizer(Vocab(), model)

# the second model needs to be resized before we can call from_bytes
ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)

@@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
"""
labels = ["A", "B", "C"]
model = registry.resolve({"model": model_config}, validate=True)["model"]
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
pipe = pipe_cls(Vocab(), model, **config)
pipe = pipe_cls(Vocab(), model)
for label in labels:
pipe.add_label(label)
assert len(pipe.move_names) == len(labels) * n_moves
@@ -130,14 +130,9 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(doc.vocab, model, **config)
parser = DependencyParser(doc.vocab, model)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")
parser.moves.add_action(1, "")
@@ -9,11 +9,12 @@ from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.vocab import Vocab, registry
import logging

from ..util import make_tempdir

from ...pipeline import EntityRecognizer
from ...pipeline.ner import DEFAULT_NER_MODEL

TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),

@@ -21,6 +22,11 @@ TRAIN_DATA = [
]

@pytest.fixture
def neg_key():
return "non_entities"

@pytest.fixture
def vocab():
return Vocab()

@@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]

@pytest.mark.filterwarnings("ignore::UserWarning")
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
def test_negative_samples_two_word_input(tsys, vocab, neg_key):
"""Test that we don't get stuck in a two word input when we have a negative
span. This could happen if we don't have the right check on the B action.
"""
tsys.cfg["neg_key"] = neg_key
doc = Doc(vocab, words=["A", "B"])
entity_annots = [None, None]
example = Example.from_dict(doc, {"entities": entity_annots})
ex_dict = example.to_dict()

for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]):
if tag == "L-!GPE":
ex_dict["doc_annotation"]["entities"][i] = "-"
example = Example.from_dict(doc, ex_dict)

# These mean that the oracle sequence shouldn't have O for the first
# word, and it shouldn't analyse it as B-PERSON, L-PERSON
example.y.spans[neg_key] = [
Span(example.y, 0, 1, label="O"),
Span(example.y, 0, 2, label="PERSON"),
]
act_classes = tsys.get_oracle_sequence(example)
names = [tsys.get_class_name(act) for act in act_classes]
assert names
assert names[0] != "O"
assert names[0] != "B-PERSON"
assert names[1] != "L-PERSON"

def test_get_oracle_moves_negative_entities2(tsys, vocab):
doc = Doc(vocab, words=["A", "B", "C", "D"])
entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
def test_negative_samples_three_word_input(tsys, vocab, neg_key):
"""Test that we exclude a 2-word entity correctly using a negative example."""
tsys.cfg["neg_key"] = neg_key
doc = Doc(vocab, words=["A", "B", "C"])
entity_annots = [None, None, None]
example = Example.from_dict(doc, {"entities": entity_annots})
# These mean that the oracle sequence shouldn't have O for the first
# word, and it shouldn't analyse it as B-PERSON, L-PERSON
example.y.spans[neg_key] = [
Span(example.y, 0, 1, label="O"),
Span(example.y, 0, 2, label="PERSON"),
]
act_classes = tsys.get_oracle_sequence(example)
names = [tsys.get_class_name(act) for act in act_classes]
assert names
assert names[0] != "O"
assert names[1] != "B-PERSON"

@pytest.mark.skip(reason="Maybe outdated? Unsure")
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=["A", "B", "C", "D"])
entity_annots = ["O", "!O", "O", "!O"]
def test_negative_samples_U_entity(tsys, vocab, neg_key):
"""Test that we exclude a 2-word entity correctly using a negative example."""
tsys.cfg["neg_key"] = neg_key
doc = Doc(vocab, words=["A"])
entity_annots = [None]
example = Example.from_dict(doc, {"entities": entity_annots})
# These mean that the oracle sequence shouldn't have O for the first
# word, and it shouldn't analyse it as B-PERSON, L-PERSON
example.y.spans[neg_key] = [
Span(example.y, 0, 1, label="O"),
Span(example.y, 0, 1, label="PERSON"),
]
act_classes = tsys.get_oracle_sequence(example)
names = [tsys.get_class_name(act) for act in act_classes]
assert names
assert names[0] != "O"
assert names[0] != "U-PERSON"

def test_negative_sample_key_is_in_config(vocab, entity_types):
actions = BiluoPushDown.get_actions(entity_types=entity_types)
tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities")
assert tsys.cfg["neg_key"] == "non_entities"

# We can't easily represent this on a Doc object. Not sure what the best solution

@@ -213,6 +250,27 @@ def test_train_empty():
nlp.update(batch, losses=losses)

def test_train_negative_deprecated():
"""Test that the deprecated negative entity format raises a custom error."""
train_data = [
("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
]

nlp = English()
train_examples = []
for t in train_data:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.initialize()
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
for batch in batches:
with pytest.raises(ValueError):
nlp.update(batch, losses=losses)

def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")

@@ -265,6 +323,16 @@ def test_ruler_before_ner():
assert [token.ent_type_ for token in doc] == expected_types

def test_ner_constructor(en_vocab):
config = {
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner_1 = EntityRecognizer(en_vocab, model, **config)
ner_2 = EntityRecognizer(en_vocab, model)

def test_ner_before_ruler():
""" Test that an entity_ruler works after an NER: the second can overwrite O annotations """
nlp = English()

@@ -414,7 +482,7 @@ def test_beam_ner_scores():
assert 0 - eps <= score <= 1 + eps

def test_beam_overfitting_IO():
def test_beam_overfitting_IO(neg_key):
# Simple test to try and quickly overfit the Beam NER component
nlp = English()
beam_width = 16

@@ -422,6 +490,7 @@ def test_beam_overfitting_IO():
config = {
"beam_width": beam_width,
"beam_density": beam_density,
"incorrect_spans_key": neg_key,
}
ner = nlp.add_pipe("beam_ner", config=config)
train_examples = []

@@ -438,12 +507,13 @@ def test_beam_overfitting_IO():
assert losses["beam_ner"] < 0.0001

# test the scores from the beam
test_text = "I like London."
test_text = "I like London"
docs = [nlp.make_doc(test_text)]
beams = ner.predict(docs)
entity_scores = ner.scored_ents(beams)[0]
assert entity_scores[(2, 3, "LOC")] == 1.0
assert entity_scores[(2, 3, "PERSON")] == 0.0
assert len(nlp(test_text).ents) == 1

# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:

@@ -456,6 +526,104 @@ def test_beam_overfitting_IO():
assert entity_scores2[(2, 3, "LOC")] == 1.0
assert entity_scores2[(2, 3, "PERSON")] == 0.0

# Try to unlearn the entity by using negative annotations
neg_doc = nlp.make_doc(test_text)
neg_ex = Example(neg_doc, neg_doc)
neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
neg_train_examples = [neg_ex]

for i in range(20):
losses = {}
nlp.update(neg_train_examples, sgd=optimizer, losses=losses)

# test the "untrained" model
assert len(nlp(test_text).ents) == 0

def test_neg_annotation(neg_key):
"""Check that the NER update works with a negative annotation that is a different label of the correct one,
or partly overlapping, etc"""
nlp = English()
beam_width = 16
beam_density = 0.0001
config = {
"beam_width": beam_width,
"beam_density": beam_density,
"incorrect_spans_key": neg_key,
}
ner = nlp.add_pipe("beam_ner", config=config)
train_text = "Who is Shaka Khan?"
neg_doc = nlp.make_doc(train_text)
ner.add_label("PERSON")
ner.add_label("ORG")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")]

optimizer = nlp.initialize()
for i in range(2):
losses = {}
nlp.update([example], sgd=optimizer, losses=losses)

def test_neg_annotation_conflict(neg_key):
# Check that NER raises for a negative annotation that is THE SAME as a correct one
nlp = English()
beam_width = 16
beam_density = 0.0001
config = {
"beam_width": beam_width,
"beam_density": beam_density,
"incorrect_spans_key": neg_key,
}
ner = nlp.add_pipe("beam_ner", config=config)
train_text = "Who is Shaka Khan?"
neg_doc = nlp.make_doc(train_text)
ner.add_label("PERSON")
ner.add_label("LOC")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]
assert len(example.reference.ents) == 1
assert example.reference.ents[0].text == "Shaka Khan"
assert example.reference.ents[0].label_ == "PERSON"
assert len(example.reference.spans[neg_key]) == 1
assert example.reference.spans[neg_key][0].text == "Shaka Khan"
assert example.reference.spans[neg_key][0].label_ == "PERSON"

optimizer = nlp.initialize()
for i in range(2):
losses = {}
with pytest.raises(ValueError):
nlp.update([example], sgd=optimizer, losses=losses)

def test_beam_valid_parse(neg_key):
"""Regression test for previously flakey behaviour"""
nlp = English()
beam_width = 16
beam_density = 0.0001
config = {
"beam_width": beam_width,
"beam_density": beam_density,
"incorrect_spans_key": neg_key,
}
nlp.add_pipe("beam_ner", config=config)
# fmt: off
tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.']
iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
# fmt: on

doc = Doc(nlp.vocab, words=tokens)
example = Example.from_dict(doc, {"ner": iob})
neg_span = Span(doc, 50, 53, "ORG")
example.reference.spans[neg_key] = [neg_span]

optimizer = nlp.initialize()

for i in range(5):
losses = {}
nlp.update([example], sgd=optimizer, losses=losses)
assert "beam_ner" in losses

def test_ner_warns_no_lookups(caplog):
nlp = English()
@@ -5,10 +5,11 @@ from spacy.attrs import DEP
from spacy.lang.en import English
from spacy.training import Example
from spacy.tokens import Doc
from spacy import util
from spacy import util, registry

from ..util import apply_transition_sequence, make_tempdir

from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL

TRAIN_DATA = [
(

@@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab):
assert token.head in sent

def test_parser_constructor(en_vocab):
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser_1 = DependencyParser(en_vocab, model, **config)
parser_2 = DependencyParser(en_vocab, model)

@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
def test_incomplete_data(pipe_name):
# Test that the parser works with incomplete information
@@ -23,14 +23,9 @@ def _parser_example(parser):
@pytest.fixture
def parser(vocab):
vocab.strings.add("ROOT")
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(vocab, model, **config)
parser = DependencyParser(vocab, model)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
@@ -190,14 +190,9 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(doc.vocab, model, **config)
ner = EntityRecognizer(doc.vocab, model)
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")
ner.add_label("GPE")

@@ -259,8 +259,6 @@ def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
parser = DependencyParser(Vocab(), model, **config)

@@ -274,8 +272,6 @@ def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
parser = DependencyParser(Vocab(), model, **config)
@@ -61,8 +61,6 @@ def taggers(en_vocab):
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,

@@ -70,8 +68,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = Parser(en_vocab, model, **config)
new_parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
new_parser = Parser(en_vocab, model)
new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
bytes_2 = new_parser.to_bytes(exclude=["vocab"])
bytes_3 = parser.to_bytes(exclude=["vocab"])

@@ -84,43 +82,27 @@ def test_serialize_parser_strings(Parser):
vocab1 = Vocab()
label = "FunnyLabel"
assert label not in vocab1.strings
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,
"beam_density": 0.0,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser1 = Parser(vocab1, model, **config)
parser1 = Parser(vocab1, model)
parser1.add_label(label)
assert label in parser1.vocab.strings
vocab2 = Vocab()
assert label not in vocab2.strings
parser2 = Parser(vocab2, model, **config)
parser2 = Parser(vocab2, model)
parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
assert label in parser2.vocab.strings

@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,
"beam_density": 0.0,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
with make_tempdir() as d:
file_path = d / "parser"
parser.to_disk(file_path)
parser_d = Parser(en_vocab, model, **config)
parser_d = Parser(en_vocab, model)
parser_d = parser_d.from_disk(file_path)
parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])

@@ -198,17 +180,12 @@ def test_serialize_textcat_empty(en_vocab):
def test_serialize_pipe_exclude(en_vocab, Parser):
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
}

def get_new_parser():
new_parser = Parser(en_vocab, model, **config)
new_parser = Parser(en_vocab, model)
return new_parser

parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
parser.cfg["foo"] = "bar"
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
assert "foo" in new_parser.cfg
@@ -235,9 +235,9 @@ cdef class Example:
seen.update(indices)
return output

def get_aligned_ner(self):
def get_aligned_ents_and_ner(self):
if not self.y.has_annotation("ENT_IOB"):
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
return [], [None] * len(self.x)
x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False)
# Default to 'None' for missing values
x_tags = offsets_to_biluo_tags(

@@ -253,6 +253,10 @@ cdef class Example:
x_tags[i] = "O"
elif self.x[i].is_space:
x_tags[i] = "O"
return x_ents, x_tags

def get_aligned_ner(self):
x_ents, x_tags = self.get_aligned_ents_and_ner()
return x_tags

def to_dict(self):
@@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters.

| Setting | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |

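For reference, these settings are what you would pass when adding the component to a pipeline. A minimal sketch, assuming a blank English pipeline; the values shown are the documented defaults rather than tuned choices.

```python
import spacy

nlp = spacy.blank("en")
parser = nlp.add_pipe(
    "parser",
    config={
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    },
)
```
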
@@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |

## DependencyParser.\_\_call\_\_ {#call tag="method"}

@@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters.
> "moves": None,
> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
> "incorrect_spans_key": "incorrect_spans",
> }
> nlp.add_pipe("ner", config=config)
> ```

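The new `incorrect_spans_key` setting is what ties the negative samples from this PR to the NER component: spans stored under that key in a reference doc's `doc.spans` are treated as entities the model should learn *not* to predict. A minimal sketch of how such training data could be set up, assuming a blank English pipeline; the key name, text and label are illustrative.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
# Point the NER component at the span group that stores known-incorrect spans
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})
ner.add_label("ORG")

# Build an example whose reference doc carries a negative span: the model
# should learn that "London" is NOT an ORG entity.
doc = nlp.make_doc("I like London and Berlin")
example = Example.from_dict(doc, {"entities": []})
example.reference.spans["incorrect_spans"] = [
    Span(example.reference, 2, 3, label="ORG")
]
```
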
@@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters.
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/ner.pyx

@@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

| Name | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
| Name | Description |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. Defaults to `None`. ~~Optional[str]~~ |

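Direct construction from the class, as documented above, can follow the same pattern as the serialization tests: resolve a default model from the registry and pass the new `incorrect_spans_key` as a keyword argument. A minimal sketch under the assumption that the default model config is importable as `DEFAULT_NER_MODEL`; in practice you would normally use `nlp.add_pipe("ner", config=...)` instead.

```python
import spacy
from spacy.pipeline import EntityRecognizer
from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.util import registry

nlp = spacy.blank("en")
model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
ner = EntityRecognizer(
    nlp.vocab,
    model,
    update_with_oracle_cut_size=100,
    incorrect_spans_key="incorrect_spans",
)
```
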
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

@@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
> losses = ner.update(examples, sgd=optimizer)
> ```

| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

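For context on how `update` is typically called with the arguments in this table, here is a minimal training-loop sketch assuming a blank English pipeline; the texts, labels, batch size and number of iterations are arbitrary illustrations.

```python
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

train_examples = [
    Example.from_dict(nlp.make_doc("I like London"), {"entities": [(7, 13, "GPE")]}),
    Example.from_dict(nlp.make_doc("I like Berlin"), {"entities": [(7, 13, "GPE")]}),
]
optimizer = nlp.initialize(get_examples=lambda: train_examples)

for i in range(10):
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=8):
        ner.update(batch, drop=0.2, sgd=optimizer, losses=losses)
    print(i, losses["ner"])
```
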
## EntityRecognizer.get_loss {#get_loss tag="method"}