Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)
Support negative examples in partial NER annotations (#8106)
* Support a cfg field in transition system
* Make NER 'has gold' check use right alignment for span
* Pass 'negative_samples_key' property into NER transition system
* Add field for negative samples to NER transition system
* Check neg_key in NER has_gold
* Support negative examples in NER oracle
* Test for negative examples in NER
* Fix name of config variable in NER
* Remove vestiges of old-style partial annotation
* Remove obsolete tests
* Add comment noting lack of support for negative samples in parser
* Additions to "neg examples" PR (#8201)
* add custom error and test for deprecated format
* add test for unlearning an entity
* add break also for Begin's cost
* add negative_samples_key property on Parser
* rename
* extend docs & fix some older docs issues
* add subclass constructors, clean up tests, fix docs
* add flaky test with ValueError if gold parse was not found
* remove ValueError if n_gold == 0
* fix docstring
* Hack in environment variables to try out training
* Remove hack
* Remove NER hack, and support 'negative O' samples
* Fix O oracle
* Fix transition parser
* Remove 'not O' from oracle
* Fix NER oracle
* check for spans in both gold.ents and gold.spans and raise if so, to prevent memory access violation
* use set instead of list in consistency check

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Parent: 02bac8f269
Commit: 6f5e308d17
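For orientation before the diff: the annotation workflow this commit enables looks roughly as follows. This is a minimal sketch, not part of the commit itself; the span-group key "incorrect_spans" and the sentence are illustrative.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Who is Shaka Khan?")
# The tests below likewise use the same Doc as predicted and reference.
example = Example(doc, doc)
# Negative sample: "Shaka Khan" (tokens 2-4) is known NOT to be a PERSON.
# It lives in a span group on the reference doc, replacing the removed
# '!PERSON' entity notation.
example.reference.spans["incorrect_spans"] = [
    Span(example.reference, 2, 4, label="PERSON")
]
```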
@@ -521,6 +521,13 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E868 = ("Found a conflicting gold annotation in a reference document, "
+            "with the following char-based span occurring both in the gold ents "
+            "as well as in the negative spans: {span}.")
+    E869 = ("The notation '{label}' is not supported anymore. To annotate "
+            "negative NER samples, use `doc.spans[key]` instead, and "
+            "specify the key as 'incorrect_spans_key' when constructing "
+            "the NER component.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
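E868 guards against contradictory supervision: the same (start, end, label) triple occurring both in the gold entities and in the negative span group. A sketch of data that would trigger it, continuing the example above (the key name is still illustrative):

```python
from spacy.tokens import Span

ref = example.reference
# The same span marked as both a gold PERSON and a negative PERSON:
ref.ents = [Span(ref, 2, 4, label="PERSON")]
ref.spans["incorrect_spans"] = [Span(ref, 2, 4, label="PERSON")]
# During training, create_gold_state() (see the NER oracle below) raises
# ValueError(E868) for such a document instead of risking a memory access
# violation in the C-level oracle.
```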
@@ -1,3 +1,5 @@
+import os
+import random
 from libc.stdint cimport int32_t
 from cymem.cymem cimport Pool

@@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam

 from ...tokens.doc cimport Doc
 from ...tokens.span import Span
+from ...tokens.span cimport Span
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...structs cimport TokenC
+from ...structs cimport TokenC, SpanC
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -25,7 +28,6 @@ cdef enum:
     LAST
     UNIT
     OUT
-    ISNT
     N_MOVES

@@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
 MOVE_NAMES[LAST] = 'L'
 MOVE_NAMES[UNIT] = 'U'
 MOVE_NAMES[OUT] = 'O'
-MOVE_NAMES[ISNT] = 'x'


 cdef struct GoldNERStateC:
     Transition* ner
+    SpanC* negs
     int32_t length
+    int32_t nr_neg


 cdef class BiluoGold:
     cdef Pool mem
     cdef GoldNERStateC c

-    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
+    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
         self.mem = Pool()
-        self.c = create_gold_state(self.mem, moves, stcls.c, example)
+        self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)

     def update(self, StateClass stcls):
         update_gold_state(&self.c, stcls.c)


 cdef GoldNERStateC create_gold_state(
     Pool mem,
     BiluoPushDown moves,
     const StateC* stcls,
-    Example example
+    Example example,
+    neg_key
 ) except *:
     cdef GoldNERStateC gs
+    cdef Span neg
+    if neg_key is not None:
+        negs = example.get_aligned_spans_y2x(
+            example.y.spans.get(neg_key, []),
+            allow_overlap=True
+        )
+    else:
+        negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    ner_tags = example.get_aligned_ner()
+    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
+    gs.nr_neg = len(negs)
+    ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
+
+    # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
+    neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
+    for pos_span in ner_ents:
+        if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
+            raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
+
+    # In order to handle negative samples, we need to maintain the full
+    # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
+    # thing, we'll get blocked if there's an incorrect prefix.
+    for i, neg in enumerate(negs):
+        gs.negs[i] = neg.c
     return gs
@@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
         cdef attr_t label
         if name == '-' or name == '' or name is None:
             return Transition(clas=0, move=MISSING, label=0, score=0)
-        elif name == '!O':
-            return Transition(clas=0, move=ISNT, label=0, score=0)
         elif '-' in name:
             move_str, label_str = name.split('-', 1)
-            # Hacky way to denote 'not this entity'
+            # Deprecated, hacky way to denote 'not this entity'
             if label_str.startswith('!'):
-                label_str = label_str[1:]
-                move_str = 'x'
+                raise ValueError(Errors.E869.format(label=name))
             label = self.strings.add(label_str)
         else:
             move_str = name
             label = 0
         move = MOVE_NAMES.index(move_str)
-        if move == ISNT:
-            return Transition(clas=0, move=ISNT, label=label, score=0)
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]

@@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
             label_id = label_name
         if action == OUT and label_id != 0:
             return None
-        if action == MISSING or action == ISNT:
+        if action == MISSING:
             return None
         # Check we're not creating a move we already have, so that this is
         # idempotent

@@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
         return parses

     def init_gold(self, StateClass state, Example example):
-        return BiluoGold(self, state, example)
+        return BiluoGold(self, state, example, self.neg_key)

     def has_gold(self, Example eg, start=0, end=None):
+        # We get x and y referring to X, we want to check relative to Y,
+        # the reference
+        y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
+        if not y_spans:
+            y_spans = [eg.y[:]]
+        y_span = y_spans[0]
+        start = y_span.start
+        end = y_span.end
+        neg_key = self.neg_key
+        if neg_key is not None:
+            # If we have any negative samples, count that as having annotation.
+            for span in eg.y.spans.get(neg_key, []):
+                if span.start >= start and span.end <= end:
+                    return True
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True

@@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
                 n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
-        if n_gold < 1:
-            raise ValueError


 cdef class Missing:
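Two behavioural consequences of the hunks above, sketched as assumed usage (the label set and key name are illustrative; `get_actions` and the `incorrect_spans_key` argument appear in the tests at the end of this diff):

```python
from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.tokens import Doc, Span
from spacy.training import Example
from spacy.vocab import Vocab

vocab = Vocab()
actions = BiluoPushDown.get_actions(entity_types=["PERSON"])
moves = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="incorrect_spans")

# 1) The deprecated '!' notation is rejected instead of mapping to the
#    removed ISNT move:
#        moves.lookup_transition("B-!PERSON")  # raises ValueError (E869)

# 2) has_gold() now counts negative samples as annotation, so an example
#    whose only supervision is "this span is NOT an entity" is no longer
#    skipped by the training loop:
doc = Doc(vocab, words=["Who", "is", "Shaka", "Khan", "?"])
example = Example(doc, doc)
example.y.spans["incorrect_spans"] = [Span(example.y, 2, 4, label="PERSON")]
assert moves.has_gold(example)
```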
@@ -373,23 +405,33 @@ cdef class Begin:
     @staticmethod
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        b0 = s.B(0)
+        cdef int cost = 0
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # B, Gold B --> Label match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # B, Gold I --> False (P)
             # B, Gold L --> False (P)
             # B, Gold O --> False (P)
             # B, Gold U --> False (P)
-            return 1
+            cost += 1
+        if s.buffer_length() < 3:
+            # Handle negatives. In general we can't really do much to block
+            # B, because we don't know whether the whole entity is going to
+            # be correct or not. However, we can at least tell whether we're
+            # going to be opening an entity where there's only one possible
+            # L.
+            for span in gold.negs[:gold.nr_neg]:
+                if span.label == label and span.start == b0:
+                    cost += 1
+                    break
+        return cost


 cdef class In:

@@ -462,9 +504,6 @@ cdef class In:
         elif g_act == UNIT:
             # I, Gold U --> True iff next tag == O
             return next_act != OUT
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
         else:
             return 1

@@ -504,32 +543,41 @@ cdef class Last:
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
         move = LAST
+        b0 = s.B(0)
+        ent_start = s.E(0)

-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

+        cdef int cost = 0

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # L, Gold B --> True
-            return 0
+            pass
         elif g_act == IN:
             # L, Gold I --> True iff this entity sunk
-            return not _entity_is_sunk(s, gold.ner)
+            cost += not _entity_is_sunk(s, gold.ner)
         elif g_act == LAST:
             # L, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # L, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # L, Gold U --> True
-            return 0
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
+            pass
         else:
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective,
+        # by marking actions that close an entity that we know is incorrect
+        # as costly.
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+                cost += 1
+                break
+        return cost


 cdef class Unit:

@@ -573,21 +621,29 @@ cdef class Unit:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int cost = 0

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == UNIT:
             # U, Gold U --> True iff tag match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # U, Gold B --> False
             # U, Gold I --> False
             # U, Gold L --> False
             # U, Gold O --> False
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective.
+        # This is fairly straight-forward for U- entities, as we have a single
+        # action
+        cdef int b0 = s.B(0)
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and span.start == b0 and span.end == (b0+1):
+                cost += 1
+                break
+        return cost


 cdef class Out:

@@ -613,25 +669,24 @@ cdef class Out:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
-        if g_act == ISNT and g_tag == 0:
-            return 1
-        elif g_act == MISSING or g_act == ISNT:
-            return 0
+        cdef weight_t cost = 0
+        if g_act == MISSING:
+            pass
         elif g_act == BEGIN:
             # O, Gold B --> False
-            return 1
+            cost += 1
         elif g_act == IN:
             # O, Gold I --> True
-            return 0
+            pass
         elif g_act == LAST:
             # O, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # O, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # O, Gold U --> False
-            return 1
+            cost += 1
         else:
-            return 1
+            cost += 1
+        return cost
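The oracle changes above all follow one pattern: rather than returning early, each action accumulates a cost, then adds a penalty when taking the action would realise a span that appears in the negative list. A pure-Python restatement for the U (Unit) action, illustrative only and not part of the codebase:

```python
def unit_cost_sketch(gold_move, gold_label, label, b0, negs):
    """Pure-Python analogue of Unit.cost after this commit (illustrative only).

    negs is a list of (start, end, label) tuples for known-incorrect spans.
    """
    cost = 0
    if gold_move == "MISSING":
        pass  # no supervision for this token: the action is not penalised
    elif gold_move == "UNIT":
        cost += label != gold_label  # U, Gold U --> cost iff the tag differs
    else:
        cost += 1  # U against gold B/I/L/O is always wrong
    # Negative samples: penalise predicting exactly a single-token span at b0
    # that the annotator marked as incorrect.
    for start, end, neg_label in negs:
        if neg_label == label and start == b0 and end == b0 + 1:
            cost += 1
            break
    return cost


# Proposing U-PERSON at token 0 when (0, 1, "PERSON") is a negative span
# costs 1 even though the gold move is missing:
assert unit_cost_sketch("MISSING", None, "PERSON", 0, [(0, 1, "PERSON")]) == 1
```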
@@ -41,6 +41,7 @@ cdef class TransitionSystem:
     cdef public attr_t root_label
     cdef public freqs
     cdef public object labels
+    cdef public object cfg
     cdef init_state_t init_beam_state
     cdef del_state_t del_beam_state
@@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
+    def __init__(
+        self,
+        StringStore string_table,
+        labels_by_action=None,
+        min_freq=None,
+        incorrect_spans_key=None
+    ):
+        self.cfg = {"neg_key": incorrect_spans_key}
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0

@@ -49,8 +56,13 @@ cdef class TransitionSystem:
         self.del_beam_state = _del_state

     def __reduce__(self):
+        # TODO: This loses the 'cfg'
         return (self.__class__, (self.strings, self.labels), None, None)

+    @property
+    def neg_key(self):
+        return self.cfg.get("neg_key")
+
     def init_batch(self, docs):
         cdef StateClass state
         states = []

@@ -220,16 +232,21 @@ cdef class TransitionSystem:
         transitions = []
         serializers = {
             'moves': lambda: srsly.json_dumps(self.labels),
-            'strings': lambda: self.strings.to_bytes()
+            'strings': lambda: self.strings.to_bytes(),
+            'cfg': lambda: self.cfg
         }
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, exclude=tuple()):
+        # We're adding a new field, 'cfg', here and we don't want to break
+        # previous models that don't have it.
+        msg = srsly.msgpack_loads(bytes_data)
         labels = {}
-        deserializers = {
-            'moves': lambda b: labels.update(srsly.json_loads(b)),
-            'strings': lambda b: self.strings.from_bytes(b)
-        }
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        if 'moves' not in exclude:
+            labels.update(srsly.json_loads(msg['moves']))
+        if 'strings' not in exclude:
+            self.strings.from_bytes(msg['strings'])
+        if 'cfg' not in exclude and 'cfg' in msg:
+            self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
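A sketch of the intended serialization behaviour, reusing `vocab`, `actions`, and `moves` from the earlier sketch: the new 'cfg' entry lets the negative-sample key survive `to_bytes()`/`from_bytes()`, while pickling via `__reduce__` still drops it (the TODO above).

```python
# Round-trip through bytes keeps the key:
data = moves.to_bytes()
moves2 = BiluoPushDown(vocab.strings, actions)
moves2.from_bytes(data)
assert moves2.neg_key == "incorrect_spans"
assert moves2.cfg["neg_key"] == "incorrect_spans"
```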
@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.arc_eager cimport ArcEager

@@ -59,7 +60,7 @@ def make_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int

@@ -85,13 +86,13 @@ def make_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     learn_tokens (bool): Whether to learn to merge subtokens that are split
         relative to the gold standard. Experimental.
     min_action_freq (int): The minimum frequency of labelled actions to retain.

@@ -112,6 +113,9 @@ def make_parser(
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )

 @Language.factory(

@@ -140,7 +144,7 @@ def make_beam_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int,

@@ -165,8 +169,13 @@ def make_beam_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring

@@ -195,7 +204,10 @@ def make_beam_parser(
         beam_update_prob=beam_update_prob,
         multitasks=[],
         learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq
+        min_action_freq=min_action_freq,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )

@@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
     """
     TransitionSystem = ArcEager

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="parser",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        min_action_freq=30,
+        learn_tokens=False,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create a DependencyParser.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=min_action_freq,
+            learn_tokens=learn_tokens,
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     @property
     def postprocesses(self):
         output = [nonproj.deprojectivize]
@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown

@@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "moves": None,
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},

@@ -48,8 +50,9 @@ def make_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component. The entity recognizer
     identifies non-overlapping labelled spans of tokens.

@@ -67,13 +70,16 @@ def make_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
+    incorrect_spans_key (Optional[str]): Identifies spans that are known
+        to be incorrect entity annotations. The incorrect entity annotations
+        can be stored in the span group, under this key.
     """
     return EntityRecognizer(
         nlp.vocab,

@@ -81,9 +87,8 @@ def make_ner(
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
+        incorrect_spans_key=incorrect_spans_key,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,

@@ -98,7 +103,8 @@ def make_ner(
         "model": DEFAULT_NER_MODEL,
         "beam_density": 0.01,
         "beam_update_prob": 0.5,
-        "beam_width": 32
+        "beam_width": 32,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )

@@ -106,11 +112,12 @@ def make_beam_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component that uses beam-search.
     The entity recognizer identifies non-overlapping labelled spans of tokens.

@@ -128,13 +135,13 @@ def make_beam_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring

@@ -144,6 +151,8 @@ def make_beam_ner(
     beam_update_prob (float): The chance of making a beam update, instead of a
         greedy update. Greedy updates are an approximation for the beam updates,
         and are faster to compute.
+    incorrect_spans_key (Optional[str]): Optional key into span groups of
+        entities known to be non-entities.
     """
     return EntityRecognizer(
         nlp.vocab,

@@ -152,11 +161,10 @@ def make_beam_ner(
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=beam_width,
         beam_density=beam_density,
         beam_update_prob=beam_update_prob,
+        incorrect_spans_key=incorrect_spans_key
     )


@@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
     """
     TransitionSystem = BiluoPushDown

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="ner",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create an EntityRecognizer.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=1,  # not relevant for NER
+            learn_tokens=False,  # not relevant for NER
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     def add_multitask_objective(self, mt_component):
         """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)
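At the pipeline level, both NER factories now expose the key through their config. A sketch (the key name and beam settings are illustrative; the beam test below uses a width of 16):

```python
import spacy

# Greedy NER with a negative-sample key:
nlp = spacy.blank("en")
nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})

# Beam NER accepts the same key:
nlp_beam = spacy.blank("en")
nlp_beam.add_pipe(
    "beam_ner",
    config={"beam_width": 16, "incorrect_spans_key": "incorrect_spans"},
)
```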
@@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from .. import util
+

 cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.

@@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe):
         beam_density=0.0,
         beam_update_prob=0.0,
         multitasks=tuple(),
+        incorrect_spans_key=None
     ):
         """Create a Parser.

         vocab (Vocab): The vocabulary object. Must be shared with documents
             to be processed. The value is set to the `.vocab` attribute.
-        **cfg: Configuration parameters. Set to the `.cfg` attribute.
-            If it doesn't include a value for 'moves', a new instance is
-            created with `self.TransitionSystem()`. This defines how the
-            parse-state is created, updated and evaluated.
+        model (Model): The model for the transition-based parser. The model needs
+            to have a specific substructure of named components --- see the
+            spacy.ml.tb_framework.TransitionModel for details.
+        name (str): The name of the pipeline component
+        moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+            updated and evaluated. If 'moves' is None, a new instance is
+            created with `self.TransitionSystem()`. Defaults to `None`.
+        update_with_oracle_cut_size (int): During training, cut long sequences into
+            shorter segments by creating intermediate states based on the gold-standard
+            history. The model is not very sensitive to this parameter, so you usually
+            won't need to change it. 100 is a good default.
+        min_action_freq (int): The minimum frequency of labelled actions to retain.
+            Rarer labelled actions have their label backed-off to "dep". While this
+            primarily affects the label accuracy, it can also affect the attachment
+            structure, as the labels are used to represent the pseudo-projectivity
+            transformation.
+        learn_tokens (bool): Whether to learn to merge subtokens that are split
+            relative to the gold standard. Experimental.
+        beam_width (int): The number of candidate analyses to maintain.
+        beam_density (float): The minimum ratio between the scores of the first and
+            last candidates in the beam. This allows the parser to avoid exploring
+            candidates that are too far behind. This is mostly intended to improve
+            efficiency, but it can also improve accuracy as deeper search is not
+            always better.
+        beam_update_prob (float): The chance of making a beam update, instead of a
+            greedy update. Greedy updates are an approximation for the beam updates,
+            and are faster to compute.
+        multitasks: additional multi-tasking components. Experimental.
+        incorrect_spans_key (Optional[str]): Identifies spans that are known
+            to be incorrect entity annotations. The incorrect entity annotations
+            can be stored in the span group, under this key.
         """
         self.vocab = vocab
         self.name = name

@@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe):
             "learn_tokens": learn_tokens,
             "beam_width": beam_width,
             "beam_density": beam_density,
-            "beam_update_prob": beam_update_prob
+            "beam_update_prob": beam_update_prob,
+            "incorrect_spans_key": incorrect_spans_key
         }
         if moves is None:
-            # defined by EntityRecognizer as a BiluoPushDown
-            moves = self.TransitionSystem(self.vocab.strings)
+            # EntityRecognizer -> BiluoPushDown
+            # DependencyParser -> ArcEager
+            moves = self.TransitionSystem(
+                self.vocab.strings,
+                incorrect_spans_key=incorrect_spans_key
+            )
         self.moves = moves
         self.model = model
         if self.moves.n_moves != 0:

@@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe):
         # Available for subclasses, e.g. to deprojectivize
         return []

+    @property
+    def incorrect_spans_key(self):
+        return self.cfg["incorrect_spans_key"]
+
     def add_label(self, label):
         resized = False
         for action in self.moves.action_types:

@@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses

@@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe):
                 self._resize()
                 self.model.from_bytes(bytes_data)
             except AttributeError:
-                raise ValueError(Errors.E149) from None
+                raise ValueError(Errors.E149)
         return self

     def to_bytes(self, exclude=tuple()):
@@ -18,14 +18,9 @@ def _ner_example(ner):
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = Doc(en_vocab, words=text)
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(en_vocab, model, **config)
+    ner = EntityRecognizer(en_vocab, model)
     ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)

@@ -40,14 +35,9 @@ def test_ents_reset(en_vocab):
     """Ensure that resetting doc.ents does not change anything"""
     text = ["This", "is", "a", "lion"]
     doc = Doc(en_vocab, words=text)
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(en_vocab, model, **config)
+    ner = EntityRecognizer(en_vocab, model)
     ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
@@ -18,14 +18,9 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(vocab, model, **config)
+    parser = DependencyParser(vocab, model)
     return parser


@@ -77,19 +72,14 @@ def test_add_label(parser):


 def test_add_label_deserializes_correctly():
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner1 = EntityRecognizer(Vocab(), model, **config)
+    ner1 = EntityRecognizer(Vocab(), model)
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
     ner1.initialize(lambda: [_ner_example(ner1)])
-    ner2 = EntityRecognizer(Vocab(), model, **config)
+    ner2 = EntityRecognizer(Vocab(), model)

     # the second model needs to be resized before we can call from_bytes
     ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)

@@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
     """
     labels = ["A", "B", "C"]
     model = registry.resolve({"model": model_config}, validate=True)["model"]
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
-    pipe = pipe_cls(Vocab(), model, **config)
+    pipe = pipe_cls(Vocab(), model)
     for label in labels:
         pipe.add_label(label)
     assert len(pipe.move_names) == len(labels) * n_moves
@@ -130,14 +130,9 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(doc.vocab, model, **config)
+    parser = DependencyParser(doc.vocab, model)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(1, "")
@@ -9,11 +9,12 @@ from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example
 from spacy.tokens import Doc, Span
-from spacy.vocab import Vocab
+from spacy.vocab import Vocab, registry
 import logging

 from ..util import make_tempdir
+from ...pipeline import EntityRecognizer
+from ...pipeline.ner import DEFAULT_NER_MODEL

 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),

@@ -21,6 +22,11 @@ TRAIN_DATA = [
 ]


+@pytest.fixture
+def neg_key():
+    return "non_entities"
+
+
 @pytest.fixture
 def vocab():
     return Vocab()

@@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
     assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]


-@pytest.mark.filterwarnings("ignore::UserWarning")
-def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
-    entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
+def test_negative_samples_two_word_input(tsys, vocab, neg_key):
+    """Test that we don't get stuck in a two word input when we have a negative
+    span. This could happen if we don't have the right check on the B action.
+    """
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A", "B"])
+    entity_annots = [None, None]
     example = Example.from_dict(doc, {"entities": entity_annots})
-    ex_dict = example.to_dict()
-
-    for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]):
-        if tag == "L-!GPE":
-            ex_dict["doc_annotation"]["entities"][i] = "-"
-    example = Example.from_dict(doc, ex_dict)
-
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 2, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[0] != "B-PERSON"
+    assert names[1] != "L-PERSON"


-def test_get_oracle_moves_negative_entities2(tsys, vocab):
-    doc = Doc(vocab, words=["A", "B", "C", "D"])
-    entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
+def test_negative_samples_three_word_input(tsys, vocab, neg_key):
+    """Test that we exclude a 2-word entity correctly using a negative example."""
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A", "B", "C"])
+    entity_annots = [None, None, None]
     example = Example.from_dict(doc, {"entities": entity_annots})
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 2, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[1] != "B-PERSON"


-@pytest.mark.skip(reason="Maybe outdated? Unsure")
-def test_get_oracle_moves_negative_O(tsys, vocab):
-    doc = Doc(vocab, words=["A", "B", "C", "D"])
-    entity_annots = ["O", "!O", "O", "!O"]
+def test_negative_samples_U_entity(tsys, vocab, neg_key):
+    """Test that we exclude a 2-word entity correctly using a negative example."""
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A"])
+    entity_annots = [None]
     example = Example.from_dict(doc, {"entities": entity_annots})
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 1, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[0] != "U-PERSON"
+
+
+def test_negative_sample_key_is_in_config(vocab, entity_types):
+    actions = BiluoPushDown.get_actions(entity_types=entity_types)
+    tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities")
+    assert tsys.cfg["neg_key"] == "non_entities"


 # We can't easily represent this on a Doc object. Not sure what the best solution
@@ -213,6 +250,27 @@ def test_train_empty():
             nlp.update(batch, losses=losses)


+def test_train_negative_deprecated():
+    """Test that the deprecated negative entity format raises a custom error."""
+    train_data = [
+        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
+    ]
+
+    nlp = English()
+    train_examples = []
+    for t in train_data:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    ner = nlp.add_pipe("ner", last=True)
+    ner.add_label("PERSON")
+    nlp.initialize()
+    for itn in range(2):
+        losses = {}
+        batches = util.minibatch(train_examples, size=8)
+        for batch in batches:
+            with pytest.raises(ValueError):
+                nlp.update(batch, losses=losses)
+
+
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
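As the new test pins down, the old inline notation — prefixing a label with `!` in the `"entities"` data — now raises a `ValueError` during `nlp.update`. A hedged migration sketch (assuming an `nlp` whose NER component was created with `incorrect_spans_key="incorrect_spans"`):

```python
from spacy.tokens import Span
from spacy.training import Example

doc = nlp.make_doc("Who is Shaka Khan?")
# Before (no longer supported, raises ValueError on update):
#   example = Example.from_dict(doc, {"entities": [(7, 17, "!PERSON")]})

# After: leave the token-level annotation missing (None) and record the
# negative annotation in the span group named by incorrect_spans_key.
example = Example.from_dict(doc, {"entities": [None] * len(doc)})
example.reference.spans["incorrect_spans"] = [Span(example.reference, 2, 4, "PERSON")]
```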
@@ -265,6 +323,16 @@ def test_ruler_before_ner():
     assert [token.ent_type_ for token in doc] == expected_types


+def test_ner_constructor(en_vocab):
+    config = {
+        "update_with_oracle_cut_size": 100,
+    }
+    cfg = {"model": DEFAULT_NER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    ner_1 = EntityRecognizer(en_vocab, model, **config)
+    ner_2 = EntityRecognizer(en_vocab, model)
+
+
 def test_ner_before_ruler():
     """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
     nlp = English()
@@ -414,7 +482,7 @@ def test_beam_ner_scores():
     assert 0 - eps <= score <= 1 + eps


-def test_beam_overfitting_IO():
+def test_beam_overfitting_IO(neg_key):
     # Simple test to try and quickly overfit the Beam NER component
     nlp = English()
     beam_width = 16
@@ -422,6 +490,7 @@ def test_beam_overfitting_IO():
     config = {
         "beam_width": beam_width,
         "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
     }
     ner = nlp.add_pipe("beam_ner", config=config)
     train_examples = []
@@ -438,12 +507,13 @@ def test_beam_overfitting_IO():
     assert losses["beam_ner"] < 0.0001

     # test the scores from the beam
-    test_text = "I like London."
+    test_text = "I like London"
     docs = [nlp.make_doc(test_text)]
     beams = ner.predict(docs)
     entity_scores = ner.scored_ents(beams)[0]
     assert entity_scores[(2, 3, "LOC")] == 1.0
     assert entity_scores[(2, 3, "PERSON")] == 0.0
+    assert len(nlp(test_text).ents) == 1

     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
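For context on the assertions in this hunk: `predict` returns one beam per doc, and `scored_ents` converts those beams into one score dict per doc, keyed by `(start, end, label)` token offsets. A small sketch of inspecting the scores outside a test, assuming a trained pipeline `nlp` containing a `"beam_ner"` component:

```python
ner = nlp.get_pipe("beam_ner")
docs = [nlp.make_doc("I like London")]
beams = ner.predict(docs)
# One score dict per input doc; keys are (start, end, label) tuples
for (start, end, label), score in ner.scored_ents(beams)[0].items():
    print(start, end, label, round(score, 3))
```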
@@ -456,6 +526,104 @@ def test_beam_overfitting_IO():
     assert entity_scores2[(2, 3, "LOC")] == 1.0
     assert entity_scores2[(2, 3, "PERSON")] == 0.0

+    # Try to unlearn the entity by using negative annotations
+    neg_doc = nlp.make_doc(test_text)
+    neg_ex = Example(neg_doc, neg_doc)
+    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
+    neg_train_examples = [neg_ex]
+
+    for i in range(20):
+        losses = {}
+        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)
+
+    # test the "untrained" model
+    assert len(nlp(test_text).ents) == 0
+
+
+def test_neg_annotation(neg_key):
+    """Check that the NER update works with a negative annotation that is a different label of the correct one,
+    or partly overlapping, etc"""
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    ner = nlp.add_pipe("beam_ner", config=config)
+    train_text = "Who is Shaka Khan?"
+    neg_doc = nlp.make_doc(train_text)
+    ner.add_label("PERSON")
+    ner.add_label("ORG")
+    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
+    example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")]
+
+    optimizer = nlp.initialize()
+    for i in range(2):
+        losses = {}
+        nlp.update([example], sgd=optimizer, losses=losses)
+
+
+def test_neg_annotation_conflict(neg_key):
+    # Check that NER raises for a negative annotation that is THE SAME as a correct one
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    ner = nlp.add_pipe("beam_ner", config=config)
+    train_text = "Who is Shaka Khan?"
+    neg_doc = nlp.make_doc(train_text)
+    ner.add_label("PERSON")
+    ner.add_label("LOC")
+    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
+    example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]
+    assert len(example.reference.ents) == 1
+    assert example.reference.ents[0].text == "Shaka Khan"
+    assert example.reference.ents[0].label_ == "PERSON"
+    assert len(example.reference.spans[neg_key]) == 1
+    assert example.reference.spans[neg_key][0].text == "Shaka Khan"
+    assert example.reference.spans[neg_key][0].label_ == "PERSON"
+
+    optimizer = nlp.initialize()
+    for i in range(2):
+        losses = {}
+        with pytest.raises(ValueError):
+            nlp.update([example], sgd=optimizer, losses=losses)
+
+
+def test_beam_valid_parse(neg_key):
+    """Regression test for previously flakey behaviour"""
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    nlp.add_pipe("beam_ner", config=config)
+    # fmt: off
+    tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.']
+    iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    # fmt: on
+
+    doc = Doc(nlp.vocab, words=tokens)
+    example = Example.from_dict(doc, {"ner": iob})
+    neg_span = Span(doc, 50, 53, "ORG")
+    example.reference.spans[neg_key] = [neg_span]
+
+    optimizer = nlp.initialize()
+
+    for i in range(5):
+        losses = {}
+        nlp.update([example], sgd=optimizer, losses=losses)
+    assert "beam_ner" in losses


 def test_ner_warns_no_lookups(caplog):
     nlp = English()
@@ -5,10 +5,11 @@ from spacy.attrs import DEP
 from spacy.lang.en import English
 from spacy.training import Example
 from spacy.tokens import Doc
-from spacy import util
+from spacy import util, registry

 from ..util import apply_transition_sequence, make_tempdir
+from ...pipeline import DependencyParser
+from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL

 TRAIN_DATA = [
     (
@@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab):
             assert token.head in sent


+def test_parser_constructor(en_vocab):
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "update_with_oracle_cut_size": 100,
+    }
+    cfg = {"model": DEFAULT_PARSER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    parser_1 = DependencyParser(en_vocab, model, **config)
+    parser_2 = DependencyParser(en_vocab, model)
+
+
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
@@ -23,14 +23,9 @@ def _parser_example(parser):
 @pytest.fixture
 def parser(vocab):
     vocab.strings.add("ROOT")
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(vocab, model, **config)
+    parser = DependencyParser(vocab, model)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
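This deletion (and the analogous ones below) works because the subclass constructors added in this PR give `learn_tokens`, `min_action_freq` and `update_with_oracle_cut_size` defaults, so `**config` no longer needs to be threaded through every fixture. A minimal sketch of the equivalence, mirroring `test_parser_constructor` above (a sketch under those assumptions, not a verbatim excerpt):

```python
from spacy import registry
from spacy.pipeline import DependencyParser
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.vocab import Vocab

model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
# Spelling the settings out...
parser_explicit = DependencyParser(
    Vocab(), model, learn_tokens=False, min_action_freq=30, update_with_oracle_cut_size=100
)
# ...now matches the defaults, so the short form is equivalent:
parser_default = DependencyParser(Vocab(), model)
```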
@@ -190,14 +190,9 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(doc.vocab, model, **config)
+    ner = EntityRecognizer(doc.vocab, model)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
     ner.add_label("GPE")
@@ -259,8 +259,6 @@ def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
     config = {
         "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
     }
     model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
@@ -274,8 +272,6 @@ def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
     config = {
         "learn_tokens": True,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
     }
     model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
@@ -61,8 +61,6 @@ def taggers(en_vocab):
 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
         "update_with_oracle_cut_size": 100,
         "beam_width": 1,
         "beam_update_prob": 1.0,
@@ -70,8 +68,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = Parser(en_vocab, model, **config)
-    new_parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
+    new_parser = Parser(en_vocab, model)
     new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
     bytes_2 = new_parser.to_bytes(exclude=["vocab"])
     bytes_3 = parser.to_bytes(exclude=["vocab"])
@@ -84,43 +82,27 @@ def test_serialize_parser_strings(Parser):
     vocab1 = Vocab()
     label = "FunnyLabel"
     assert label not in vocab1.strings
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-        "beam_width": 1,
-        "beam_update_prob": 1.0,
-        "beam_density": 0.0,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser1 = Parser(vocab1, model, **config)
+    parser1 = Parser(vocab1, model)
     parser1.add_label(label)
     assert label in parser1.vocab.strings
     vocab2 = Vocab()
     assert label not in vocab2.strings
-    parser2 = Parser(vocab2, model, **config)
+    parser2 = Parser(vocab2, model)
     parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
     assert label in parser2.vocab.strings


 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-        "beam_width": 1,
-        "beam_update_prob": 1.0,
-        "beam_density": 0.0,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
     with make_tempdir() as d:
         file_path = d / "parser"
         parser.to_disk(file_path)
-        parser_d = Parser(en_vocab, model, **config)
+        parser_d = Parser(en_vocab, model)
         parser_d = parser_d.from_disk(file_path)
         parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
         parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
@@ -198,17 +180,12 @@ def test_serialize_textcat_empty(en_vocab):
 def test_serialize_pipe_exclude(en_vocab, Parser):
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-    }

     def get_new_parser():
-        new_parser = Parser(en_vocab, model, **config)
+        new_parser = Parser(en_vocab, model)
         return new_parser

-    parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
     parser.cfg["foo"] = "bar"
     new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
     assert "foo" in new_parser.cfg
@@ -235,9 +235,9 @@ cdef class Example:
                 seen.update(indices)
         return output

-    def get_aligned_ner(self):
+    def get_aligned_ents_and_ner(self):
         if not self.y.has_annotation("ENT_IOB"):
-            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
+            return [], [None] * len(self.x)
         x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False)
         # Default to 'None' for missing values
         x_tags = offsets_to_biluo_tags(
@@ -253,6 +253,10 @@ cdef class Example:
                 x_tags[i] = "O"
             elif self.x[i].is_space:
                 x_tags[i] = "O"
+        return x_ents, x_tags
+
+    def get_aligned_ner(self):
+        x_ents, x_tags = self.get_aligned_ents_and_ner()
         return x_tags

     def to_dict(self):
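The refactor above splits the alignment logic in two: `get_aligned_ents_and_ner` returns the aligned entity `Span` objects together with the BILUO tags, and `get_aligned_ner` becomes a thin wrapper that keeps its old tags-only contract. A sketch, assuming an existing `example`:

```python
# New method: aligned entity spans plus the BILUO tags derived from them
ents, tags = example.get_aligned_ents_and_ner()
# Old entry point still returns just the tags
assert example.get_aligned_ner() == tags
```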
@@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters.

 | Setting                       | Description |
 | ----------------------------- | ----------- |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
 | `min_action_freq`             | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |
@@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and
 | `name`                        | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | `moves`                       | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
 | _keyword-only_                | |
-| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
+| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
-| `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
+| `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
 | `min_action_freq`             | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |

 ## DependencyParser.\_\_call\_\_ {#call tag="method"}
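For reference, these settings are what you would pass through `config` when adding the component; a minimal sketch using the documented values:

```python
import spacy

nlp = spacy.blank("en")
config = {
    "update_with_oracle_cut_size": 100,
    "learn_tokens": False,
    "min_action_freq": 30,
}
nlp.add_pipe("parser", config=config)
```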
@@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters.
 >     "moves": None,
 >     "update_with_oracle_cut_size": 100,
 >     "model": DEFAULT_NER_MODEL,
+>     "incorrect_spans_key": "incorrect_spans",
 > }
 > nlp.add_pipe("ner", config=config)
 > ```
@@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters.
 | `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `incorrect_spans_key`         | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/ner.pyx
@@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name | Description |
 | ----------------------------- | ----------- |
 | `vocab`                       | The shared vocabulary. ~~Vocab~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`                        | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
+| `moves`                       | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ |
 | _keyword-only_                | |
-| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
+| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
+| `incorrect_spans_key`         | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. Defaults to `None`. ~~Optional[str]~~ |

 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
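With the new `incorrect_spans_key` parameter documented above, direct construction looks roughly like this (a sketch; a fresh `Vocab` and the key name `"incorrect_spans"` are arbitrary example choices):

```python
from spacy import registry
from spacy.pipeline import EntityRecognizer
from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.vocab import Vocab

model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
ner = EntityRecognizer(Vocab(), model, incorrect_spans_key="incorrect_spans")
```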
@@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
 > losses = ner.update(examples, sgd=optimizer)
 > ```

 | Name | Description |
 | -------------- | ----------- |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop`         | The dropout rate. ~~float~~ |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |

 ## EntityRecognizer.get_loss {#get_loss tag="method"}