Mirror of https://github.com/explosion/spaCy.git
Adapt parser and NER for transformers (#5449)
* Draft layer for BILUO actions
* Fixes to biluo layer
* WIP on BILUO layer
* Add tests for BILUO layer
* Format
* Fix transitions
* Update test
* Link in the simple_ner
* Update BILUO tagger
* Update __init__
* Import simple_ner
* Update test
* Import
* Add files
* Add config
* Fix label passing for BILUO and tagger
* Fix label handling for simple_ner component
* Update simple NER test
* Update config
* Hack train script
* Update BILUO layer
* Fix SimpleNER component
* Update train_from_config
* Add biluo_to_iob helper
* Add IOB layer
* Add IOBTagger model
* Update biluo layer
* Update SimpleNER tagger
* Update BILUO
* Read random seed in train-from-config
* Update use of normal_init
* Fix normalization of gradient in SimpleNER
* Update IOBTagger
* Remove print
* Tweak masking in BILUO
* Add dropout in SimpleNER
* Update thinc
* Tidy up simple_ner
* Fix biluo model
* Unhack train-from-config
* Update setup.cfg and requirements
* Add tb_framework.py for parser model
* Try to avoid memory leak in BILUO
* Move ParserModel into spacy.ml, avoid need for subclass.
* Use updated parser model
* Remove incorrect call to model.initialize in PrecomputableAffine
* Update parser model
* Avoid divide by zero in tagger
* Add extra dropout layer in tagger
* Refine minibatch_by_words function to avoid oom
* Fix parser model after refactor
* Try to avoid div-by-zero in SimpleNER
* Fix infinite loop in minibatch_by_words
* Use SequenceCategoricalCrossentropy in Tagger
* Fix parser model when hidden layer
* Remove extra dropout from tagger
* Add extra nan check in tagger
* Fix thinc version
* Update tests and imports
* Fix test
* Update test
* Update tests
* Fix tests
* Fix test

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
Parent: 3100c97e69
Commit: 333b1a308b
@@ -4,12 +4,18 @@ limit = 0
 dropout = 0.2
 patience = 10000
 eval_frequency = 200
-scores = ["ents_f"]
+scores = ["ents_p", "ents_r", "ents_f"]
 score_weights = {"ents_f": 1}
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
-batch_size = 25
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 3000
+stop = 3000
+compound = 1.001
 
 [optimizer]
 @optimizers = "Adam.v1"
@@ -21,45 +27,18 @@ beta2 = 0.999
 lang = "en"
 vectors = null
-
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[nlp.pipeline.tok2vec.model.extract]
-@architectures = "spacy.Doc2Feats.v1"
-columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
-
-[nlp.pipeline.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-columns = ${nlp.pipeline.tok2vec.model.extract:columns}
-width = 96
-rows = 2000
-use_subwords = true
-pretrained_vectors = null
-
-[nlp.pipeline.tok2vec.model.embed.mix]
-@architectures = "spacy.LayerNormalizedMaxout.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
-maxout_pieces = 3
-
-[nlp.pipeline.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
-window_size = 1
-maxout_pieces = 3
-depth = 2
 
 [nlp.pipeline.ner]
-factory = "ner"
+factory = "simple_ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
+@architectures = "spacy.BiluoTagger.v1"
 
 [nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
+@architectures = "spacy.HashEmbedCNN.v1"
+width = 128
+depth = 4
+embed_size = 7000
+maxout_pieces = 3
+window_size = 1
+subword_features = true
+pretrained_vectors = null
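Note: a minimal sketch (not thinc's actual implementation) of how a schedule like "@schedules = compounding.v1" behaves: each draw multiplies the value by compound, capped at stop. With start = stop = 3000 as configured above, the effective batch size is a constant 3000 words.

def compounding(start, stop, compound):
    value = start
    while True:
        yield min(value, stop)
        value *= compound

sizes = compounding(4.0, 32.0, 1.001)
print([round(next(sizes), 3) for _ in range(3)])  # [4.0, 4.004, 4.008]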
@@ -42,26 +42,28 @@ def main(model=None, output_dir=None, n_iter=100):
 
     # create the built-in pipeline components and add them to the pipeline
     # nlp.create_pipe works for built-ins that are registered with spaCy
-    if "ner" not in nlp.pipe_names:
-        ner = nlp.create_pipe("ner")
+    if "simple_ner" not in nlp.pipe_names:
+        ner = nlp.create_pipe("simple_ner")
         nlp.add_pipe(ner, last=True)
     # otherwise, get it so we can add labels
     else:
-        ner = nlp.get_pipe("ner")
+        ner = nlp.get_pipe("simple_ner")
 
     # add labels
     for _, annotations in TRAIN_DATA:
         for ent in annotations.get("entities"):
+            print("Add label", ent[2])
             ner.add_label(ent[2])
 
     # get names of other pipes to disable them during training
-    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
+    pipe_exceptions = ["simple_ner"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
     with nlp.disable_pipes(*other_pipes):  # only train NER
         # reset and initialize the weights randomly – but only if we're
         # training a new model
         if model is None:
            nlp.begin_training()
+        print("Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names())))
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
@@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=100):
             for batch in batches:
                 nlp.update(
                     batch,
-                    drop=0.5,  # dropout - make it harder to memorise data
+                    drop=0.0,  # dropout - make it harder to memorise data
                     losses=losses,
                 )
             print("Losses", losses)
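Note: given get_tag_names() as defined later in this commit (BILUO ordering: the B-, I-, L-, U- blocks per label, then O), the new "Transitions" print for a single PER label would show:

tag_names = ["B-PER", "I-PER", "L-PER", "U-PER", "O"]
print(list(enumerate(tag_names)))
# [(0, 'B-PER'), (1, 'I-PER'), (2, 'L-PER'), (3, 'U-PER'), (4, 'O')]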
@@ -8,6 +8,7 @@ from wasabi import msg
 import thinc
 import thinc.schedules
 from thinc.api import Model
+import random
 
 from ..gold import GoldCorpus
 from .. import util
@@ -119,6 +120,7 @@ class ConfigSchema(BaseModel):
     output_path=("Output directory to store model in", "option", "o", Path),
     meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
     raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
+    use_gpu=("Use GPU", "option", "g", int),
     # fmt: on
 )
 def train_from_config_cli(
@@ -130,6 +132,7 @@ def train_from_config_cli(
     raw_text=None,
     debug=False,
     verbose=False,
+    use_gpu=-1
 ):
     """
     Train or update a spaCy model. Requires data to be formatted in spaCy's
@@ -147,6 +150,12 @@ def train_from_config_cli(
     if output_path is not None and not output_path.exists():
         output_path.mkdir()
 
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        util.use_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+
     train_from_config(
         config_path,
         {"train": train_path, "dev": dev_path},
@@ -161,13 +170,8 @@ def train_from_config(
 ):
     msg.info(f"Loading config from: {config_path}")
     config = util.load_config(config_path, create_objects=False)
+    util.fix_random_seed(config["training"]["seed"])
     nlp_config = config["nlp"]
-    use_gpu = config["training"]["use_gpu"]
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        util.use_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
     config = util.load_config(config_path, create_objects=True)
     msg.info("Creating nlp from config")
     nlp = util.load_model_from_config(nlp_config)
@@ -177,7 +181,7 @@ def train_from_config(
     msg.info("Loading training corpus")
     corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
     msg.info("Initializing the nlp pipeline")
-    nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
+    nlp.begin_training(lambda: corpus.train_examples)
 
     train_batches = create_train_batches(nlp, corpus, training)
     evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
@@ -192,6 +196,7 @@ def train_from_config(
         training["dropout"],
         training["patience"],
         training["eval_frequency"],
+        training["accumulate_gradient"]
     )
 
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
@@ -220,43 +225,50 @@ def train_from_config(
 
 def create_train_batches(nlp, corpus, cfg):
     while True:
-        train_examples = corpus.train_dataset(
+        train_examples = list(corpus.train_dataset(
             nlp,
             noise_level=0.0,
             orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],
             ignore_misaligned=True,
-        )
-        for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
+        ))
+        random.shuffle(train_examples)
+        batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
+        for batch in batches:
             yield batch
 
 
 def create_evaluation_callback(nlp, optimizer, corpus, cfg):
     def evaluate():
-        with nlp.use_params(optimizer.averages):
-            dev_examples = list(
-                corpus.dev_dataset(
-                    nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
-                )
+        dev_examples = list(
+            corpus.dev_dataset(
+                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
             )
-            n_words = sum(len(ex.doc) for ex in dev_examples)
-            start_time = timer()
-            scorer = nlp.evaluate(dev_examples)
-            end_time = timer()
-            wps = n_words / (end_time - start_time)
-            scores = scorer.scores
-            # Calculate a weighted sum based on score_weights for the main score
-            weights = cfg["score_weights"]
-            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
-            scores["speed"] = wps
+        )
+        n_words = sum(len(ex.doc) for ex in dev_examples)
+        start_time = timer()
+        if optimizer.averages:
+            with nlp.use_params(optimizer.averages):
+                scorer = nlp.evaluate(dev_examples, batch_size=32)
+        else:
+            scorer = nlp.evaluate(dev_examples, batch_size=32)
+        end_time = timer()
+        wps = n_words / (end_time - start_time)
+        scores = scorer.scores
+        # Calculate a weighted sum based on score_weights for the main score
+        weights = cfg["score_weights"]
+        weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
+        scores["speed"] = wps
         return weighted_score, scores
 
     return evaluate
 
 
 def train_while_improving(
-    nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency
+    nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency,
+    accumulate_gradient
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
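Note: the weighted main score is a plain dot product of the scores dict with score_weights; with toy numbers matching the config above:

scores = {"ents_p": 0.81, "ents_r": 0.76, "ents_f": 0.784, "speed": 12000}
weights = {"ents_f": 1}
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
print(weighted_score)  # 0.784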
@@ -303,7 +315,7 @@ def train_while_improving(
     losses = {}
     for step, batch in enumerate(train_data):
         dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch):
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
         for name, proc in nlp.pipeline:
             if hasattr(proc, "model"):
@@ -332,8 +344,19 @@ def train_while_improving(
             break
 
 
-def subdivide_batch(batch):
-    return [batch]
+def subdivide_batch(batch, accumulate_gradient):
+    batch = list(batch)
+    batch.sort(key=lambda eg: len(eg.doc))
+    sub_len = len(batch) // accumulate_gradient
+    start = 0
+    for i in range(accumulate_gradient):
+        subbatch = batch[start : start + sub_len]
+        if subbatch:
+            yield subbatch
+        start += len(subbatch)
+    subbatch = batch[start : ]
+    if subbatch:
+        yield subbatch
 
 
 def setup_printer(training, nlp):
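Note: a standalone copy of the new subdivide_batch for experimentation (strings stand in for Example objects; len() plays the role of len(eg.doc)). Because nlp.update is called with sgd=False and finish_update runs once per full batch, gradients from all sub-batches accumulate before the optimizer step.

def subdivide_batch(batch, accumulate_gradient):
    batch = sorted(batch, key=len)
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for _ in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    if batch[start:]:
        yield batch[start:]

print(list(subdivide_batch(["aaa", "b", "cc", "dddd", "e"], 2)))
# [['b', 'e'], ['cc', 'aaa'], ['dddd']]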
@@ -608,6 +608,14 @@ def iob_to_biluo(tags):
     return out
 
 
+def biluo_to_iob(tags):
+    out = []
+    for tag in tags:
+        tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
+        out.append(tag)
+    return out
+
+
 def _consume_os(tags):
     while tags and tags[0] == "O":
         yield tags.pop(0)
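Note: a quick check of the new helper's behaviour (tags chosen for illustration). BILUO's U- (unit) becomes B- and L- (last) becomes I-, so the IOB scheme needs only two tags per label plus O:

tags = ["O", "B-PER", "L-PER", "U-ORG", "O"]
iob = [t.replace("U-", "B-", 1).replace("L-", "I-", 1) for t in tags]
print(iob)  # ['O', 'B-PER', 'I-PER', 'B-ORG', 'O']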
@@ -195,6 +195,7 @@ class Language(object):
             default_senter_config,
             default_tensorizer_config,
             default_tok2vec_config,
+            default_simple_ner_config
         )
 
         self.defaults = {
@@ -205,6 +206,7 @@ class Language(object):
             "entity_linker": default_nel_config(),
             "morphologizer": default_morphologizer_config(),
             "senter": default_senter_config(),
+            "simple_ner": default_simple_ner_config(),
             "tensorizer": default_tensorizer_config(),
             "tok2vec": default_tok2vec_config(),
         }
spacy/ml/_biluo.py (new file, 109 lines)

"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
import numpy
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.api import to_numpy
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d

from ..tokens import Doc


def BILUO() -> Model[Padded, Padded]:
    return Model(
        "biluo",
        forward,
        init=init,
        dims={"nO": None},
        attrs={"get_num_actions": get_num_actions}
    )


def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
    if X is not None and Y is not None:
        if X.data.shape != Y.data.shape:
            # TODO: Fix error
            raise ValueError("Mismatched shapes (TODO: Fix message)")
        model.set_dim("nO", X.data.shape[2])
    elif X is not None:
        model.set_dim("nO", X.data.shape[2])
    elif Y is not None:
        model.set_dim("nO", Y.data.shape[2])
    elif model.get_dim("nO") is None:
        raise ValueError("Dimension unset for BILUO: nO")


def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    n_labels = (model.get_dim("nO") - 1) // 4
    n_tokens, n_docs, n_actions = Xp.data.shape
    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
    # to indicate which actions are valid next for each sequence. To construct
    # the mask, we have a state of shape (2, n_actions) and a validity table of
    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
    # whether it's the last token, the second dimension indicates the previous
    # action, plus a special 'null action' for the first entry.
    valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    max_value = Xp.data.max()
    for t in range(Xp.data.shape[0]):
        is_last = (Xp.lengths < (t+2)).astype("i")
        masks[t] = valid_transitions[is_last, prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t]:] = 0
        # Valid actions get 0*10e8, invalid get large negative value
        Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_biluo(dY: Padded) -> Padded:
        dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo


def get_num_actions(n_labels: int) -> int:
    # One BEGIN action per label
    # One IN action per label
    # One LAST action per label
    # One UNIT action per label
    # One OUT action
    return n_labels + n_labels + n_labels + n_labels + 1


def _get_transition_table(
    n_labels: int, *, _cache: Dict[int, Floats3d] = {}
) -> Floats3d:
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return _cache[n_actions]
    table = numpy.zeros((2, n_actions, n_actions), dtype="f")
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    L_start, L_end = (I_end, I_end + n_labels)
    U_start, U_end = (L_end, L_end + n_labels)
    # Using ranges allows us to set specific cells, which is necessary to express
    # that only actions of the same label are valid continuations.
    B_range = numpy.arange(B_start, B_end)
    I_range = numpy.arange(I_start, I_end)
    L_range = numpy.arange(L_start, L_end)
    O_action = U_end
    # If this is the last token and the previous action was B or I, only L
    # of that label is valid
    table[1, B_range, L_range] = 1
    table[1, I_range, L_range] = 1
    # If this isn't the last token and the previous action was B or I, only I or
    # L of that label are valid.
    table[0, B_range, I_range] = 1
    table[0, B_range, L_range] = 1
    table[0, I_range, I_range] = 1
    table[0, I_range, L_range] = 1
    # If this isn't the last token and the previous was L, U or O, B is valid
    table[0, L_start:, :B_end] = 1
    # Regardless of whether this is the last token, if the previous action was
    # {L, U, O}, U and O are valid.
    table[:, L_start:, U_start:] = 1
    _cache[n_actions] = table
    return table
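Note: a standalone numpy illustration of the validity table above for a single label. Action indices: B=0, I=1, L=2, U=3, O=4; table[is_last, prev_action] gives the mask of actions allowed next.

import numpy

n_labels = 1
n_actions = 4 * n_labels + 1  # B, I, L, U per label, plus O
table = numpy.zeros((2, n_actions, n_actions), dtype="f")
B, I, L, U, O = range(5)
# Mid-sequence: B or I must continue with I or L of the same label
table[0, [B, I], I] = 1
table[0, [B, I], L] = 1
# Last token: an open entity (B or I) can only close with L
table[1, [B, I], L] = 1
# After L, U or O: B is valid mid-sequence; U and O are always valid
table[0, L:, B] = 1
table[:, L:, U:] = 1
print(table[0, B])  # mask after B, not last token -> [0. 1. 1. 0. 0.]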
spacy/ml/_iob.py (new file, 92 lines)

"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d

from ..tokens import Doc


def IOB() -> Model[Padded, Padded]:
    return Model(
        "biluo",
        forward,
        init=init,
        dims={"nO": None},
        attrs={"get_num_actions": get_num_actions}
    )


def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
    if X is not None and Y is not None:
        if X.data.shape != Y.data.shape:
            # TODO: Fix error
            raise ValueError("Mismatched shapes (TODO: Fix message)")
        model.set_dim("nO", X.data.shape[2])
    elif X is not None:
        model.set_dim("nO", X.data.shape[2])
    elif Y is not None:
        model.set_dim("nO", Y.data.shape[2])
    elif model.get_dim("nO") is None:
        raise ValueError("Dimension unset for BILUO: nO")


def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    n_labels = (model.get_dim("nO") - 1) // 2
    n_tokens, n_docs, n_actions = Xp.data.shape
    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
    # to indicate which actions are valid next for each sequence. To construct
    # the mask, we have a state of shape (2, n_actions) and a validity table of
    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
    # whether it's the last token, the second dimension indicates the previous
    # action, plus a special 'null action' for the first entry.
    valid_transitions = _get_transition_table(model.ops, n_labels)
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    for t in range(Xp.data.shape[0]):
        masks[t] = valid_transitions[prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t]:] = 0
        # Valid actions get 0*10e8, invalid get -1*10e8
        Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_biluo(dY: Padded) -> Padded:
        # Masking the gradient seems to do poorly here. But why?
        #dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo


def get_num_actions(n_labels: int) -> int:
    # One BEGIN action per label
    # One IN action per label
    # One OUT action
    return n_labels * 2 + 1


def _get_transition_table(
    ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
) -> Floats3d:
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return ops.asarray(_cache[n_actions])
    table = ops.alloc2f(n_actions, n_actions)
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    O_action = I_end
    B_range = ops.xp.arange(B_start, B_end)
    I_range = ops.xp.arange(I_start, I_end)
    # B and O are always valid
    table[:, B_start : B_end] = 1
    table[:, O_action] = 1
    # I can only follow a matching B
    table[B_range, I_range] = 1

    _cache[n_actions] = table
    return table
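Note: for comparison, a standard IOB validity table is 2-D — validity depends only on the previous action, never on whether the token is last. The usual scheme also allows I-X to follow I-X of the same label. Indices here: B-PER=0, B-ORG=1, I-PER=2, I-ORG=3, O=4.

import numpy

n_labels, n_actions = 2, 5
table = numpy.zeros((n_actions, n_actions), dtype="f")
table[:, :n_labels] = 1                      # B-* is always valid
table[:, 4] = 1                              # O is always valid
prev = numpy.arange(0, n_labels)             # after B-X ...
table[prev, prev + n_labels] = 1             # ... I-X is valid
table[prev + n_labels, prev + n_labels] = 1  # and after I-X, I-X continues
print(table[4])  # after O: [1. 1. 0. 0. 1.]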
@@ -9,7 +9,6 @@ def PrecomputableAffine(nO, nI, nF, nP):
         dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
         params={"W": None, "b": None, "pad": None},
     )
-    model.initialize()
     return model
 
 
@@ -110,8 +109,7 @@ def init(model, X=None, Y=None):
     pad = model.ops.alloc4f(1, nF, nO, nP)
 
     ops = model.ops
-    scale = float(ops.xp.sqrt(1.0 / (nF * nI)))
-    W = normal_init(ops, W.shape, mean=scale)
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
     model.set_param("W", W)
     model.set_param("b", b)
     model.set_param("pad", pad)
@@ -1,5 +1,6 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
+from .simple_ner import *
 from .tagger import *  # noqa
 from .tensorizer import *  # noqa
 from .textcat import *  # noqa
@@ -91,3 +91,13 @@ def default_tok2vec_config():
 def default_tok2vec():
     loc = Path(__file__).parent / "tok2vec_defaults.cfg"
     return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_simple_ner_config():
+    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
+    return util.load_config(loc, create_objects=False)
+
+
+def default_simple_ner():
+    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
+    return util.load_config(loc, create_objects=True)["model"]
spacy/ml/models/defaults/simple_ner_defaults.cfg (new file, 12 lines)

[model]
@architectures = "spacy.BiluoTagger.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
@@ -1,9 +1,9 @@
 from pydantic import StrictInt
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
 
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
-from ...syntax._parser_model import ParserModel
+from ..tb_framework import TransitionModel
 
 
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
@@ -12,21 +12,27 @@ def build_tb_parser_model(
     nr_feature_tokens: StrictInt,
     hidden_width: StrictInt,
     maxout_pieces: StrictInt,
+    use_upper=True,
     nO=None,
 ):
     token_vector_width = tok2vec.get_dim("nO")
-    tok2vec = chain(tok2vec, list2array())
-    tok2vec.set_dim("nO", token_vector_width)
+    tok2vec = chain(
+        tok2vec,
+        with_array(Linear(hidden_width, token_vector_width)),
+        list2array(),
+    )
+    tok2vec.set_dim("nO", hidden_width)
 
     lower = PrecomputableAffine(
-        nO=hidden_width,
+        nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,
         nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
+        nP=maxout_pieces
     )
-    lower.set_dim("nP", maxout_pieces)
-    with use_ops("numpy"):
-        # Initialize weights at zero, as it's a classification layer.
-        upper = Linear(nO=nO, init_W=zero_init)
-    model = ParserModel(tok2vec, lower, upper)
-    return model
+    if use_upper:
+        with use_ops("numpy"):
+            # Initialize weights at zero, as it's a classification layer.
+            upper = Linear(nO=nO, init_W=zero_init)
+    else:
+        upper = None
+    return TransitionModel(tok2vec, lower, upper)
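Note: a plain-Python sketch (not the spaCy classes) of the two output paths use_upper toggles. With an upper layer, state vectors pass through an extra zero-initialized Linear to produce action scores; without one, the lower layer is sized to n_actions and its output is used as the scores directly.

def score_states(state_vecs, upper=None):
    if upper is not None:
        return upper(state_vecs)  # hidden_width -> n_actions
    return state_vecs             # lower layer already sized to n_actions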
spacy/ml/models/simple_ner.py (new file, 82 lines)

import functools
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d

from ...tokens import Doc
from .._biluo import BILUO
from .._iob import IOB
from ...util import registry


@registry.architectures.register("spacy.BiluoTagger.v1")
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
    biluo = BILUO()
    linear = Linear(
        nO=None,
        nI=tok2vec.get_dim("nO"),
        init_W=configure_normal_init(mean=0.02)
    )
    model = chain(
        tok2vec,
        list2padded(),
        with_array(chain(Dropout(0.1), linear)),
        biluo,
        with_array(softmax_activation()),
        padded2list()
    )

    return Model(
        "biluo-tagger",
        forward,
        init=init,
        layers=[model, linear],
        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
        dims={"nO": None},
        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
    )


@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
    biluo = IOB()
    linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
    model = chain(
        tok2vec,
        list2padded(),
        with_array(linear),
        biluo,
        with_array(softmax_activation()),
        padded2list()
    )

    return Model(
        "iob-tagger",
        forward,
        init=init,
        layers=[model],
        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
        dims={"nO": None},
        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
    )


def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
    if model.get_dim("nO") is None and Y:
        model.set_dim("nO", Y[0].shape[1])
    nO = model.get_dim("nO")
    biluo = model.get_ref("biluo")
    linear = model.get_ref("linear")
    biluo.set_dim("nO", nO)
    if linear.has_dim("nO") is None:
        linear.set_dim("nO", nO)
    model.layers[0].initialize(X=X, Y=Y)


def forward(model: Model, X: List[Doc], is_train: bool):
    return model.layers[0](X, is_train)


__all__ = ["BiluoTagger"]
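Note: a shape walk-through of the BiluoTagger chain above, assuming a batch of two docs with 3 and 5 tokens and tok2vec width 128 (numbers are illustrative):

# tok2vec:            [Doc, Doc] -> [(3, 128), (5, 128)]
# list2padded:        -> Padded data of shape (5, 2, 128), sequence-major
# Dropout + Linear:   -> (5, 2, nO), where nO = 4 * n_labels + 1 BILUO actions
# BILUO():            masks invalid transitions per timestep
# softmax_activation: -> per-token action probabilities
# padded2list:        -> [(3, nO), (5, nO)] aligned back to the input docs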
@@ -1,4 +1,5 @@
-from thinc.api import zero_init, with_array, Softmax, chain, Model
+from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
+from thinc.api import glorot_uniform_init
 
 from ...util import registry
 
@@ -11,6 +12,6 @@ def build_tagger_model(tok2vec, nO=None) -> Model:
     softmax = with_array(output_layer)
     model = chain(tok2vec, softmax)
     model.set_ref("tok2vec", tok2vec)
-    model.set_ref("softmax", softmax)
+    model.set_ref("softmax", output_layer)
     model.set_ref("output_layer", output_layer)
     return model
spacy/ml/tb_framework.py (new file, 86 lines)

from thinc.api import Model, noop, use_ops, Linear
from ..syntax._parser_model import ParserStepModel


def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None},
        layers=[tok2vec, lower, upper],
        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output
        }
    )


def forward(model, X, is_train):
    step_model = ParserStepModel(
        X,
        model.layers,
        unseen_classes=model.attrs["unseen_classes"],
        train=is_train,
        has_upper=model.attrs["has_upper"]
    )

    return step_model, step_model.finish_steps


def init(model, X=None, Y=None):
    tok2vec = model.get_ref("tok2vec").initialize()
    lower = model.get_ref("lower").initialize(X=X)
    if model.attrs["has_upper"]:
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
        model.get_ref("upper").initialize(X=statevecs)


def resize_output(model, new_nO):
    tok2vec = model.get_ref("tok2vec")
    lower = model.get_ref("lower")
    upper = model.get_ref("upper")
    if not model.attrs["has_upper"]:
        if lower.has_dim("nO") is None:
            lower.set_dim("nO", new_nO)
        return
    elif upper.has_dim("nO") is None:
        upper.set_dim("nO", new_nO)
        return
    elif new_nO == upper.get_dim("nO"):
        return
    smaller = upper
    nI = None
    if smaller.has_dim("nI"):
        nI = smaller.get_dim("nI")
    with use_ops('numpy'):
        larger = Linear(nO=new_nO, nI=nI)
    larger.init = smaller.init
    # it could be that the model is not initialized yet, then skip this bit
    if nI:
        larger_W = larger.ops.alloc2f(new_nO, nI)
        larger_b = larger.ops.alloc1f(new_nO)
        smaller_W = smaller.get_param("W")
        smaller_b = smaller.get_param("b")
        # Weights are stored in (nr_out, nr_in) format, so we're basically
        # just adding rows here.
        if smaller.has_dim("nO"):
            larger_W[:smaller.get_dim("nO")] = smaller_W
            larger_b[:smaller.get_dim("nO")] = smaller_b
            for i in range(smaller.get_dim("nO"), new_nO):
                model.attrs["unseen_classes"].add(i)

        larger.set_param("W", larger_W)
        larger.set_param("b", larger_b)
    model._layers[-1] = larger
    model.set_ref("upper", larger)
    return model
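Note: a minimal numpy sketch of what resize_output does to the upper layer's parameters — grow the (nO, nI) weights to (new_nO, nI), copy the old rows, and leave the new class rows at zero so they start out "unseen":

import numpy

old_W = numpy.random.normal(size=(3, 8)).astype("f")   # 3 known classes
old_b = numpy.zeros((3,), dtype="f")
new_nO = 5
new_W = numpy.zeros((new_nO, old_W.shape[1]), dtype="f")
new_b = numpy.zeros((new_nO,), dtype="f")
new_W[:3] = old_W
new_b[:3] = old_b
unseen_classes = set(range(3, new_nO))  # classes 3 and 4 added, not yet trained
print(new_W.shape, sorted(unseen_classes))  # (5, 8) [3, 4]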
@@ -1,6 +1,7 @@
 from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
 from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
 from .pipes import SentenceRecognizer
+from .simple_ner import SimpleNER
 from .morphologizer import Morphologizer
 from .entityruler import EntityRuler
 from .tok2vec import Tok2Vec
@@ -22,6 +23,7 @@ __all__ = [
     "SentenceSegmenter",
     "SentenceRecognizer",
     "SimilarityHook",
+    "SimpleNER",
     "merge_entities",
     "merge_noun_chunks",
     "merge_subtokens",
@@ -3,7 +3,7 @@ import numpy
 import srsly
 import random
 from thinc.api import CosineDistance, to_categorical, get_array_module
-from thinc.api import set_dropout_rate
+from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
 import warnings
 
 from ..tokens.doc cimport Doc
@@ -464,6 +464,9 @@ class Tagger(Pipe):
             return
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+        for sc in tag_scores:
+            if self.model.ops.xp.isnan(sc.sum()):
+                raise ValueError("nan value in scores")
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
@@ -497,29 +500,11 @@ class Tagger(Pipe):
         losses[self.name] += (gradient**2).sum()
 
     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            gold = ex.gold
-            for tag in gold.tags:
-                if tag is None:
-                    correct[idx] = guesses[idx]
-                elif tag in tag_index:
-                    correct[idx] = tag_index[tag]
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
+        truths = [eg.gold.tags for eg in examples]
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
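Note: a rough numpy sketch of what a sequence categorical cross-entropy does for one sequence (thinc's SequenceCategoricalCrossentropy additionally maps label names and handles missing values): the gradient with respect to the predicted probabilities is probs - one_hot(truth).

import numpy

def seq_cce(probs, truth_ids):
    # probs: (n_tokens, n_classes) probabilities; truth_ids: (n_tokens,)
    one_hot = numpy.eye(probs.shape[1], dtype="f")[truth_ids]
    d_scores = probs - one_hot
    loss = -numpy.log(probs[numpy.arange(len(truth_ids)), truth_ids] + 1e-8).sum()
    return d_scores, loss

probs = numpy.full((2, 3), 1.0 / 3, dtype="f")  # uniform predictions
d_scores, loss = seq_cce(probs, numpy.array([0, 2]))
print(round(float(loss), 3))  # ~2.197, i.e. two tokens at -log(1/3) each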
spacy/pipeline/simple_ner.py (new file, 149 lines)

from typing import List
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate
from thinc.util import to_numpy
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
from ..tokens import Doc
from ..language import component
from ..util import link_vectors_to_models
from .pipes import Pipe


@component("simple_ner", assigns=["doc.ents"])
class SimpleNER(Pipe):
    """Named entity recognition with a tagging model. The model should include
    validity constraints to ensure that only valid tag sequences are returned."""

    def __init__(self, vocab, model):
        self.vocab = vocab
        self.model = model
        self.cfg = {"labels": []}
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(),
            normalize=True,
            missing_value=None
        )
        assert self.model is not None

    @property
    def labels(self):
        return self.cfg["labels"]

    @property
    def is_biluo(self):
        return self.model.name.startswith("biluo")

    def add_label(self, label):
        if label not in self.cfg["labels"]:
            self.cfg["labels"].append(label)

    def get_tag_names(self):
        if self.is_biluo:
            return (
                [f"B-{label}" for label in self.labels] +
                [f"I-{label}" for label in self.labels] +
                [f"L-{label}" for label in self.labels] +
                [f"U-{label}" for label in self.labels] +
                ["O"]
            )
        else:
            return (
                [f"B-{label}" for label in self.labels] +
                [f"I-{label}" for label in self.labels] +
                ["O"]
            )

    def predict(self, docs: List[Doc]) -> List[Floats2d]:
        scores = self.model.predict(docs)
        return scores

    def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
        """Set entities on a batch of documents from a batch of scores."""
        tag_names = self.get_tag_names()
        for i, doc in enumerate(docs):
            actions = to_numpy(scores[i].argmax(axis=1))
            tags = [tag_names[actions[j]] for j in range(len(doc))]
            if not self.is_biluo:
                tags = iob_to_biluo(tags)
            doc.ents = spans_from_biluo_tags(doc, tags)

    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
        if not any(_has_ner(eg) for eg in examples):
            return 0
        examples = Example.to_example_objects(examples)
        docs = [ex.doc for ex in examples]
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update(docs)
        loss, d_scores = self.get_loss(examples, scores)
        bp_scores(d_scores)
        if set_annotations:
            self.set_annotations(docs, scores)
        if sgd is not None:
            self.model.finish_update(sgd)
        if losses is not None:
            losses.setdefault("ner", 0.0)
            losses["ner"] += loss
        return loss

    def get_loss(self, examples, scores):
        loss = 0
        d_scores = []
        truths = []
        for eg in examples:
            gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
            if not self.is_biluo:
                gold_tags = biluo_to_iob(gold_tags)
            truths.append(gold_tags)
        for i in range(len(scores)):
            if len(scores[i]) != len(truths[i]):
                raise ValueError(
                    f"Mismatched output and gold sizes.\n"
                    f"Output: {len(scores[i])}, gold: {len(truths[i])}."
                    f"Input: {len(examples[i].doc)}"
                )
        d_scores, loss = self.loss_func(scores, truths)
        return loss, d_scores

    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
        self.cfg.update(kwargs)
        if not hasattr(get_examples, '__call__'):
            gold_tuples = get_examples
            get_examples = lambda: gold_tuples
        labels = _get_labels(get_examples())
        for label in _get_labels(get_examples()):
            self.add_label(label)
        labels = self.labels
        n_actions = self.model.attrs["get_num_actions"](len(labels))
        self.model.set_dim("nO", n_actions)
        self.model.initialize()
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
        link_vectors_to_models(self.vocab)
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(),
            normalize=True,
            missing_value=None
        )

        return sgd

    def init_multitask_objectives(self, *args, **kwargs):
        pass


def _has_ner(eg):
    for ner_tag in eg.gold.ner:
        if ner_tag != "-" and ner_tag != None:
            return True
    else:
        return False


def _get_labels(examples):
    labels = set()
    for eg in examples:
        for ner_tag in eg.token_annotation.entities:
            if ner_tag != 'O' and ner_tag != '-':
                _, label = ner_tag.split('-', 1)
                labels.add(label)
    return list(sorted(labels))
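Note: a decoding sketch mirroring set_annotations above — pick the argmax action per token, map to tag names, and (for IOB models) convert to BILUO before building spans. Arrays are toy values.

import numpy

tag_names = ["B-PER", "I-PER", "L-PER", "U-PER", "O"]
scores = numpy.array([
    [0.1, 0.1, 0.1, 0.6, 0.1],   # -> U-PER
    [0.1, 0.1, 0.1, 0.1, 0.6],   # -> O
])
actions = scores.argmax(axis=1)
tags = [tag_names[a] for a in actions]
print(tags)  # ['U-PER', 'O']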
@@ -12,7 +12,7 @@ cimport blis.cy
 
 import numpy
 import numpy.random
-from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
+from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop
 
 from ..typedefs cimport weight_t, class_t, hash_t
 from ..tokens.doc cimport Doc
@@ -219,112 +219,27 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
     return best
 
 
-class ParserModel(Model):
-    def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
-        # don't define nO for this object, because we can't dynamically change it
-        Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None})
-        if tok2vec.has_dim("nI"):
-            self.set_dim("nI", tok2vec.get_dim("nI"))
-        self._layers = [tok2vec, lower_model]
-        if upper_model is not None:
-            self._layers.append(upper_model)
-        self.unseen_classes = set()
-        if unseen_classes:
-            for class_ in unseen_classes:
-                self.unseen_classes.add(class_)
-        self.set_ref("tok2vec", tok2vec)
-
-    def predict(self, docs):
-        step_model = ParserStepModel(docs, self._layers,
-            unseen_classes=self.unseen_classes, train=False)
-        return step_model
-
-    def resize_output(self, new_nO):
-        if len(self._layers) == 2:
-            return
-        if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")):
-            return
-        smaller = self.upper
-        nI = None
-        if smaller.has_dim("nI"):
-            nI = smaller.get_dim("nI")
-        with use_ops('numpy'):
-            larger = Linear(nO=new_nO, nI=nI)
-        larger.init = smaller.init
-        # it could be that the model is not initialized yet, then skip this bit
-        if nI:
-            larger_W = larger.ops.alloc2f(new_nO, nI)
-            larger_b = larger.ops.alloc1f(new_nO)
-            smaller_W = smaller.get_param("W")
-            smaller_b = smaller.get_param("b")
-            # Weights are stored in (nr_out, nr_in) format, so we're basically
-            # just adding rows here.
-            if smaller.has_dim("nO"):
-                larger_W[:smaller.get_dim("nO")] = smaller_W
-                larger_b[:smaller.get_dim("nO")] = smaller_b
-                for i in range(smaller.get_dim("nO"), new_nO):
-                    self.unseen_classes.add(i)
-
-            larger.set_param("W", larger_W)
-            larger.set_param("b", larger_b)
-        self._layers[-1] = larger
-
-    def initialize(self, X=None, Y=None):
-        self.tok2vec.initialize()
-        self.lower.initialize(X=X, Y=Y)
-        if self.upper is not None:
-            # In case we need to trigger the callbacks
-            statevecs = self.ops.alloc((2, self.lower.get_dim("nO")))
-            self.upper.initialize(X=statevecs)
-
-    def finish_update(self, optimizer):
-        self.tok2vec.finish_update(optimizer)
-        self.lower.finish_update(optimizer)
-        if self.upper is not None:
-            self.upper.finish_update(optimizer)
-
-    @property
-    def tok2vec(self):
-        return self._layers[0]
-
-    @property
-    def lower(self):
-        return self._layers[1]
-
-    @property
-    def upper(self):
-        return self._layers[2]
-
-
-def forward(model:ParserModel, X, is_train):
-    step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes,
-                                 train=is_train)
-
-    return step_model, step_model.finish_steps
-
-
 class ParserStepModel(Model):
-    def __init__(self, docs, layers, unseen_classes=None, train=True):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
+        self.attrs["has_upper"] = has_upper
         self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
         if layers[1].get_dim("nP") >= 2:
             activation = "maxout"
-        elif len(layers) == 2:
+        elif has_upper:
             activation = None
         else:
             activation = "relu"
         self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                             activation=activation, train=train)
-        if len(layers) == 3:
+        if has_upper:
             self.vec2scores = layers[-1]
         else:
             self.vec2scores = None
         self.cuda_stream = util.get_cuda_stream(non_blocking=True)
         self.backprops = []
-        if self.vec2scores is None:
-            self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
-        else:
-            self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f')
+        self._class_mask = numpy.zeros((self.nO,), dtype='f')
         self._class_mask.fill(1)
         if unseen_classes is not None:
             for class_ in unseen_classes:
@@ -332,7 +247,10 @@ class ParserStepModel(Model):
 
     @property
     def nO(self):
-        return self.state2vec.nO
+        if self.attrs["has_upper"]:
+            return self.vec2scores.get_dim("nO")
+        else:
+            return self.state2vec.get_dim("nO")
 
     def class_is_unseen(self, class_):
         return self._class_mask[class_]
@@ -378,7 +296,7 @@ class ParserStepModel(Model):
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    if model.vec2scores is not None:
+    if model.attrs["has_upper"]:
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
@ -36,7 +36,6 @@ from ..util import link_vectors_to_models, create_default_optimizer, registry
|
||||||
from ..compat import copy_array
|
from ..compat import copy_array
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
from ._parser_model import ParserModel
|
|
||||||
from . import _beam_utils
|
from . import _beam_utils
|
||||||
from . import nonproj
|
from . import nonproj
|

@@ -69,7 +68,8 @@ cdef class Parser:
        cfg.setdefault('beam_width', 1)
        cfg.setdefault('beam_update_prob', 1.0)  # or 0.5 (both defaults were previously used)
        self.model = model
-       self.set_output(self.moves.n_moves)
+       if self.moves.n_moves != 0:
+           self.set_output(self.moves.n_moves)
        self.cfg = cfg
        self._multitasks = []
        self._rehearsal_model = None

@@ -105,7 +105,7 @@ cdef class Parser:
    @property
    def tok2vec(self):
        '''Return the embedding and convolutional layer of the model.'''
-       return self.model.tok2vec
+       return self.model.get_ref("tok2vec")

    @property
    def postprocesses(self):

@@ -122,9 +122,11 @@ cdef class Parser:
        self._resize()

    def _resize(self):
-       self.model.resize_output(self.moves.n_moves)
+       self.model.attrs["resize_output"](self.model, self.moves.n_moves)
        if self._rehearsal_model not in (True, False, None):
-           self._rehearsal_model.resize_output(self.moves.n_moves)
+           self._rehearsal_model.attrs["resize_output"](
+               self._rehearsal_model, self.moves.n_moves
+           )

    def add_multitask_objective(self, target):
        # Defined in subclasses, to avoid circular import
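
The resize logic now lives in a plain function stored under `model.attrs["resize_output"]` rather than in a method on a `Model` subclass, which keeps the parser compatible with generic thinc models. A small self-contained sketch of the pattern (the resize body here is a placeholder, not spaCy's actual implementation):

    from thinc.api import Model

    def resize_output(model, new_nO):
        # Placeholder: a real implementation would re-allocate output weights.
        model.attrs["output_size"] = new_nO
        return model

    model = Model("toy", lambda model, X, is_train: (X, lambda dY: dY))
    model.attrs["resize_output"] = resize_output
    model.attrs["resize_output"](model, 42)
    assert model.attrs["output_size"] == 42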

@@ -216,7 +218,6 @@ cdef class Parser:
        # expand our model output.
        self._resize()
        model = self.model.predict(docs)
-       W_param = model.vec2scores.get_param("W")
        weights = get_c_weights(model)
        for state in batch:
            if not state.is_final():

@@ -237,7 +238,7 @@ cdef class Parser:
        # if labels are missing. We therefore have to check whether we need to
        # expand our model output.
        self._resize()
-       cdef int nr_feature = self.model.lower.get_dim("nF")
+       cdef int nr_feature = self.model.get_ref("lower").get_dim("nF")
        model = self.model.predict(docs)
        token_ids = numpy.zeros((len(docs) * beam_width, nr_feature),
                                dtype='i', order='C')

@@ -370,13 +371,16 @@ cdef class Parser:
                beam_density=self.cfg.get('beam_density', 0.001))

        set_dropout_rate(self.model, drop)
-       # Chop sequences into lengths of this many transitions, to make the
-       # batch uniform length.
-       cut_gold = numpy.random.choice(range(20, 100))
-       states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
+       cut_gold = True
+       if cut_gold:
+           # Chop sequences into lengths of this many transitions, to make the
+           # batch uniform length.
+           cut_gold = numpy.random.choice(range(20, 100))
+           states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
+       else:
+           states, golds, max_steps = self._init_gold_batch_no_cut(examples)
        states_golds = [(s, g) for (s, g) in zip(states, golds)
                        if not s.is_final() and g is not None]

        # Prepare the stepwise model, and get the callback for finishing the batch
        model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
        all_states = list(states)

@@ -456,9 +460,17 @@ cdef class Parser:
        set_dropout_rate(self.model, drop)
        model, backprop_tok2vec = self.model.begin_update(docs)
        states_d_scores, backprops, beams = _beam_utils.update_beam(
-           self.moves, self.model.lower.get_dim("nF"), 10000, states, golds,
-           model.state2vec, model.vec2scores, width, losses=losses,
-           beam_density=beam_density)
+           self.moves,
+           self.model.get_ref("lower").get_dim("nF"),
+           10000,
+           states,
+           golds,
+           model.state2vec,
+           model.vec2scores,
+           width,
+           losses=losses,
+           beam_density=beam_density
+       )
        for i, d_scores in enumerate(states_d_scores):
            losses[self.name] += (d_scores**2).mean()
            ids, bp_vectors, bp_scores = backprops[i]

@@ -497,6 +509,24 @@ cdef class Parser:
                    queue.extend(node._layers)
        return gradients

+   def _init_gold_batch_no_cut(self, whole_examples):
+       states = self.moves.init_batch([eg.doc for eg in whole_examples])
+       good_docs = []
+       good_golds = []
+       good_states = []
+       for i, eg in enumerate(whole_examples):
+           doc = eg.doc
+           gold = self.moves.preprocess_gold(eg.gold)
+           if gold is not None and self.moves.has_gold(gold):
+               good_docs.append(doc)
+               good_golds.append(gold)
+               good_states.append(states[i])
+       n_moves = []
+       for doc, gold in zip(good_docs, good_golds):
+           oracle_actions = self.moves.get_oracle_sequence(doc, gold)
+           n_moves.append(len(oracle_actions))
+       return good_states, good_golds, max(n_moves, default=0) * 2
+
    def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,
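
The no-cut variant keeps every parsable example whole and derives the step budget from the longest oracle sequence; the doubling reads as head-room over that length (an interpretation, not stated in the diff):

    oracle_lengths = [12, 30, 7]                      # len(get_oracle_sequence(doc, gold)) per doc
    max_steps = max(oracle_lengths, default=0) * 2    # generous upper bound on transitions
    assert max_steps == 60
    assert max([], default=0) * 2 == 0                # no valid gold: budget collapses to zero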
@ -550,16 +580,19 @@ cdef class Parser:
|
||||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||||
dtype='f', order='C')
|
dtype='f', order='C')
|
||||||
c_d_scores = <float*>d_scores.data
|
c_d_scores = <float*>d_scores.data
|
||||||
|
unseen_classes = self.model.attrs["unseen_classes"]
|
||||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||||
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
||||||
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
||||||
self.moves.set_costs(is_valid, costs, state, gold)
|
self.moves.set_costs(is_valid, costs, state, gold)
|
||||||
for j in range(self.moves.n_moves):
|
for j in range(self.moves.n_moves):
|
||||||
if costs[j] <= 0.0 and j in self.model.unseen_classes:
|
if costs[j] <= 0.0 and j in unseen_classes:
|
||||||
self.model.unseen_classes.remove(j)
|
unseen_classes.remove(j)
|
||||||
cpu_log_loss(c_d_scores,
|
cpu_log_loss(c_d_scores,
|
||||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
|
if len(states):
|
||||||
|
d_scores /= len(states)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
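
Dividing `d_scores` by the number of states turns the gradient into a mean over the batch instead of a sum, so larger batches no longer produce proportionally larger updates. A quick numpy illustration:

    import numpy

    d_scores = numpy.ones((8, 5), dtype="f")  # 8 states, 5 moves
    if len(d_scores):
        d_scores /= len(d_scores)             # the guard avoids division by zero
    assert float(d_scores.sum()) == 5.0       # scale is independent of batch size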

@@ -569,8 +602,7 @@ cdef class Parser:
        return create_default_optimizer()

    def set_output(self, nO):
-       if self.model.upper.has_dim("nO") is None:
-           self.model.upper.set_dim("nO", nO)
+       self.model.attrs["resize_output"](self.model, nO)

    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
        self.cfg.update(kwargs)

@@ -597,7 +629,6 @@ cdef class Parser:
            for doc, gold in parses:
                doc_sample.append(doc)
                gold_sample.append(gold)
-
        self.model.initialize(doc_sample, gold_sample)
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)

@@ -65,7 +65,7 @@ def test_add_label_deserializes_correctly():
    ner2 = EntityRecognizer(Vocab(), default_ner())

    # the second model needs to be resized before we can call from_bytes
-   ner2.model.resize_output(ner1.moves.n_moves)
+   ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
    ner2.from_bytes(ner1.to_bytes())
    assert ner1.moves.n_moves == ner2.moves.n_moves
    for i in range(ner1.moves.n_moves):

@@ -3,9 +3,9 @@ from spacy.ml.models.defaults import default_parser, default_tok2vec
 from spacy.vocab import Vocab
 from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.nn_parser import Parser
-from spacy.syntax._parser_model import ParserModel
 from spacy.tokens.doc import Doc
 from spacy.gold import GoldParse
+from thinc.api import Model


 @pytest.fixture

@@ -34,7 +34,7 @@ def parser(vocab, arc_eager):
 @pytest.fixture
 def model(arc_eager, tok2vec, vocab):
    model = default_parser()
-   model.resize_output(arc_eager.n_moves)
+   model.attrs["resize_output"](model, arc_eager.n_moves)
    model.initialize()
    return model

@@ -50,7 +50,7 @@ def gold(doc):


 def test_can_init_nn_parser(parser):
-   assert isinstance(parser.model, ParserModel)
+   assert isinstance(parser.model, Model)


 def test_build_model(parser, vocab):

spacy/tests/pipeline/test_simple_ner.py (new file, 417 lines)
@@ -0,0 +1,417 @@
+import pytest
+from collections import namedtuple
+
+from thinc.api import NumpyOps
+from spacy.ml._biluo import BILUO, _get_transition_table
+from spacy.pipeline.simple_ner import SimpleNER
+import spacy
+
+
+@pytest.fixture(params=[
+    ["PER", "ORG", "LOC", "MISC"],
+    ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
+])
+def labels(request):
+    return request.param
+
+@pytest.fixture
+def ops():
+    return NumpyOps()
+
+def _get_actions(labels):
+    action_names = (
+        [f"B{label}" for label in labels] + \
+        [f"I{label}" for label in labels] + \
+        [f"L{label}" for label in labels] + \
+        [f"U{label}" for label in labels] + \
+        ["O"]
+    )
+    A = namedtuple("actions", action_names)
+    return A(**{name: i for i, name in enumerate(action_names)})
+
+
+def test_init_biluo_layer(labels):
+    model = BILUO()
+    model.set_dim("nO", model.attrs["get_num_actions"](len(labels)))
+    model.initialize()
+    assert model.get_dim("nO") == len(labels) * 4 + 1
+
+
+def test_transition_table(ops):
+    labels = ["per", "loc", "org"]
+    table = _get_transition_table(len(labels))
+    a = _get_actions(labels)
+    assert table.shape == (2, len(a), len(a))
+    # Not last token, prev action was B
+    assert table[0, a.Bper, a.Bper] == 0
+    assert table[0, a.Bper, a.Bloc] == 0
+    assert table[0, a.Bper, a.Borg] == 0
+    assert table[0, a.Bper, a.Iper] == 1
+    assert table[0, a.Bper, a.Iloc] == 0
+    assert table[0, a.Bper, a.Iorg] == 0
+    assert table[0, a.Bper, a.Lper] == 1
+    assert table[0, a.Bper, a.Lloc] == 0
+    assert table[0, a.Bper, a.Lorg] == 0
+    assert table[0, a.Bper, a.Uper] == 0
+    assert table[0, a.Bper, a.Uloc] == 0
+    assert table[0, a.Bper, a.Uorg] == 0
+    assert table[0, a.Bper, a.O] == 0
+
+    assert table[0, a.Bloc, a.Bper] == 0
+    assert table[0, a.Bloc, a.Bloc] == 0
+    assert table[0, a.Bloc, a.Borg] == 0
+    assert table[0, a.Bloc, a.Iper] == 0
+    assert table[0, a.Bloc, a.Iloc] == 1
+    assert table[0, a.Bloc, a.Iorg] == 0
+    assert table[0, a.Bloc, a.Lper] == 0
+    assert table[0, a.Bloc, a.Lloc] == 1
+    assert table[0, a.Bloc, a.Lorg] == 0
+    assert table[0, a.Bloc, a.Uper] == 0
+    assert table[0, a.Bloc, a.Uloc] == 0
+    assert table[0, a.Bloc, a.Uorg] == 0
+    assert table[0, a.Bloc, a.O] == 0
+
+    assert table[0, a.Borg, a.Bper] == 0
+    assert table[0, a.Borg, a.Bloc] == 0
+    assert table[0, a.Borg, a.Borg] == 0
+    assert table[0, a.Borg, a.Iper] == 0
+    assert table[0, a.Borg, a.Iloc] == 0
+    assert table[0, a.Borg, a.Iorg] == 1
+    assert table[0, a.Borg, a.Lper] == 0
+    assert table[0, a.Borg, a.Lloc] == 0
+    assert table[0, a.Borg, a.Lorg] == 1
+    assert table[0, a.Borg, a.Uper] == 0
+    assert table[0, a.Borg, a.Uloc] == 0
+    assert table[0, a.Borg, a.Uorg] == 0
+    assert table[0, a.Borg, a.O] == 0
+
+    # Not last token, prev action was I
+    assert table[0, a.Iper, a.Bper] == 0
+    assert table[0, a.Iper, a.Bloc] == 0
+    assert table[0, a.Iper, a.Borg] == 0
+    assert table[0, a.Iper, a.Iper] == 1
+    assert table[0, a.Iper, a.Iloc] == 0
+    assert table[0, a.Iper, a.Iorg] == 0
+    assert table[0, a.Iper, a.Lper] == 1
+    assert table[0, a.Iper, a.Lloc] == 0
+    assert table[0, a.Iper, a.Lorg] == 0
+    assert table[0, a.Iper, a.Uper] == 0
+    assert table[0, a.Iper, a.Uloc] == 0
+    assert table[0, a.Iper, a.Uorg] == 0
+    assert table[0, a.Iper, a.O] == 0
+
+    assert table[0, a.Iloc, a.Bper] == 0
+    assert table[0, a.Iloc, a.Bloc] == 0
+    assert table[0, a.Iloc, a.Borg] == 0
+    assert table[0, a.Iloc, a.Iper] == 0
+    assert table[0, a.Iloc, a.Iloc] == 1
+    assert table[0, a.Iloc, a.Iorg] == 0
+    assert table[0, a.Iloc, a.Lper] == 0
+    assert table[0, a.Iloc, a.Lloc] == 1
+    assert table[0, a.Iloc, a.Lorg] == 0
+    assert table[0, a.Iloc, a.Uper] == 0
+    assert table[0, a.Iloc, a.Uloc] == 0
+    assert table[0, a.Iloc, a.Uorg] == 0
+    assert table[0, a.Iloc, a.O] == 0
+
+    assert table[0, a.Iorg, a.Bper] == 0
+    assert table[0, a.Iorg, a.Bloc] == 0
+    assert table[0, a.Iorg, a.Borg] == 0
+    assert table[0, a.Iorg, a.Iper] == 0
+    assert table[0, a.Iorg, a.Iloc] == 0
+    assert table[0, a.Iorg, a.Iorg] == 1
+    assert table[0, a.Iorg, a.Lper] == 0
+    assert table[0, a.Iorg, a.Lloc] == 0
+    assert table[0, a.Iorg, a.Lorg] == 1
+    assert table[0, a.Iorg, a.Uper] == 0
+    assert table[0, a.Iorg, a.Uloc] == 0
+    assert table[0, a.Iorg, a.Uorg] == 0
+    assert table[0, a.Iorg, a.O] == 0
+
+    # Not last token, prev action was L
+    assert table[0, a.Lper, a.Bper] == 1
+    assert table[0, a.Lper, a.Bloc] == 1
+    assert table[0, a.Lper, a.Borg] == 1
+    assert table[0, a.Lper, a.Iper] == 0
+    assert table[0, a.Lper, a.Iloc] == 0
+    assert table[0, a.Lper, a.Iorg] == 0
+    assert table[0, a.Lper, a.Lper] == 0
+    assert table[0, a.Lper, a.Lloc] == 0
+    assert table[0, a.Lper, a.Lorg] == 0
+    assert table[0, a.Lper, a.Uper] == 1
+    assert table[0, a.Lper, a.Uloc] == 1
+    assert table[0, a.Lper, a.Uorg] == 1
+    assert table[0, a.Lper, a.O] == 1
+
+    assert table[0, a.Lloc, a.Bper] == 1
+    assert table[0, a.Lloc, a.Bloc] == 1
+    assert table[0, a.Lloc, a.Borg] == 1
+    assert table[0, a.Lloc, a.Iper] == 0
+    assert table[0, a.Lloc, a.Iloc] == 0
+    assert table[0, a.Lloc, a.Iorg] == 0
+    assert table[0, a.Lloc, a.Lper] == 0
+    assert table[0, a.Lloc, a.Lloc] == 0
+    assert table[0, a.Lloc, a.Lorg] == 0
+    assert table[0, a.Lloc, a.Uper] == 1
+    assert table[0, a.Lloc, a.Uloc] == 1
+    assert table[0, a.Lloc, a.Uorg] == 1
+    assert table[0, a.Lloc, a.O] == 1
+
+    assert table[0, a.Lorg, a.Bper] == 1
+    assert table[0, a.Lorg, a.Bloc] == 1
+    assert table[0, a.Lorg, a.Borg] == 1
+    assert table[0, a.Lorg, a.Iper] == 0
+    assert table[0, a.Lorg, a.Iloc] == 0
+    assert table[0, a.Lorg, a.Iorg] == 0
+    assert table[0, a.Lorg, a.Lper] == 0
+    assert table[0, a.Lorg, a.Lloc] == 0
+    assert table[0, a.Lorg, a.Lorg] == 0
+    assert table[0, a.Lorg, a.Uper] == 1
+    assert table[0, a.Lorg, a.Uloc] == 1
+    assert table[0, a.Lorg, a.Uorg] == 1
+    assert table[0, a.Lorg, a.O] == 1
+
+    # Not last token, prev action was U
+    assert table[0, a.Uper, a.Bper] == 1
+    assert table[0, a.Uper, a.Bloc] == 1
+    assert table[0, a.Uper, a.Borg] == 1
+    assert table[0, a.Uper, a.Iper] == 0
+    assert table[0, a.Uper, a.Iloc] == 0
+    assert table[0, a.Uper, a.Iorg] == 0
+    assert table[0, a.Uper, a.Lper] == 0
+    assert table[0, a.Uper, a.Lloc] == 0
+    assert table[0, a.Uper, a.Lorg] == 0
+    assert table[0, a.Uper, a.Uper] == 1
+    assert table[0, a.Uper, a.Uloc] == 1
+    assert table[0, a.Uper, a.Uorg] == 1
+    assert table[0, a.Uper, a.O] == 1
+
+    assert table[0, a.Uloc, a.Bper] == 1
+    assert table[0, a.Uloc, a.Bloc] == 1
+    assert table[0, a.Uloc, a.Borg] == 1
+    assert table[0, a.Uloc, a.Iper] == 0
+    assert table[0, a.Uloc, a.Iloc] == 0
+    assert table[0, a.Uloc, a.Iorg] == 0
+    assert table[0, a.Uloc, a.Lper] == 0
+    assert table[0, a.Uloc, a.Lloc] == 0
+    assert table[0, a.Uloc, a.Lorg] == 0
+    assert table[0, a.Uloc, a.Uper] == 1
+    assert table[0, a.Uloc, a.Uloc] == 1
+    assert table[0, a.Uloc, a.Uorg] == 1
+    assert table[0, a.Uloc, a.O] == 1
+
+    assert table[0, a.Uorg, a.Bper] == 1
+    assert table[0, a.Uorg, a.Bloc] == 1
+    assert table[0, a.Uorg, a.Borg] == 1
+    assert table[0, a.Uorg, a.Iper] == 0
+    assert table[0, a.Uorg, a.Iloc] == 0
+    assert table[0, a.Uorg, a.Iorg] == 0
+    assert table[0, a.Uorg, a.Lper] == 0
+    assert table[0, a.Uorg, a.Lloc] == 0
+    assert table[0, a.Uorg, a.Lorg] == 0
+    assert table[0, a.Uorg, a.Uper] == 1
+    assert table[0, a.Uorg, a.Uloc] == 1
+    assert table[0, a.Uorg, a.Uorg] == 1
+    assert table[0, a.Uorg, a.O] == 1
+
+    # Not last token, prev action was O
+    assert table[0, a.O, a.Bper] == 1
+    assert table[0, a.O, a.Bloc] == 1
+    assert table[0, a.O, a.Borg] == 1
+    assert table[0, a.O, a.Iper] == 0
+    assert table[0, a.O, a.Iloc] == 0
+    assert table[0, a.O, a.Iorg] == 0
+    assert table[0, a.O, a.Lper] == 0
+    assert table[0, a.O, a.Lloc] == 0
+    assert table[0, a.O, a.Lorg] == 0
+    assert table[0, a.O, a.Uper] == 1
+    assert table[0, a.O, a.Uloc] == 1
+    assert table[0, a.O, a.Uorg] == 1
+    assert table[0, a.O, a.O] == 1
+
+    # Last token, prev action was B
+    assert table[1, a.Bper, a.Bper] == 0
+    assert table[1, a.Bper, a.Bloc] == 0
+    assert table[1, a.Bper, a.Borg] == 0
+    assert table[1, a.Bper, a.Iper] == 0
+    assert table[1, a.Bper, a.Iloc] == 0
+    assert table[1, a.Bper, a.Iorg] == 0
+    assert table[1, a.Bper, a.Lper] == 1
+    assert table[1, a.Bper, a.Lloc] == 0
+    assert table[1, a.Bper, a.Lorg] == 0
+    assert table[1, a.Bper, a.Uper] == 0
+    assert table[1, a.Bper, a.Uloc] == 0
+    assert table[1, a.Bper, a.Uorg] == 0
+    assert table[1, a.Bper, a.O] == 0
+
+    assert table[1, a.Bloc, a.Bper] == 0
+    assert table[1, a.Bloc, a.Bloc] == 0
+    assert table[1, a.Bloc, a.Borg] == 0
+    assert table[1, a.Bloc, a.Iper] == 0
+    assert table[1, a.Bloc, a.Iloc] == 0
+    assert table[1, a.Bloc, a.Iorg] == 0
+    assert table[1, a.Bloc, a.Lper] == 0
+    assert table[1, a.Bloc, a.Lloc] == 1
+    assert table[1, a.Bloc, a.Lorg] == 0
+    assert table[1, a.Bloc, a.Uper] == 0
+    assert table[1, a.Bloc, a.Uloc] == 0
+    assert table[1, a.Bloc, a.Uorg] == 0
+    assert table[1, a.Bloc, a.O] == 0
+
+    assert table[1, a.Borg, a.Bper] == 0
+    assert table[1, a.Borg, a.Bloc] == 0
+    assert table[1, a.Borg, a.Borg] == 0
+    assert table[1, a.Borg, a.Iper] == 0
+    assert table[1, a.Borg, a.Iloc] == 0
+    assert table[1, a.Borg, a.Iorg] == 0
+    assert table[1, a.Borg, a.Lper] == 0
+    assert table[1, a.Borg, a.Lloc] == 0
+    assert table[1, a.Borg, a.Lorg] == 1
+    assert table[1, a.Borg, a.Uper] == 0
+    assert table[1, a.Borg, a.Uloc] == 0
+    assert table[1, a.Borg, a.Uorg] == 0
+    assert table[1, a.Borg, a.O] == 0
+
+    # Last token, prev action was I
+    assert table[1, a.Iper, a.Bper] == 0
+    assert table[1, a.Iper, a.Bloc] == 0
+    assert table[1, a.Iper, a.Borg] == 0
+    assert table[1, a.Iper, a.Iper] == 0
+    assert table[1, a.Iper, a.Iloc] == 0
+    assert table[1, a.Iper, a.Iorg] == 0
+    assert table[1, a.Iper, a.Lper] == 1
+    assert table[1, a.Iper, a.Lloc] == 0
+    assert table[1, a.Iper, a.Lorg] == 0
+    assert table[1, a.Iper, a.Uper] == 0
+    assert table[1, a.Iper, a.Uloc] == 0
+    assert table[1, a.Iper, a.Uorg] == 0
+    assert table[1, a.Iper, a.O] == 0
+
+    assert table[1, a.Iloc, a.Bper] == 0
+    assert table[1, a.Iloc, a.Bloc] == 0
+    assert table[1, a.Iloc, a.Borg] == 0
+    assert table[1, a.Iloc, a.Iper] == 0
+    assert table[1, a.Iloc, a.Iloc] == 0
+    assert table[1, a.Iloc, a.Iorg] == 0
+    assert table[1, a.Iloc, a.Lper] == 0
+    assert table[1, a.Iloc, a.Lloc] == 1
+    assert table[1, a.Iloc, a.Lorg] == 0
+    assert table[1, a.Iloc, a.Uper] == 0
+    assert table[1, a.Iloc, a.Uloc] == 0
+    assert table[1, a.Iloc, a.Uorg] == 0
+    assert table[1, a.Iloc, a.O] == 0
+
+    assert table[1, a.Iorg, a.Bper] == 0
+    assert table[1, a.Iorg, a.Bloc] == 0
+    assert table[1, a.Iorg, a.Borg] == 0
+    assert table[1, a.Iorg, a.Iper] == 0
+    assert table[1, a.Iorg, a.Iloc] == 0
+    assert table[1, a.Iorg, a.Iorg] == 0
+    assert table[1, a.Iorg, a.Lper] == 0
+    assert table[1, a.Iorg, a.Lloc] == 0
+    assert table[1, a.Iorg, a.Lorg] == 1
+    assert table[1, a.Iorg, a.Uper] == 0
+    assert table[1, a.Iorg, a.Uloc] == 0
+    assert table[1, a.Iorg, a.Uorg] == 0
+    assert table[1, a.Iorg, a.O] == 0
+
+    # Last token, prev action was L
+    assert table[1, a.Lper, a.Bper] == 0
+    assert table[1, a.Lper, a.Bloc] == 0
+    assert table[1, a.Lper, a.Borg] == 0
+    assert table[1, a.Lper, a.Iper] == 0
+    assert table[1, a.Lper, a.Iloc] == 0
+    assert table[1, a.Lper, a.Iorg] == 0
+    assert table[1, a.Lper, a.Lper] == 0
+    assert table[1, a.Lper, a.Lloc] == 0
+    assert table[1, a.Lper, a.Lorg] == 0
+    assert table[1, a.Lper, a.Uper] == 1
+    assert table[1, a.Lper, a.Uloc] == 1
+    assert table[1, a.Lper, a.Uorg] == 1
+    assert table[1, a.Lper, a.O] == 1
+
+    assert table[1, a.Lloc, a.Bper] == 0
+    assert table[1, a.Lloc, a.Bloc] == 0
+    assert table[1, a.Lloc, a.Borg] == 0
+    assert table[1, a.Lloc, a.Iper] == 0
+    assert table[1, a.Lloc, a.Iloc] == 0
+    assert table[1, a.Lloc, a.Iorg] == 0
+    assert table[1, a.Lloc, a.Lper] == 0
+    assert table[1, a.Lloc, a.Lloc] == 0
+    assert table[1, a.Lloc, a.Lorg] == 0
+    assert table[1, a.Lloc, a.Uper] == 1
+    assert table[1, a.Lloc, a.Uloc] == 1
+    assert table[1, a.Lloc, a.Uorg] == 1
+    assert table[1, a.Lloc, a.O] == 1
+
+    assert table[1, a.Lorg, a.Bper] == 0
+    assert table[1, a.Lorg, a.Bloc] == 0
+    assert table[1, a.Lorg, a.Borg] == 0
+    assert table[1, a.Lorg, a.Iper] == 0
+    assert table[1, a.Lorg, a.Iloc] == 0
+    assert table[1, a.Lorg, a.Iorg] == 0
+    assert table[1, a.Lorg, a.Lper] == 0
+    assert table[1, a.Lorg, a.Lloc] == 0
+    assert table[1, a.Lorg, a.Lorg] == 0
+    assert table[1, a.Lorg, a.Uper] == 1
+    assert table[1, a.Lorg, a.Uloc] == 1
+    assert table[1, a.Lorg, a.Uorg] == 1
+    assert table[1, a.Lorg, a.O] == 1
+
+    # Last token, prev action was U
+    assert table[1, a.Uper, a.Bper] == 0
+    assert table[1, a.Uper, a.Bloc] == 0
+    assert table[1, a.Uper, a.Borg] == 0
+    assert table[1, a.Uper, a.Iper] == 0
+    assert table[1, a.Uper, a.Iloc] == 0
+    assert table[1, a.Uper, a.Iorg] == 0
+    assert table[1, a.Uper, a.Lper] == 0
+    assert table[1, a.Uper, a.Lloc] == 0
+    assert table[1, a.Uper, a.Lorg] == 0
+    assert table[1, a.Uper, a.Uper] == 1
+    assert table[1, a.Uper, a.Uloc] == 1
+    assert table[1, a.Uper, a.Uorg] == 1
+    assert table[1, a.Uper, a.O] == 1
+
+    assert table[1, a.Uloc, a.Bper] == 0
+    assert table[1, a.Uloc, a.Bloc] == 0
+    assert table[1, a.Uloc, a.Borg] == 0
+    assert table[1, a.Uloc, a.Iper] == 0
+    assert table[1, a.Uloc, a.Iloc] == 0
+    assert table[1, a.Uloc, a.Iorg] == 0
+    assert table[1, a.Uloc, a.Lper] == 0
+    assert table[1, a.Uloc, a.Lloc] == 0
+    assert table[1, a.Uloc, a.Lorg] == 0
+    assert table[1, a.Uloc, a.Uper] == 1
+    assert table[1, a.Uloc, a.Uloc] == 1
+    assert table[1, a.Uloc, a.Uorg] == 1
+    assert table[1, a.Uloc, a.O] == 1
+
+    assert table[1, a.Uorg, a.Bper] == 0
+    assert table[1, a.Uorg, a.Bloc] == 0
+    assert table[1, a.Uorg, a.Borg] == 0
+    assert table[1, a.Uorg, a.Iper] == 0
+    assert table[1, a.Uorg, a.Iloc] == 0
+    assert table[1, a.Uorg, a.Iorg] == 0
+    assert table[1, a.Uorg, a.Lper] == 0
+    assert table[1, a.Uorg, a.Lloc] == 0
+    assert table[1, a.Uorg, a.Lorg] == 0
+    assert table[1, a.Uorg, a.Uper] == 1
+    assert table[1, a.Uorg, a.Uloc] == 1
+    assert table[1, a.Uorg, a.Uorg] == 1
+    assert table[1, a.Uorg, a.O] == 1
+
+    # Last token, prev action was O
+    assert table[1, a.O, a.Bper] == 0
+    assert table[1, a.O, a.Bloc] == 0
+    assert table[1, a.O, a.Borg] == 0
+    assert table[1, a.O, a.Iper] == 0
+    assert table[1, a.O, a.Iloc] == 0
+    assert table[1, a.O, a.Iorg] == 0
+    assert table[1, a.O, a.Lper] == 0
+    assert table[1, a.O, a.Lloc] == 0
+    assert table[1, a.O, a.Lorg] == 0
+    assert table[1, a.O, a.Uper] == 1
+    assert table[1, a.O, a.Uloc] == 1
+    assert table[1, a.O, a.Uorg] == 1
+    assert table[1, a.O, a.O] == 1
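
The exhaustive asserts above pin down two facts about the BILUO layer. First, the action inventory for n labels is 4*n + 1 (B/I/L/U per label plus a single O), which is what test_init_biluo_layer checks. Second, the transition table has shape (2, n_actions, n_actions): plane 0 holds the legal transitions when the next token is not the last one, plane 1 when it is. A compact sketch of the validity rule the table encodes, reconstructed from the asserts, so treat it as an interpretation rather than the library's code:

    def is_valid(prev, nxt, is_last):
        # prev/nxt are action names like "Bper", "Lloc", or "O".
        if prev[0] in ("B", "I"):
            # An open entity must continue (I) or close (L) with the same
            # label, and on the last token it can only close.
            ok = {"L"} if is_last else {"I", "L"}
            return nxt[0] in ok and nxt[1:] == prev[1:]
        else:
            # After L, U or O the entity is closed: B/U/O may follow, but a
            # fresh B is illegal on the last token (it could never be closed).
            return nxt[0] in ({"U", "O"} if is_last else {"B", "U", "O"})

    assert is_valid("Bper", "Iper", False) and not is_valid("Bper", "Iloc", False)
    assert is_valid("O", "Uorg", True) and not is_valid("O", "Borg", True)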

@@ -34,7 +34,8 @@ def test_issue2179():
    nlp2.add_pipe(nlp2.create_pipe("ner"))

    assert len(nlp2.get_pipe("ner").labels) == 0
-   nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves)
+   model = nlp2.get_pipe("ner").model
+   model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)

@@ -104,7 +104,8 @@ def test_issue3209():
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
-   nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves)
+   model = nlp2.get_pipe("ner").model
+   model.attrs["resize_output"](model, ner.moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names

@@ -110,10 +110,9 @@ def test_serialize_custom_nlp():
    nlp2 = spacy.load(d)
    model = nlp2.get_pipe("parser").model
    tok2vec = model.get_ref("tok2vec")
-   upper = model.upper
+   upper = model.get_ref("upper")

    # check that we have the correct settings, not the default ones
-   assert tok2vec.get_dim("nO") == 321
    assert upper.get_dim("nI") == 65

@@ -131,8 +130,7 @@ def test_serialize_parser():
    nlp2 = spacy.load(d)
    model = nlp2.get_pipe("parser").model
    tok2vec = model.get_ref("tok2vec")
-   upper = model.upper
+   upper = model.get_ref("upper")

    # check that we have the correct settings, not the default ones
    assert upper.get_dim("nI") == 66
-   assert tok2vec.get_dim("nO") == 333

@@ -63,7 +63,7 @@ def test_to_from_bytes(parser, blank_parser):
    bytes_data = parser.to_bytes(exclude=["vocab"])

    # the blank parser needs to be resized before we can call from_bytes
-   blank_parser.model.resize_output(parser.moves.n_moves)
+   blank_parser.model.attrs["resize_output"](blank_parser.model, parser.moves.n_moves)
    blank_parser.from_bytes(bytes_data)
    assert blank_parser.model is not True
    assert blank_parser.moves.n_moves == parser.moves.n_moves

@@ -38,7 +38,7 @@ def test_util_get_package_path(package):


 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-   model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
+   model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
    assert model.get_param("W").shape == (nF, nO, nP, nI)
    tensor = model.ops.alloc((10, nI))
    Y, get_dX = model.begin_update(tensor)
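
The chained .initialize() is needed because Thinc 8 allocates parameters lazily: get_param("W") would fail on a freshly constructed model, while initialize() allocates W with the declared dimensions. A sketch, assuming PrecomputableAffine is importable from spacy.ml._precomputable_affine (the module path is an assumption here):

    from spacy.ml._precomputable_affine import PrecomputableAffine

    model = PrecomputableAffine(nO=4, nI=5, nF=3, nP=2)
    # model.get_param("W")  # would raise: params are unallocated before initialize()
    model.initialize()       # allocates W with shape (nF, nO, nP, nI)
    assert model.get_param("W").shape == (3, 4, 2, 5)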

@@ -571,8 +571,10 @@ def decaying(start, stop, decay):
        curr -= decay


-def minibatch_by_words(examples, size, tuples=True, count_words=len):
-   """Create minibatches of a given number of words."""
+def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+   """Create minibatches of roughly a given number of words. If any examples
+   are longer than the specified batch length, they will appear in a batch by
+   themselves."""
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, List):
@@ -580,18 +582,36 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
    else:
        size_ = size
    examples = iter(examples)
+   oversize = []
    while True:
        batch_size = next(size_)
+       tol_size = batch_size * 0.2
        batch = []
-       while batch_size >= 0:
+       if oversize:
+           example = oversize.pop(0)
+           n_words = count_words(example.doc)
+           batch.append(example)
+           batch_size -= n_words
+       while batch_size >= 1:
            try:
                example = next(examples)
            except StopIteration:
-               if batch:
-                   yield batch
-               return
-           batch_size -= count_words(example.doc)
-           batch.append(example)
+               if oversize:
+                   examples = iter(oversize)
+                   oversize = []
+                   if batch:
+                       yield batch
+                   break
+               else:
+                   if batch:
+                       yield batch
+                   return
+           n_words = count_words(example.doc)
+           if n_words < (batch_size + tol_size):
+               batch_size -= n_words
+               batch.append(example)
+           else:
+               oversize.append(example)
        if batch:
            yield batch
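
A hedged usage sketch of the new oversize behaviour, using a stand-in Example type since only the .doc attribute is touched when count_words defaults to len. An example longer than batch_size + tol_size is parked in oversize and later starts a batch of its own. Note the body above still hard-codes 0.2 rather than reading the new tolerance argument:

    from collections import namedtuple
    from spacy.util import minibatch_by_words

    Example = namedtuple("Example", ["doc"])
    examples = [Example("x" * n) for n in (50, 5, 5, 3)]  # "docs" of 50, 5, 5 and 3 words
    batches = list(minibatch_by_words(examples, size=10))
    sizes = [[len(eg.doc) for eg in batch] for batch in batches]
    assert sizes == [[5, 5], [50], [3]]  # the 50-word doc gets a batch to itself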