Merge branch 'develop' into nightly.spacy.io

2025-12-23 10:03:15 +03:00 · 2020-07-09 20:48:24 +02:00 · 2020-07-09 20:48:24 +02:00 · 797ca6f3dd
commit 797ca6f3dd
parent ea01831f6a 018319a640
22 changed files with 408 additions and 170 deletions
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@@ -5,7 +5,7 @@
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
@@ -17,20 +17,20 @@ max_steps = 0
 eval_frequency = 1000
 # Other settings
 seed = 0
-accumulate_gradient = 2
+accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
 scores = ["speed", "ents_p", "ents_r", "ents_f"]
 score_weights = {"ents_f": 1.0}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-discard_oversize = true
+discard_oversize = false
 omit_extra_lookups = false
-batch_by_words = true
+batch_by = "words"

 [training.batch_size]
 @schedules = "compounding.v1"
-start = 1000
+start = 100
 stop = 1000
 compound = 1.001

@@ -45,12 +45,6 @@ use_averages = true
 eps = 1e-8
 learn_rate = 0.001

-#[training.optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 1000
-#total_steps = 50000
-#initial_rate = 0.003
-
 [nlp]
 lang = "en"
 vectors = null
@@ -74,6 +68,6 @@ width = 96
 depth = 4
 window_size = 1
 embed_size = 2000
-maxout_pieces = 1
+maxout_pieces = 3
 subword_features = true
 dropout = ${training:dropout}
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a2"
+__version__ = "3.0.0a3"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -120,8 +120,12 @@ def convert(
            no_print=silent,
            ner_map=ner_map,
        )
+        if file_type == "json":
+            data = [docs_to_json(docs)]
+        else:
+            data = DocBin(docs=docs, store_user_data=True).to_bytes()
        if output_dir == "-":
-            _print_docs_to_stdout(docs, file_type)
+            _print_docs_to_stdout(data, file_type)
        else:
            if input_loc != input_path:
                subpath = input_loc.relative_to(input_path)
@@ -129,24 +133,23 @@ def convert(
            else:
                output_file = Path(output_dir) / input_loc.parts[-1]
                output_file = output_file.with_suffix(f".{file_type}")
-            _write_docs_to_file(docs, output_file, file_type)
+            _write_docs_to_file(data, output_file, file_type)
            msg.good(f"Generated output file ({len(docs)} documents): {output_file}")


-def _print_docs_to_stdout(docs, output_type):
+def _print_docs_to_stdout(data, output_type):
    if output_type == "json":
-        srsly.write_json("-", [docs_to_json(docs)])
+        srsly.write_json("-", data)
    else:
-        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
+        sys.stdout.buffer.write(data)


-def _write_docs_to_file(docs, output_file, output_type):
+def _write_docs_to_file(data, output_file, output_type):
    if not output_file.parent.exists():
        output_file.parent.mkdir(parents=True)
    if output_type == "json":
-        srsly.write_json(output_file, [docs_to_json(docs)])
+        srsly.write_json(output_file, data)
    else:
-        data = DocBin(docs=docs, store_user_data=True).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)

--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
 import subprocess
@@ -24,22 +25,18 @@ DIRS = [
 @project_cli.command("clone")
 def project_clone_cli(
    # fmt: off
-    name: str = Arg(..., help="The name of the template to fetch"),
-    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
-    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
    # fmt: on
 ):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo). Setting the --git flag will also
-    initialize the project directory as a Git repo. If the project is intended
-    to be a Git repo, it should be initialized with Git first, before
-    initializing DVC (Data Version Control). This allows DVC to integrate with
-    Git.
+    (including using a private repo).
    """
-    if dest == Path.cwd():
-        dest = dest / name
+    if dest is None:
+        dest = Path.cwd() / name
    project_clone(name, dest, repo=repo)


--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -30,7 +30,7 @@ def project_update_dvc_cli(
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
-    workflow is used. The DVC config will only be updated if
+    workflow is used. The DVC config will only be updated if the project.yml changed.
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)

--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -20,14 +20,14 @@ def project_run_cli(
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
-    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
 ):
-    """Run a named script or workflow defined in the project.yml. If a workflow
+    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
-    commands define inputs and/or outputs, they will only be re-run if state
-    has changed.
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
 @app.command("train")
 def train_cli(
    # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    train_path: Path = Arg(..., help="Location of training data", exists=True),
+    dev_path: Path = Arg(..., help="Location of development data", exists=True),
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
    raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
@@ -203,8 +203,10 @@ def train(
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
        train_examples = list(
            corpus.train_dataset(
-                nlp, shuffle=False, gold_preproc=training["gold_preproc"],
-                max_length=training["max_length"]
+                nlp,
+                shuffle=False,
+                gold_preproc=training["gold_preproc"],
+                max_length=training["max_length"],
            )
        )
        nlp.begin_training(lambda: train_examples)
@@ -303,21 +305,26 @@ def create_train_batches(nlp, corpus, cfg):
    )

    epoch = 0
+    batch_strategy = cfg.get("batch_by", "sequences")
    while True:
        if len(train_examples) == 0:
            raise ValueError(Errors.E988)
        epoch += 1
-        if cfg.get("batch_by_words", True):
+        if batch_strategy == "padded":
+            batches = util.minibatch_by_padded_size(
+                train_examples,
+                size=cfg["batch_size"],
+                buffer=256,
+                discard_oversize=cfg["discard_oversize"],
+            )
+        elif batch_strategy == "words":
            batches = util.minibatch_by_words(
                train_examples,
                size=cfg["batch_size"],
                discard_oversize=cfg["discard_oversize"],
            )
        else:
-            batches = util.minibatch(
-                train_examples,
-                size=cfg["batch_size"],
-            )
+            batches = util.minibatch(train_examples, size=cfg["batch_size"])

        # make sure the batching result is not empty, or we'll have an infinite training loop
        try:
@@ -430,7 +437,9 @@ def train_while_improving(

    if raw_text:
        random.shuffle(raw_text)
-        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+        raw_examples = [
+            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+        ]
        raw_batches = util.minibatch(raw_examples, size=8)

    for step, (epoch, batch) in enumerate(train_data):
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -69,6 +69,9 @@ class Warnings(object):
    W027 = ("Found a large training file of {size} bytes. Note that it may "
            "be more efficient to split your training data into multiple "
            "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -36,6 +36,9 @@ cdef class Example:
        self.y = reference
        self._alignment = alignment

+    def __len__(self):
+        return len(self.predicted)
+
    property predicted:
        def __get__(self):
            return self.x
@@ -326,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
    for key, value in old_token_dict.items():
        if key in ("text", "ids", "brackets"):
            pass
-        elif key in remapping:
-            token_dict[remapping[key]] = value
+        elif key.lower() in remapping:
+            token_dict[remapping[key.lower()]] = value
        else:
            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
    text = example_dict.get("text", example_dict.get("raw"))
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -513,20 +513,23 @@ class Language(object):
    ):
        """Update the models in the pipeline.

-        examples (iterable): A batch of `Example` objects.
+        examples (Iterable[Example]): A batch of examples
        dummy: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        losses (dict): Dictionary to update with the loss, keyed by component.
-        component_cfg (dict): Config parameters for specific pipeline
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
+        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
        """
        if dummy is not None:
            raise ValueError(Errors.E989)
+        if losses is None:
+            losses = {}
        if len(examples) == 0:
-            return
+            return losses
        if not isinstance(examples, Iterable):
            raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples)))
        wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
@@ -552,6 +555,7 @@ class Language(object):
            for name, proc in self.pipeline:
                if hasattr(proc, "model"):
                    proc.model.finish_update(sgd)
+        return losses

    def rehearse(self, examples, sgd=None, losses=None, config=None):
        """Make a "rehearsal" update to the models in the pipeline, to prevent
@@ -757,18 +761,17 @@ class Language(object):
    ):
        """Process texts as a stream, and yield `Doc` objects in order.

-        texts (iterator): A sequence of texts to process.
+        texts (Iterable[str]): A sequence of texts to process.
        as_tuples (bool): If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        batch_size (int): The number of texts to buffer.
-        disable (list): Names of the pipeline components to disable.
+        disable (List[str]): Names of the pipeline components to disable.
        cleanup (bool): If True, unneeded strings are freed to control memory
            use. Experimental.
-        component_cfg (dict): An optional dictionary with extra keyword
+        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
            arguments for specific components.
-        n_process (int): Number of processors to process texts, only supported
-            in Python3. If -1, set `multiprocessing.cpu_count()`.
+        n_process (int): Number of processes to use for processing texts. If -1, use `multiprocessing.cpu_count()`.
        YIELDS (Doc): Documents in the order of the original text.

        DOCS: https://spacy.io/api/language#pipe
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -87,16 +87,16 @@ def build_text_classifier(
    cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        lower = HashEmbed(
-            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
+            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
        )
        prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
        )
        suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
        )
        shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
        )

        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces):
 def MultiHashEmbed(
    columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
 ):
-    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6)
    if use_subwords:
        prefix = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7
        )
        suffix = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8
        )
        shape = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9
        )

    if pretrained_vectors:
@@ -192,7 +192,7 @@ def MultiHashEmbed(

 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
-    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5)
    chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
    with Model.define_operators({">>": chain, "|": concatenate}):
        embed_layer = chr_embed | features >> with_array(norm)
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -58,12 +58,8 @@ class Pipe(object):
        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
-        predictions = self.predict([doc])
-        if isinstance(predictions, tuple) and len(predictions) == 2:
-            scores, tensors = predictions
-            self.set_annotations([doc], scores, tensors=tensors)
-        else:
-            self.set_annotations([doc], predictions)
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
        return doc

    def pipe(self, stream, batch_size=128):
@@ -73,12 +69,8 @@ class Pipe(object):
        and `set_annotations()` methods.
        """
        for docs in util.minibatch(stream, size=batch_size):
-            predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
            yield from docs

    def predict(self, docs):
@@ -87,7 +79,7 @@ class Pipe(object):
        """
        raise NotImplementedError

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

@@ -281,9 +273,10 @@ class Tagger(Pipe):
                idx += 1
            doc.is_tagged = True

-    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
+    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)

        try:
            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
@@ -303,11 +296,11 @@ class Tagger(Pipe):
        if sgd not in (None, False):
            self.model.finish_update(sgd)

-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
        if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, self._scores2guesses(tag_scores))
+        return losses

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        """Perform a 'rehearsal' update, where we try to match the output of
@@ -635,7 +628,7 @@ class MultitaskObjective(Tagger):
    def labels(self, value):
        self.cfg["labels"] = value

-    def set_annotations(self, docs, dep_ids, tensors=None):
+    def set_annotations(self, docs, dep_ids):
        pass

    def begin_training(self, get_examples=lambda: [], pipeline=None,
@@ -732,7 +725,7 @@ class ClozeMultitask(Pipe):
        self.cfg = cfg
        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config

-    def set_annotations(self, docs, dep_ids, tensors=None):
+    def set_annotations(self, docs, dep_ids):
        pass

    def begin_training(self, get_examples=lambda: [], pipeline=None,
@@ -761,7 +754,7 @@ class ClozeMultitask(Pipe):
        loss = self.distance.get_loss(prediction, target)
        return loss, gradient

-    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
        pass

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
@@ -809,8 +802,8 @@ class TextCategorizer(Pipe):

    def pipe(self, stream, batch_size=128):
        for docs in util.minibatch(stream, size=batch_size):
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
            yield from docs

    def predict(self, docs):
@@ -820,22 +813,25 @@ class TextCategorizer(Pipe):
            # Handle cases where there are no tokens in any docs.
            xp = get_array_module(tensors)
            scores = xp.zeros((len(docs), len(self.labels)))
-            return scores, tensors
+            return scores

        scores = self.model.predict(docs)
        scores = self.model.ops.asarray(scores)
-        return scores, tensors
+        return scores

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
        for i, doc in enumerate(docs):
            for j, label in enumerate(self.labels):
                doc.cats[label] = float(scores[i, j])

-    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
        try:
            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
                # Handle cases where there are no tokens in any docs.
-                return
+                return losses
        except AttributeError:
            types = set([type(eg) for eg in examples])
            raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
@@ -847,12 +843,11 @@ class TextCategorizer(Pipe):
        bp_scores(d_scores)
        if sgd is not None:
            self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
+        losses[self.name] += loss
        if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, scores=scores)
+        return losses

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if self._rehearsal_model is None:
@@ -1076,12 +1071,13 @@ class EntityLinker(Pipe):
            sgd = self.create_optimizer()
        return sgd

-    def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
        self.require_kb()
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
        if not examples:
-            return 0
+            return losses
        sentence_docs = []
        try:
            docs = [eg.predicted for eg in examples]
@@ -1124,20 +1120,19 @@ class EntityLinker(Pipe):
            return 0.0
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
        loss, d_scores = self.get_similarity_loss(
-            scores=sentence_encodings,
+            sentence_encodings=sentence_encodings,
            examples=examples
        )
        bp_context(d_scores)
        if sgd is not None:
            self.model.finish_update(sgd)

-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
        if set_annotations:
            self.set_annotations(docs, predictions)
-        return loss
+        return losses

-    def get_similarity_loss(self, examples, scores):
+    def get_similarity_loss(self, examples, sentence_encodings):
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@@ -1149,41 +1144,23 @@ class EntityLinker(Pipe):

        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

-        if scores.shape != entity_encodings.shape:
+        if sentence_encodings.shape != entity_encodings.shape:
            raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))

-        gradients = self.distance.get_grad(scores, entity_encodings)
-        loss = self.distance.get_loss(scores, entity_encodings)
+        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
+        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
        loss = loss / len(entity_encodings)
        return loss, gradients

-    def get_loss(self, examples, scores):
-        cats = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    cats.append([1.0])
-
-        cats = self.model.ops.asarray(cats, dtype="float32")
-        if len(scores) != len(cats):
-            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-
-        d_scores = (scores - cats)
-        loss = (d_scores ** 2).sum()
-        loss = loss / len(cats)
-        return loss, d_scores
-
    def __call__(self, doc):
-        kb_ids, tensors = self.predict([doc])
-        self.set_annotations([doc], kb_ids, tensors=tensors)
+        kb_ids = self.predict([doc])
+        self.set_annotations([doc], kb_ids)
        return doc

    def pipe(self, stream, batch_size=128):
        for docs in util.minibatch(stream, size=batch_size):
-            kb_ids, tensors = self.predict(docs)
-            self.set_annotations(docs, kb_ids, tensors=tensors)
+            kb_ids = self.predict(docs)
+            self.set_annotations(docs, kb_ids)
            yield from docs

    def predict(self, docs):
@@ -1191,10 +1168,9 @@ class EntityLinker(Pipe):
        self.require_kb()
        entity_count = 0
        final_kb_ids = []
-        final_tensors = []

        if not docs:
-            return final_kb_ids, final_tensors
+            return final_kb_ids

        if isinstance(docs, Doc):
            docs = [docs]
@@ -1228,21 +1204,18 @@ class EntityLinker(Pipe):
                            if to_discard and ent.label_ in to_discard:
                                # ignoring this entity - setting to NIL
                                final_kb_ids.append(self.NIL)
-                                final_tensors.append(sentence_encoding)

                            else:
                                candidates = self.kb.get_candidates(ent.text)
                                if not candidates:
                                    # no prediction possible for this entity - setting to NIL
                                    final_kb_ids.append(self.NIL)
-                                    final_tensors.append(sentence_encoding)

                                elif len(candidates) == 1:
                                    # shortcut for efficiency reasons: take the 1 candidate

                                    # TODO: thresholding
                                    final_kb_ids.append(candidates[0].entity_)
-                                    final_tensors.append(sentence_encoding)

                                else:
                                    random.shuffle(candidates)
@@ -1271,14 +1244,13 @@ class EntityLinker(Pipe):
                                    best_index = scores.argmax().item()
                                    best_candidate = candidates[best_index]
                                    final_kb_ids.append(best_candidate.entity_)
-                                    final_tensors.append(sentence_encoding)

-        if not (len(final_tensors) == len(final_kb_ids) == entity_count):
+        if not (len(final_kb_ids) == entity_count):
            raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))

-        return final_kb_ids, final_tensors
+        return final_kb_ids

-    def set_annotations(self, docs, kb_ids, tensors=None):
+    def set_annotations(self, docs, kb_ids):
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
@@ -1394,11 +1366,7 @@ class Sentencizer(Pipe):
    def pipe(self, stream, batch_size=128):
        for docs in util.minibatch(stream, size=batch_size):
            predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            self.set_annotations(docs, predictions)
            yield from docs

    def predict(self, docs):
@@ -1429,7 +1397,7 @@ class Sentencizer(Pipe):
            guesses.append(doc_guesses)
        return guesses

-    def set_annotations(self, docs, batch_tag_ids, tensors=None):
+    def set_annotations(self, docs, batch_tag_ids):
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
--- a/spacy/pipeline/simple_ner.py
+++ b/spacy/pipeline/simple_ner.py
@@ -57,7 +57,7 @@ class SimpleNER(Pipe):
        scores = self.model.predict(docs)
        return scores

-    def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
+    def set_annotations(self, docs: List[Doc], scores: List[Floats2d]):
        """Set entities on a batch of documents from a batch of scores."""
        tag_names = self.get_tag_names()
        for i, doc in enumerate(docs):
@ -67,9 +67,12 @@ class SimpleNER(Pipe):
                tags = iob_to_biluo(tags)
            doc.ents = spans_from_biluo_tags(doc, tags)

-    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault("ner", 0.0)
        if not any(_has_ner(eg) for eg in examples):
-            return 0
+            return losses
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update(docs)
@ -79,10 +82,8 @@ class SimpleNER(Pipe):
            self.set_annotations(docs, scores)
        if sgd is not None:
            self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault("ner", 0.0)
-            losses["ner"] += loss
-        return loss
+        losses["ner"] += loss
+        return losses

    def get_loss(self, examples, scores):
        loss = 0
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
            assert tokvecs.shape[0] == len(doc)
            doc.tensor = tokvecs

-    def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+    def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
        """Update the model.
-        examples (iterable): A batch of examples
+        examples (Iterable[Example]): A batch of examples
        drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        set_annotations (bool): whether or not to update the examples with the predictions
+        RETURNS (Dict[str, float]): The updated losses dictionary
        """
        if losses is None:
            losses = {}
@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
        self.listeners[-1].receive(batch_id, tokvecs, backprop)
        if set_annotations:
            self.set_annotations(docs, tokvecs)
+        return losses

    def get_loss(self, docs, golds, scores):
        pass
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -153,7 +153,7 @@ cdef class Parser:
        doc (Doc): The document to be processed.
        """
        states = self.predict([doc])
-        self.set_annotations([doc], states, tensors=None)
+        self.set_annotations([doc], states)
        return doc

    def pipe(self, docs, int batch_size=256):
@ -170,7 +170,7 @@ cdef class Parser:
            for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
                subbatch = list(subbatch)
                parse_states = self.predict(subbatch)
-                self.set_annotations(subbatch, parse_states, tensors=None)
+                self.set_annotations(subbatch, parse_states)
            yield from batch_in_order

    def predict(self, docs):
@ -222,7 +222,7 @@ cdef class Parser:
            unfinished.clear()
        free_activations(&activations)

-    def set_annotations(self, docs, states, tensors=None):
+    def set_annotations(self, docs, states):
        cdef StateClass state
        cdef Doc doc
        for i, (state, doc) in enumerate(zip(states, docs)):
@ -263,7 +263,7 @@ cdef class Parser:
                states[i].push_hist(guess)
        free(is_valid)

-    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
        cdef StateClass state
        if losses is None:
            losses = {}
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@ -302,7 +302,7 @@ def test_multiple_predictions():
        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

-        def set_annotations(self, docs, scores, tensors=None):
+        def set_annotations(self, docs, scores):
            return docs

    nlp = Language()
--- a/spacy/tests/regression/test_issue5551.py
+++ b/spacy/tests/regression/test_issue5551.py
@ -0,0 +1,31 @@
+from spacy.lang.en import English
+from spacy.util import fix_random_seed
+
+
+def test_issue5551():
+    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
+    component = "textcat"
+    pipe_cfg = {"exclusive_classes": False}
+
+    results = []
+    for i in range(3):
+        fix_random_seed(0)
+        nlp = English()
+        example = (
+            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
+            {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
+        )
+        nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
+        pipe = nlp.get_pipe(component)
+        for label in set(example[1]["cats"]):
+            pipe.add_label(label)
+        nlp.begin_training(component_cfg={component: pipe_cfg})
+
+        # Store the result of each iteration
+        result = pipe.model.predict([nlp.make_doc(example[0])])
+        results.append(list(result[0]))
+
+    # All results should be the same because of the fixed seed
+    assert len(results) == 3
+    assert results[0] == results[1]
+    assert results[0] == results[2]
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -1,3 +1,4 @@
+import numpy
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo
@ -154,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab):
    assert tags == ["O", "O", "O", "-", "-", "-"]


+def test_example_constructor(en_vocab):
+    words = ["I", "like", "stuff"]
+    tags = ["NOUN", "VERB", "NOUN"]
+    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
+    predicted = Doc(en_vocab, words=words)
+    reference = Doc(en_vocab, words=words)
+    reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
+    example = Example(predicted, reference)
+    tags = example.get_aligned("TAG", as_string=True)
+    assert tags == ["NOUN", "VERB", "NOUN"]
+
+
+def test_example_from_dict_tags(en_vocab):
+    words = ["I", "like", "stuff"]
+    tags = ["NOUN", "VERB", "NOUN"]
+    predicted = Doc(en_vocab, words=words)
+    example = Example.from_dict(predicted, {"TAGS": tags})
+    tags = example.get_aligned("TAG", as_string=True)
+    assert tags == ["NOUN", "VERB", "NOUN"]
+
+
 def test_example_from_dict_no_ner(en_vocab):
    words = ["a", "b", "c", "d"]
    spaces = [True, True, False, True]
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@ -0,0 +1,156 @@
+from typing import List
+
+import pytest
+from thinc.api import fix_random_seed, Adam, set_dropout_rate
+from numpy.testing import assert_array_equal
+import numpy
+
+from spacy.ml.models import build_Tok2Vec_model
+from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.lang.en import English
+from spacy.lang.en.examples import sentences as EN_SENTENCES
+
+
+def get_all_params(model):
+    params = []
+    for node in model.walk():
+        for name in node.param_names:
+            params.append(node.get_param(name).ravel())
+    return node.ops.xp.concatenate(params)
+
+
+def get_docs():
+    nlp = English()
+    return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)]))
+
+
+def get_gradient(model, Y):
+    if isinstance(Y, model.ops.xp.ndarray):
+        dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
+        dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape)
+        return dY
+    elif isinstance(Y, List):
+        return [get_gradient(model, y) for y in Y]
+    else:
+        raise ValueError(f"Could not compare type {type(Y)}")
+
+
+def default_tok2vec():
+    return build_Tok2Vec_model(**TOK2VEC_KWARGS)
+
+
+TOK2VEC_KWARGS = {
+    "width": 96,
+    "embed_size": 2000,
+    "subword_features": True,
+    "char_embed": False,
+    "conv_depth": 4,
+    "bilstm_depth": 0,
+    "maxout_pieces": 4,
+    "window_size": 1,
+    "dropout": 0.1,
+    "nM": 0,
+    "nC": 0,
+    "pretrained_vectors": None,
+}
+
+TEXTCAT_KWARGS = {
+    "width": 64,
+    "embed_size": 2000,
+    "pretrained_vectors": None,
+    "exclusive_classes": False,
+    "ngram_size": 1,
+    "window_size": 1,
+    "conv_depth": 2,
+    "dropout": None,
+    "nO": 7
+}
+
+TEXTCAT_CNN_KWARGS = {
+    "tok2vec": default_tok2vec(),
+    "exclusive_classes": False,
+    "nO": 13,
+}
+
+
+@pytest.mark.parametrize(
+    "seed,model_func,kwargs",
+    [
+        (0, build_Tok2Vec_model, TOK2VEC_KWARGS),
+        (0, build_text_classifier, TEXTCAT_KWARGS),
+        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
+    ],
+)
+def test_models_initialize_consistently(seed, model_func, kwargs):
+    fix_random_seed(seed)
+    model1 = model_func(**kwargs)
+    model1.initialize()
+    fix_random_seed(seed)
+    model2 = model_func(**kwargs)
+    model2.initialize()
+    params1 = get_all_params(model1)
+    params2 = get_all_params(model2)
+    assert_array_equal(params1, params2)
+
+
+@pytest.mark.parametrize(
+    "seed,model_func,kwargs,get_X",
+    [
+        (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
+        (0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
+        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
+    ],
+)
+def test_models_predict_consistently(seed, model_func, kwargs, get_X):
+    fix_random_seed(seed)
+    model1 = model_func(**kwargs).initialize()
+    Y1 = model1.predict(get_X())
+    fix_random_seed(seed)
+    model2 = model_func(**kwargs).initialize()
+    Y2 = model2.predict(get_X())
+
+    if model1.has_ref("tok2vec"):
+        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
+        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
+        for i in range(len(tok2vec1)):
+            for j in range(len(tok2vec1[i])):
+                assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]))
+
+    if isinstance(Y1, numpy.ndarray):
+        assert_array_equal(Y1, Y2)
+    elif isinstance(Y1, List):
+        assert len(Y1) == len(Y2)
+        for y1, y2 in zip(Y1, Y2):
+            assert_array_equal(y1, y2)
+    else:
+        raise ValueError(f"Could not compare type {type(Y1)}")
+
+
+@pytest.mark.parametrize(
+    "seed,dropout,model_func,kwargs,get_X",
+    [
+        (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
+        (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
+        (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
+    ],
+)
+def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
+    def get_updated_model():
+        fix_random_seed(seed)
+        optimizer = Adam(0.001)
+        model = model_func(**kwargs).initialize()
+        initial_params = get_all_params(model)
+        set_dropout_rate(model, dropout)
+        for _ in range(5):
+            Y, get_dX = model.begin_update(get_X())
+            dY = get_gradient(model, Y)
+            _ = get_dX(dY)
+            model.finish_update(optimizer)
+        updated_params = get_all_params(model)
+        with pytest.raises(AssertionError):
+            assert_array_equal(initial_params, updated_params)
+        return model
+
+    model1 = get_updated_model()
+    model2 = get_updated_model()
+    assert_array_equal(get_all_params(model1), get_all_params(model2))
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -803,7 +803,7 @@ cdef class Doc:
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
        if array.dtype != numpy.uint64:
-            warnings.warn(Warnings.W101.format(type=array.dtype))
+            warnings.warn(Warnings.W028.format(type=array.dtype))

        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
--- a/spacy/util.py
+++ b/spacy/util.py
@ -741,6 +741,50 @@ def minibatch(items, size=8):
        yield list(batch)


+def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
+    if isinstance(size, int):
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
+    for outer_batch in minibatch(docs, buffer):
+        outer_batch = list(outer_batch)
+        target_size = next(size_)
+        for indices in _batch_by_length(outer_batch, target_size):
+            subbatch = [outer_batch[i] for i in indices]
+            padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
+            if discard_oversize and padded_size >= target_size:
+                pass
+            else:
+                yield subbatch
+ 
+
+def _batch_by_length(seqs, max_words):
+    """Given a list of sequences, return a batched list of indices into the 
+    list, where the batches are grouped by length, in descending order.
+    
+    A batch's size is its max sequence length * its number of sequences, and is
+    at most max_words unless a single sequence on its own exceeds max_words.
+    """
+    # Sort by (length, index): shortest first, equal lengths keep input order.
+    lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
+    lengths_indices.sort()
+    batches = []
+    batch = []
+    for length, i in lengths_indices:
+        if not batch:
+            batch.append(i)
+        elif length * (len(batch) + 1) <= max_words:
+            batch.append(i)
+        else:
+            batches.append(batch)
+            batch = [i]
+    if batch:
+        batches.append(batch)
+    # Check lengths match
+    assert sum(len(b) for b in batches) == len(seqs)
+    batches = [list(sorted(batch)) for batch in batches]
+    batches.reverse()
+    return batches
+
 def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
@ -787,7 +831,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

        # yield the previous batch and start a new one. The new one gets the overflow examples.
        else:
-            yield batch
+            if batch:
+                yield batch
            target_size = next(size_)
            tol_size = target_size * tolerance
            batch = overflow
@ -807,15 +852,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

            # this example does not fit with the previous overflow: start another new batch
            else:
-                yield batch
+                if batch:
+                    yield batch
                target_size = next(size_)
                tol_size = target_size * tolerance
                batch = [doc]
                batch_size = n_words

-    # yield the final batch
+    batch.extend(overflow)
    if batch:
-        batch.extend(overflow)
        yield batch